From 0a6a40c2a8c184a2fb467efacfb1cd338d719e0b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 17 Oct 2018 21:37:58 -0700 Subject: crypto: aes_ti - disable interrupts while accessing S-box In the "aes-fixed-time" AES implementation, disable interrupts while accessing the S-box, in order to make cache-timing attacks more difficult. Previously it was possible for the CPU to be interrupted while the S-box was loaded into L1 cache, potentially evicting the cachelines and causing later table lookups to be time-variant. In tests I did on x86 and ARM, this doesn't affect performance significantly. Responsiveness is potentially a concern, but interrupts are only disabled for a single AES block. Note that even after this change, the implementation still isn't necessarily guaranteed to be constant-time; see https://cr.yp.to/antiforgery/cachetiming-20050414.pdf for a discussion of the many difficulties involved in writing truly constant-time AES software. But it's valuable to make such attacks more difficult. Reviewed-by: Ard Biesheuvel Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/Kconfig | 3 ++- crypto/aes_ti.c | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index f7a235db56aa..752005201013 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1006,7 +1006,8 @@ config CRYPTO_AES_TI 8 for decryption), this implementation only uses just two S-boxes of 256 bytes each, and attempts to eliminate data dependent latencies by prefetching the entire table into the cache at the start of each - block. + block. Interrupts are also disabled to avoid races where cachelines + are evicted when the CPU is interrupted to do something else. config CRYPTO_AES_586 tristate "AES cipher algorithms (i586)" diff --git a/crypto/aes_ti.c b/crypto/aes_ti.c index 03023b2290e8..1ff9785b30f5 100644 --- a/crypto/aes_ti.c +++ b/crypto/aes_ti.c @@ -269,6 +269,7 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) const u32 *rkp = ctx->key_enc + 4; int rounds = 6 + ctx->key_length / 4; u32 st0[4], st1[4]; + unsigned long flags; int round; st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in); @@ -276,6 +277,12 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) st0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8); st0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12); + /* + * Temporarily disable interrupts to avoid races where cachelines are + * evicted when the CPU is interrupted to do something else. + */ + local_irq_save(flags); + st0[0] ^= __aesti_sbox[ 0] ^ __aesti_sbox[128]; st0[1] ^= __aesti_sbox[32] ^ __aesti_sbox[160]; st0[2] ^= __aesti_sbox[64] ^ __aesti_sbox[192]; @@ -300,6 +307,8 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) put_unaligned_le32(subshift(st1, 1) ^ rkp[5], out + 4); put_unaligned_le32(subshift(st1, 2) ^ rkp[6], out + 8); put_unaligned_le32(subshift(st1, 3) ^ rkp[7], out + 12); + + local_irq_restore(flags); } static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) @@ -308,6 +317,7 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) const u32 *rkp = ctx->key_dec + 4; int rounds = 6 + ctx->key_length / 4; u32 st0[4], st1[4]; + unsigned long flags; int round; st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in); @@ -315,6 +325,12 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) st0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8); st0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12); + /* + * Temporarily disable interrupts to avoid races where cachelines are + * evicted when the CPU is interrupted to do something else. + */ + local_irq_save(flags); + st0[0] ^= __aesti_inv_sbox[ 0] ^ __aesti_inv_sbox[128]; st0[1] ^= __aesti_inv_sbox[32] ^ __aesti_inv_sbox[160]; st0[2] ^= __aesti_inv_sbox[64] ^ __aesti_inv_sbox[192]; @@ -339,6 +355,8 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) put_unaligned_le32(inv_subshift(st1, 1) ^ rkp[5], out + 4); put_unaligned_le32(inv_subshift(st1, 2) ^ rkp[6], out + 8); put_unaligned_le32(inv_subshift(st1, 3) ^ rkp[7], out + 12); + + local_irq_restore(flags); } static struct crypto_alg aes_alg = { -- cgit v1.2.3-59-g8ed1b From 913a3aa07d16e5b302f408d497a4b829910de247 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 17 Oct 2018 21:37:59 -0700 Subject: crypto: arm/aes - add some hardening against cache-timing attacks Make the ARM scalar AES implementation closer to constant-time by disabling interrupts and prefetching the tables into L1 cache. This is feasible because due to ARM's "free" rotations, the main tables are only 1024 bytes instead of the usual 4096 used by most AES implementations. On ARM Cortex-A7, the speed loss is only about 5%. The resulting code is still over twice as fast as aes_ti.c. Responsiveness is potentially a concern, but interrupts are only disabled for a single AES block. Note that even after these changes, the implementation still isn't necessarily guaranteed to be constant-time; see https://cr.yp.to/antiforgery/cachetiming-20050414.pdf for a discussion of the many difficulties involved in writing truly constant-time AES software. But it's valuable to make such attacks more difficult. Much of this patch is based on patches suggested by Ard Biesheuvel. Suggested-by: Ard Biesheuvel Signed-off-by: Eric Biggers Reviewed-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/Kconfig | 9 ++++++ arch/arm/crypto/aes-cipher-core.S | 62 ++++++++++++++++++++++++++++++++------- crypto/aes_generic.c | 9 +++--- 3 files changed, 66 insertions(+), 14 deletions(-) diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index ef0c7feea6e2..0473a8f68389 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -69,6 +69,15 @@ config CRYPTO_AES_ARM help Use optimized AES assembler routines for ARM platforms. + On ARM processors without the Crypto Extensions, this is the + fastest AES implementation for single blocks. For multiple + blocks, the NEON bit-sliced implementation is usually faster. + + This implementation may be vulnerable to cache timing attacks, + since it uses lookup tables. However, as countermeasures it + disables IRQs and preloads the tables; it is hoped this makes + such attacks very difficult. + config CRYPTO_AES_ARM_BS tristate "Bit sliced AES using NEON instructions" depends on KERNEL_MODE_NEON diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S index 184d6c2d15d5..f2d67c095e59 100644 --- a/arch/arm/crypto/aes-cipher-core.S +++ b/arch/arm/crypto/aes-cipher-core.S @@ -10,6 +10,7 @@ */ #include +#include #include .text @@ -41,7 +42,7 @@ .endif .endm - .macro __hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op + .macro __hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op, oldcpsr __select \out0, \in0, 0 __select t0, \in1, 1 __load \out0, \out0, 0, \sz, \op @@ -73,6 +74,14 @@ __load t0, t0, 3, \sz, \op __load \t4, \t4, 3, \sz, \op + .ifnb \oldcpsr + /* + * This is the final round and we're done with all data-dependent table + * lookups, so we can safely re-enable interrupts. + */ + restore_irqs \oldcpsr + .endif + eor \out1, \out1, t1, ror #24 eor \out0, \out0, t2, ror #16 ldm rk!, {t1, t2} @@ -83,14 +92,14 @@ eor \out1, \out1, t2 .endm - .macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + .macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op - __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op + __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op, \oldcpsr .endm - .macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + .macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op - __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op + __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op, \oldcpsr .endm .macro __rev, out, in @@ -118,13 +127,14 @@ .macro do_crypt, round, ttab, ltab, bsz push {r3-r11, lr} + // Load keys first, to reduce latency in case they're not cached yet. + ldm rk!, {r8-r11} + ldr r4, [in] ldr r5, [in, #4] ldr r6, [in, #8] ldr r7, [in, #12] - ldm rk!, {r8-r11} - #ifdef CONFIG_CPU_BIG_ENDIAN __rev r4, r4 __rev r5, r5 @@ -138,6 +148,25 @@ eor r7, r7, r11 __adrl ttab, \ttab + /* + * Disable interrupts and prefetch the 1024-byte 'ft' or 'it' table into + * L1 cache, assuming cacheline size >= 32. This is a hardening measure + * intended to make cache-timing attacks more difficult. They may not + * be fully prevented, however; see the paper + * https://cr.yp.to/antiforgery/cachetiming-20050414.pdf + * ("Cache-timing attacks on AES") for a discussion of the many + * difficulties involved in writing truly constant-time AES software. + */ + save_and_disable_irqs t0 + .set i, 0 + .rept 1024 / 128 + ldr r8, [ttab, #i + 0] + ldr r9, [ttab, #i + 32] + ldr r10, [ttab, #i + 64] + ldr r11, [ttab, #i + 96] + .set i, i + 128 + .endr + push {t0} // oldcpsr tst rounds, #2 bne 1f @@ -151,8 +180,21 @@ \round r4, r5, r6, r7, r8, r9, r10, r11 b 0b -2: __adrl ttab, \ltab - \round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b +2: .ifb \ltab + add ttab, ttab, #1 + .else + __adrl ttab, \ltab + // Prefetch inverse S-box for final round; see explanation above + .set i, 0 + .rept 256 / 64 + ldr t0, [ttab, #i + 0] + ldr t1, [ttab, #i + 32] + .set i, i + 64 + .endr + .endif + + pop {rounds} // oldcpsr + \round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b, rounds #ifdef CONFIG_CPU_BIG_ENDIAN __rev r4, r4 @@ -175,7 +217,7 @@ .endm ENTRY(__aes_arm_encrypt) - do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2 + do_crypt fround, crypto_ft_tab,, 2 ENDPROC(__aes_arm_encrypt) .align 5 diff --git a/crypto/aes_generic.c b/crypto/aes_generic.c index ca554d57d01e..13df33aca463 100644 --- a/crypto/aes_generic.c +++ b/crypto/aes_generic.c @@ -63,7 +63,8 @@ static inline u8 byte(const u32 x, const unsigned n) static const u32 rco_tab[10] = { 1, 2, 4, 8, 16, 32, 64, 128, 27, 54 }; -__visible const u32 crypto_ft_tab[4][256] = { +/* cacheline-aligned to facilitate prefetching into cache */ +__visible const u32 crypto_ft_tab[4][256] __cacheline_aligned = { { 0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6, 0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591, @@ -327,7 +328,7 @@ __visible const u32 crypto_ft_tab[4][256] = { } }; -__visible const u32 crypto_fl_tab[4][256] = { +__visible const u32 crypto_fl_tab[4][256] __cacheline_aligned = { { 0x00000063, 0x0000007c, 0x00000077, 0x0000007b, 0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5, @@ -591,7 +592,7 @@ __visible const u32 crypto_fl_tab[4][256] = { } }; -__visible const u32 crypto_it_tab[4][256] = { +__visible const u32 crypto_it_tab[4][256] __cacheline_aligned = { { 0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a, 0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b, @@ -855,7 +856,7 @@ __visible const u32 crypto_it_tab[4][256] = { } }; -__visible const u32 crypto_il_tab[4][256] = { +__visible const u32 crypto_il_tab[4][256] __cacheline_aligned = { { 0x00000052, 0x00000009, 0x0000006a, 0x000000d5, 0x00000030, 0x00000036, 0x000000a5, 0x00000038, -- cgit v1.2.3-59-g8ed1b From e40fdb500b2bc92ae2e7e4ff29f6e9dedd854cc2 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Mon, 29 Oct 2018 09:50:12 +0000 Subject: crypto: ccree - add support for CryptoCell 713 Add support for Arm TrustZone CryptoCell 713. Note that this patch just enables using a 713 in backwards compatible mode to 712. Newer 713 specific features will follow. Signed-off-by: Gilad Ben-Yossef Signed-off-by: Herbert Xu --- drivers/crypto/Kconfig | 2 +- drivers/crypto/ccree/cc_driver.c | 23 +++++++++++++++-------- drivers/crypto/ccree/cc_driver.h | 5 +++-- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index caa98a7fe392..75d401c32a16 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -765,7 +765,7 @@ config CRYPTO_DEV_CCREE help Say 'Y' to enable a driver for the REE interface of the Arm TrustZone CryptoCell family of processors. Currently the - CryptoCell 712, 710 and 630 are supported. + CryptoCell 713, 712, 710 and 630 are supported. Choose this if you wish to use hardware acceleration of cryptographic operations on the system REE. If unsure say Y. diff --git a/drivers/crypto/ccree/cc_driver.c b/drivers/crypto/ccree/cc_driver.c index 1ff229c2aeab..630c598af627 100644 --- a/drivers/crypto/ccree/cc_driver.c +++ b/drivers/crypto/ccree/cc_driver.c @@ -43,6 +43,10 @@ struct cc_hw_data { /* Hardware revisions defs. */ +static const struct cc_hw_data cc713_hw = { + .name = "713", .rev = CC_HW_REV_713 +}; + static const struct cc_hw_data cc712_hw = { .name = "712", .rev = CC_HW_REV_712, .sig = 0xDCC71200U }; @@ -56,6 +60,7 @@ static const struct cc_hw_data cc630p_hw = { }; static const struct of_device_id arm_ccree_dev_of_match[] = { + { .compatible = "arm,cryptocell-713-ree", .data = &cc713_hw }, { .compatible = "arm,cryptocell-712-ree", .data = &cc712_hw }, { .compatible = "arm,cryptocell-710-ree", .data = &cc710_hw }, { .compatible = "arm,cryptocell-630p-ree", .data = &cc630p_hw }, @@ -297,15 +302,17 @@ static int init_cc_resources(struct platform_device *plat_dev) return rc; } - /* Verify correct mapping */ - signature_val = cc_ioread(new_drvdata, new_drvdata->sig_offset); - if (signature_val != hw_rev->sig) { - dev_err(dev, "Invalid CC signature: SIGNATURE=0x%08X != expected=0x%08X\n", - signature_val, hw_rev->sig); - rc = -EINVAL; - goto post_clk_err; + if (hw_rev->rev <= CC_HW_REV_712) { + /* Verify correct mapping */ + signature_val = cc_ioread(new_drvdata, new_drvdata->sig_offset); + if (signature_val != hw_rev->sig) { + dev_err(dev, "Invalid CC signature: SIGNATURE=0x%08X != expected=0x%08X\n", + signature_val, hw_rev->sig); + rc = -EINVAL; + goto post_clk_err; + } + dev_dbg(dev, "CC SIGNATURE=0x%08X\n", signature_val); } - dev_dbg(dev, "CC SIGNATURE=0x%08X\n", signature_val); /* Display HW versions */ dev_info(dev, "ARM CryptoCell %s Driver: HW version 0x%08X, Driver version %s\n", diff --git a/drivers/crypto/ccree/cc_driver.h b/drivers/crypto/ccree/cc_driver.h index d608a4faf662..a06e5c90fb68 100644 --- a/drivers/crypto/ccree/cc_driver.h +++ b/drivers/crypto/ccree/cc_driver.h @@ -36,12 +36,13 @@ extern bool cc_dump_desc; extern bool cc_dump_bytes; -#define DRV_MODULE_VERSION "4.0" +#define DRV_MODULE_VERSION "5.0" enum cc_hw_rev { CC_HW_REV_630 = 630, CC_HW_REV_710 = 710, - CC_HW_REV_712 = 712 + CC_HW_REV_712 = 712, + CC_HW_REV_713 = 713 }; #define CC_COHERENT_CACHE_PARAMS 0xEEE -- cgit v1.2.3-59-g8ed1b From d422912a8f0da30c4c6b2f482e5cfc0b85cd3119 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Mon, 29 Oct 2018 09:50:13 +0000 Subject: dt-bindings: crypto: ccree: add ccree 713 Add device tree bindings associating Arm TrustZone CryptoCell 713 with the ccree driver. Signed-off-by: Gilad Ben-Yossef Reviewed-by: Rob Herring Signed-off-by: Herbert Xu --- Documentation/devicetree/bindings/crypto/arm-cryptocell.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt b/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt index 999fb2a810f6..0ac06ec05272 100644 --- a/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt +++ b/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt @@ -1,8 +1,11 @@ Arm TrustZone CryptoCell cryptographic engine Required properties: -- compatible: Should be one of: "arm,cryptocell-712-ree", - "arm,cryptocell-710-ree" or "arm,cryptocell-630p-ree". +- compatible: Should be one of - + "arm,cryptocell-713-ree" + "arm,cryptocell-712-ree" + "arm,cryptocell-710-ree" + "arm,cryptocell-630p-ree" - reg: Base physical address of the engine and length of memory mapped region. - interrupts: Interrupt number for the device. -- cgit v1.2.3-59-g8ed1b From 9b8d51f812ce5c8bfffe440391fe85f6e5349f07 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Mon, 29 Oct 2018 09:50:14 +0000 Subject: crypto: ccree - add SM4 support Add support for SM4 cipher in CryptoCell 713. Signed-off-by: Gilad Ben-Yossef Signed-off-by: Herbert Xu --- drivers/crypto/Kconfig | 1 + drivers/crypto/ccree/cc_cipher.c | 66 +++++++++++++++++++++++++++++++++ drivers/crypto/ccree/cc_hw_queue_defs.h | 3 ++ 3 files changed, 70 insertions(+) diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 75d401c32a16..1de998d5bdb5 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -762,6 +762,7 @@ config CRYPTO_DEV_CCREE select CRYPTO_ECB select CRYPTO_CTR select CRYPTO_XTS + select CRYPTO_SM4 help Say 'Y' to enable a driver for the REE interface of the Arm TrustZone CryptoCell family of processors. Currently the diff --git a/drivers/crypto/ccree/cc_cipher.c b/drivers/crypto/ccree/cc_cipher.c index 7623b29911af..989e70f43bd9 100644 --- a/drivers/crypto/ccree/cc_cipher.c +++ b/drivers/crypto/ccree/cc_cipher.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "cc_driver.h" @@ -83,6 +84,9 @@ static int validate_keys_sizes(struct cc_cipher_ctx *ctx_p, u32 size) if (size == DES3_EDE_KEY_SIZE || size == DES_KEY_SIZE) return 0; break; + case S_DIN_to_SM4: + if (size == SM4_KEY_SIZE) + return 0; default: break; } @@ -122,6 +126,17 @@ static int validate_data_size(struct cc_cipher_ctx *ctx_p, if (IS_ALIGNED(size, DES_BLOCK_SIZE)) return 0; break; + case S_DIN_to_SM4: + switch (ctx_p->cipher_mode) { + case DRV_CIPHER_CTR: + return 0; + case DRV_CIPHER_ECB: + case DRV_CIPHER_CBC: + if (IS_ALIGNED(size, SM4_BLOCK_SIZE)) + return 0; + default: + break; + } default: break; } @@ -522,6 +537,9 @@ static void cc_setup_cipher_data(struct crypto_tfm *tfm, case S_DIN_to_DES: flow_mode = DIN_DES_DOUT; break; + case S_DIN_to_SM4: + flow_mode = DIN_SM4_DOUT; + break; default: dev_err(dev, "invalid flow mode, flow_mode = %d\n", flow_mode); return; @@ -1324,6 +1342,54 @@ static const struct cc_alg_template skcipher_algs[] = { .flow_mode = S_DIN_to_DES, .min_hw_rev = CC_HW_REV_630, }, + { + .name = "cbc(sm4)", + .driver_name = "cbc-sm4-ccree", + .blocksize = SM4_BLOCK_SIZE, + .template_skcipher = { + .setkey = cc_cipher_setkey, + .encrypt = cc_cipher_encrypt, + .decrypt = cc_cipher_decrypt, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + }, + .cipher_mode = DRV_CIPHER_CBC, + .flow_mode = S_DIN_to_SM4, + .min_hw_rev = CC_HW_REV_713, + }, + { + .name = "ecb(sm4)", + .driver_name = "ecb-sm4-ccree", + .blocksize = SM4_BLOCK_SIZE, + .template_skcipher = { + .setkey = cc_cipher_setkey, + .encrypt = cc_cipher_encrypt, + .decrypt = cc_cipher_decrypt, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = 0, + }, + .cipher_mode = DRV_CIPHER_ECB, + .flow_mode = S_DIN_to_SM4, + .min_hw_rev = CC_HW_REV_713, + }, + { + .name = "ctr(sm4)", + .driver_name = "ctr-sm4-ccree", + .blocksize = SM4_BLOCK_SIZE, + .template_skcipher = { + .setkey = cc_cipher_setkey, + .encrypt = cc_cipher_encrypt, + .decrypt = cc_cipher_decrypt, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + }, + .cipher_mode = DRV_CIPHER_CTR, + .flow_mode = S_DIN_to_SM4, + .min_hw_rev = CC_HW_REV_713, + }, }; static struct cc_crypto_alg *cc_create_alg(const struct cc_alg_template *tmpl, diff --git a/drivers/crypto/ccree/cc_hw_queue_defs.h b/drivers/crypto/ccree/cc_hw_queue_defs.h index 45985b955d2c..f719ff9c0277 100644 --- a/drivers/crypto/ccree/cc_hw_queue_defs.h +++ b/drivers/crypto/ccree/cc_hw_queue_defs.h @@ -107,6 +107,7 @@ enum cc_flow_mode { AES_to_AES_to_HASH_and_DOUT = 13, AES_to_AES_to_HASH = 14, AES_to_HASH_and_AES = 15, + DIN_SM4_DOUT = 16, DIN_AES_AESMAC = 17, HASH_to_DOUT = 18, /* setup flows */ @@ -114,9 +115,11 @@ enum cc_flow_mode { S_DIN_to_AES2 = 33, S_DIN_to_DES = 34, S_DIN_to_RC4 = 35, + S_DIN_to_SM4 = 36, S_DIN_to_HASH = 37, S_AES_to_DOUT = 38, S_AES2_to_DOUT = 39, + S_SM4_to_DOUT = 40, S_RC4_to_DOUT = 41, S_DES_to_DOUT = 42, S_HASH_to_DOUT = 43, -- cgit v1.2.3-59-g8ed1b From f1e52fd0fbd67fbf342932506cd7a6e27c090c5f Mon Sep 17 00:00:00 2001 From: Yael Chemla Date: Thu, 18 Oct 2018 13:59:57 +0100 Subject: crypto: ccree - adjust hash length to suit certain context specifics Adjust hash length such that it will not be fixed and general for all algs. Instead make it suitable for certain context information. This is preparation for SM3 support. Signed-off-by: Yael Chemla Signed-off-by: Herbert Xu --- drivers/crypto/ccree/cc_aead.c | 19 ++++++++++++++----- drivers/crypto/ccree/cc_driver.c | 10 ++++++++-- drivers/crypto/ccree/cc_driver.h | 2 +- drivers/crypto/ccree/cc_hash.c | 40 ++++++++++++++++++++++++---------------- 4 files changed, 47 insertions(+), 24 deletions(-) diff --git a/drivers/crypto/ccree/cc_aead.c b/drivers/crypto/ccree/cc_aead.c index 01b82b82f8b8..e0ac376ceb74 100644 --- a/drivers/crypto/ccree/cc_aead.c +++ b/drivers/crypto/ccree/cc_aead.c @@ -58,6 +58,7 @@ struct cc_aead_ctx { unsigned int enc_keylen; unsigned int auth_keylen; unsigned int authsize; /* Actual (reduced?) size of the MAC/ICv */ + unsigned int hash_len; enum drv_cipher_mode cipher_mode; enum cc_flow_mode flow_mode; enum drv_hash_mode auth_mode; @@ -122,6 +123,13 @@ static void cc_aead_exit(struct crypto_aead *tfm) } } +static unsigned int cc_get_aead_hash_len(struct crypto_aead *tfm) +{ + struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm); + + return cc_get_default_hash_len(ctx->drvdata); +} + static int cc_aead_init(struct crypto_aead *tfm) { struct aead_alg *alg = crypto_aead_alg(tfm); @@ -196,6 +204,7 @@ static int cc_aead_init(struct crypto_aead *tfm) ctx->auth_state.hmac.ipad_opad = NULL; ctx->auth_state.hmac.padded_authkey = NULL; } + ctx->hash_len = cc_get_aead_hash_len(tfm); return 0; @@ -327,7 +336,7 @@ static int hmac_setkey(struct cc_hw_desc *desc, struct cc_aead_ctx *ctx) /* Load the hash current length*/ hw_desc_init(&desc[idx]); set_cipher_mode(&desc[idx], hash_mode); - set_din_const(&desc[idx], 0, ctx->drvdata->hash_len_sz); + set_din_const(&desc[idx], 0, ctx->hash_len); set_flow_mode(&desc[idx], S_DIN_to_HASH); set_setup_mode(&desc[idx], SETUP_LOAD_KEY0); idx++; @@ -465,7 +474,7 @@ static int cc_get_plain_hmac_key(struct crypto_aead *tfm, const u8 *key, /* Load the hash current length*/ hw_desc_init(&desc[idx]); set_cipher_mode(&desc[idx], hashmode); - set_din_const(&desc[idx], 0, ctx->drvdata->hash_len_sz); + set_din_const(&desc[idx], 0, ctx->hash_len); set_cipher_config1(&desc[idx], HASH_PADDING_ENABLED); set_flow_mode(&desc[idx], S_DIN_to_HASH); set_setup_mode(&desc[idx], SETUP_LOAD_KEY0); @@ -1001,7 +1010,7 @@ static void cc_set_hmac_desc(struct aead_request *req, struct cc_hw_desc desc[], hw_desc_init(&desc[idx]); set_cipher_mode(&desc[idx], hash_mode); set_din_sram(&desc[idx], cc_digest_len_addr(ctx->drvdata, hash_mode), - ctx->drvdata->hash_len_sz); + ctx->hash_len); set_flow_mode(&desc[idx], S_DIN_to_HASH); set_setup_mode(&desc[idx], SETUP_LOAD_KEY0); idx++; @@ -1098,7 +1107,7 @@ static void cc_proc_scheme_desc(struct aead_request *req, hw_desc_init(&desc[idx]); set_cipher_mode(&desc[idx], hash_mode); set_dout_sram(&desc[idx], aead_handle->sram_workspace_addr, - ctx->drvdata->hash_len_sz); + ctx->hash_len); set_flow_mode(&desc[idx], S_HASH_to_DOUT); set_setup_mode(&desc[idx], SETUP_WRITE_STATE1); set_cipher_do(&desc[idx], DO_PAD); @@ -1128,7 +1137,7 @@ static void cc_proc_scheme_desc(struct aead_request *req, hw_desc_init(&desc[idx]); set_cipher_mode(&desc[idx], hash_mode); set_din_sram(&desc[idx], cc_digest_len_addr(ctx->drvdata, hash_mode), - ctx->drvdata->hash_len_sz); + ctx->hash_len); set_cipher_config1(&desc[idx], HASH_PADDING_ENABLED); set_flow_mode(&desc[idx], S_DIN_to_HASH); set_setup_mode(&desc[idx], SETUP_LOAD_KEY0); diff --git a/drivers/crypto/ccree/cc_driver.c b/drivers/crypto/ccree/cc_driver.c index 630c598af627..09a6ada0b547 100644 --- a/drivers/crypto/ccree/cc_driver.c +++ b/drivers/crypto/ccree/cc_driver.c @@ -211,12 +211,10 @@ static int init_cc_resources(struct platform_device *plat_dev) new_drvdata->hw_rev = hw_rev->rev; if (hw_rev->rev >= CC_HW_REV_712) { - new_drvdata->hash_len_sz = HASH_LEN_SIZE_712; new_drvdata->axim_mon_offset = CC_REG(AXIM_MON_COMP); new_drvdata->sig_offset = CC_REG(HOST_SIGNATURE_712); new_drvdata->ver_offset = CC_REG(HOST_VERSION_712); } else { - new_drvdata->hash_len_sz = HASH_LEN_SIZE_630; new_drvdata->axim_mon_offset = CC_REG(AXIM_MON_COMP8); new_drvdata->sig_offset = CC_REG(HOST_SIGNATURE_630); new_drvdata->ver_offset = CC_REG(HOST_VERSION_630); @@ -468,6 +466,14 @@ int cc_clk_on(struct cc_drvdata *drvdata) return 0; } +unsigned int cc_get_default_hash_len(struct cc_drvdata *drvdata) +{ + if (drvdata->hw_rev >= CC_HW_REV_712) + return HASH_LEN_SIZE_712; + else + return HASH_LEN_SIZE_630; +} + void cc_clk_off(struct cc_drvdata *drvdata) { struct clk *clk = drvdata->clk; diff --git a/drivers/crypto/ccree/cc_driver.h b/drivers/crypto/ccree/cc_driver.h index a06e5c90fb68..170790967854 100644 --- a/drivers/crypto/ccree/cc_driver.h +++ b/drivers/crypto/ccree/cc_driver.h @@ -128,7 +128,6 @@ struct cc_drvdata { bool coherent; char *hw_rev_name; enum cc_hw_rev hw_rev; - u32 hash_len_sz; u32 axim_mon_offset; u32 sig_offset; u32 ver_offset; @@ -183,6 +182,7 @@ int init_cc_regs(struct cc_drvdata *drvdata, bool is_probe); void fini_cc_regs(struct cc_drvdata *drvdata); int cc_clk_on(struct cc_drvdata *drvdata); void cc_clk_off(struct cc_drvdata *drvdata); +unsigned int cc_get_default_hash_len(struct cc_drvdata *drvdata); static inline void cc_iowrite(struct cc_drvdata *drvdata, u32 reg, u32 val) { diff --git a/drivers/crypto/ccree/cc_hash.c b/drivers/crypto/ccree/cc_hash.c index b9313306c36f..7af5b61f3f66 100644 --- a/drivers/crypto/ccree/cc_hash.c +++ b/drivers/crypto/ccree/cc_hash.c @@ -82,6 +82,7 @@ struct cc_hash_ctx { int hash_mode; int hw_mode; int inter_digestsize; + unsigned int hash_len; struct completion setkey_comp; bool is_hmac; }; @@ -138,10 +139,10 @@ static void cc_init_req(struct device *dev, struct ahash_req_ctx *state, ctx->hash_mode == DRV_HASH_SHA384) memcpy(state->digest_bytes_len, digest_len_sha512_init, - ctx->drvdata->hash_len_sz); + ctx->hash_len); else memcpy(state->digest_bytes_len, digest_len_init, - ctx->drvdata->hash_len_sz); + ctx->hash_len); } if (ctx->hash_mode != DRV_HASH_NULL) { @@ -367,7 +368,7 @@ static int cc_fin_hmac(struct cc_hw_desc *desc, struct ahash_request *req, set_cipher_mode(&desc[idx], ctx->hw_mode); set_din_sram(&desc[idx], cc_digest_len_addr(ctx->drvdata, ctx->hash_mode), - ctx->drvdata->hash_len_sz); + ctx->hash_len); set_cipher_config1(&desc[idx], HASH_PADDING_ENABLED); set_flow_mode(&desc[idx], S_DIN_to_HASH); set_setup_mode(&desc[idx], SETUP_LOAD_KEY0); @@ -459,9 +460,9 @@ static int cc_hash_digest(struct ahash_request *req) if (is_hmac) { set_din_type(&desc[idx], DMA_DLLI, state->digest_bytes_len_dma_addr, - ctx->drvdata->hash_len_sz, NS_BIT); + ctx->hash_len, NS_BIT); } else { - set_din_const(&desc[idx], 0, ctx->drvdata->hash_len_sz); + set_din_const(&desc[idx], 0, ctx->hash_len); if (nbytes) set_cipher_config1(&desc[idx], HASH_PADDING_ENABLED); else @@ -478,7 +479,7 @@ static int cc_hash_digest(struct ahash_request *req) hw_desc_init(&desc[idx]); set_cipher_mode(&desc[idx], ctx->hw_mode); set_dout_dlli(&desc[idx], state->digest_buff_dma_addr, - ctx->drvdata->hash_len_sz, NS_BIT, 0); + ctx->hash_len, NS_BIT, 0); set_flow_mode(&desc[idx], S_HASH_to_DOUT); set_setup_mode(&desc[idx], SETUP_WRITE_STATE1); set_cipher_do(&desc[idx], DO_PAD); @@ -516,7 +517,7 @@ static int cc_restore_hash(struct cc_hw_desc *desc, struct cc_hash_ctx *ctx, set_cipher_mode(&desc[idx], ctx->hw_mode); set_cipher_config1(&desc[idx], HASH_PADDING_DISABLED); set_din_type(&desc[idx], DMA_DLLI, state->digest_bytes_len_dma_addr, - ctx->drvdata->hash_len_sz, NS_BIT); + ctx->hash_len, NS_BIT); set_flow_mode(&desc[idx], S_DIN_to_HASH); set_setup_mode(&desc[idx], SETUP_LOAD_KEY0); idx++; @@ -587,7 +588,7 @@ static int cc_hash_update(struct ahash_request *req) hw_desc_init(&desc[idx]); set_cipher_mode(&desc[idx], ctx->hw_mode); set_dout_dlli(&desc[idx], state->digest_bytes_len_dma_addr, - ctx->drvdata->hash_len_sz, NS_BIT, 1); + ctx->hash_len, NS_BIT, 1); set_queue_last_ind(ctx->drvdata, &desc[idx]); set_flow_mode(&desc[idx], S_HASH_to_DOUT); set_setup_mode(&desc[idx], SETUP_WRITE_STATE1); @@ -651,7 +652,7 @@ static int cc_do_finup(struct ahash_request *req, bool update) set_cipher_do(&desc[idx], DO_PAD); set_cipher_mode(&desc[idx], ctx->hw_mode); set_dout_dlli(&desc[idx], state->digest_bytes_len_dma_addr, - ctx->drvdata->hash_len_sz, NS_BIT, 0); + ctx->hash_len, NS_BIT, 0); set_setup_mode(&desc[idx], SETUP_WRITE_STATE1); set_flow_mode(&desc[idx], S_HASH_to_DOUT); idx++; @@ -749,7 +750,7 @@ static int cc_hash_setkey(struct crypto_ahash *ahash, const u8 *key, /* Load the hash current length*/ hw_desc_init(&desc[idx]); set_cipher_mode(&desc[idx], ctx->hw_mode); - set_din_const(&desc[idx], 0, ctx->drvdata->hash_len_sz); + set_din_const(&desc[idx], 0, ctx->hash_len); set_cipher_config1(&desc[idx], HASH_PADDING_ENABLED); set_flow_mode(&desc[idx], S_DIN_to_HASH); set_setup_mode(&desc[idx], SETUP_LOAD_KEY0); @@ -831,7 +832,7 @@ static int cc_hash_setkey(struct crypto_ahash *ahash, const u8 *key, /* Load the hash current length*/ hw_desc_init(&desc[idx]); set_cipher_mode(&desc[idx], ctx->hw_mode); - set_din_const(&desc[idx], 0, ctx->drvdata->hash_len_sz); + set_din_const(&desc[idx], 0, ctx->hash_len); set_flow_mode(&desc[idx], S_DIN_to_HASH); set_setup_mode(&desc[idx], SETUP_LOAD_KEY0); idx++; @@ -1069,6 +1070,13 @@ fail: return -ENOMEM; } +static int cc_get_hash_len(struct crypto_tfm *tfm) +{ + struct cc_hash_ctx *ctx = crypto_tfm_ctx(tfm); + + return cc_get_default_hash_len(ctx->drvdata); +} + static int cc_cra_init(struct crypto_tfm *tfm) { struct cc_hash_ctx *ctx = crypto_tfm_ctx(tfm); @@ -1086,7 +1094,7 @@ static int cc_cra_init(struct crypto_tfm *tfm) ctx->hw_mode = cc_alg->hw_mode; ctx->inter_digestsize = cc_alg->inter_digestsize; ctx->drvdata = cc_alg->drvdata; - + ctx->hash_len = cc_get_hash_len(tfm); return cc_alloc_ctx(ctx); } @@ -1465,8 +1473,8 @@ static int cc_hash_export(struct ahash_request *req, void *out) memcpy(out, state->digest_buff, ctx->inter_digestsize); out += ctx->inter_digestsize; - memcpy(out, state->digest_bytes_len, ctx->drvdata->hash_len_sz); - out += ctx->drvdata->hash_len_sz; + memcpy(out, state->digest_bytes_len, ctx->hash_len); + out += ctx->hash_len; memcpy(out, &curr_buff_cnt, sizeof(u32)); out += sizeof(u32); @@ -1494,8 +1502,8 @@ static int cc_hash_import(struct ahash_request *req, const void *in) memcpy(state->digest_buff, in, ctx->inter_digestsize); in += ctx->inter_digestsize; - memcpy(state->digest_bytes_len, in, ctx->drvdata->hash_len_sz); - in += ctx->drvdata->hash_len_sz; + memcpy(state->digest_bytes_len, in, ctx->hash_len); + in += ctx->hash_len; /* Sanity check the data as much as possible */ memcpy(&tmp, in, sizeof(u32)); -- cgit v1.2.3-59-g8ed1b From 18a1dc1fd56b4a5b468ad5bf984a2b0ea5ee8c69 Mon Sep 17 00:00:00 2001 From: Yael Chemla Date: Thu, 18 Oct 2018 13:59:58 +0100 Subject: crypto: ccree - modify set_cipher_mode usage from cc_hash encapsulate set_cipher_mode call with another api, preparation for specific hash behavior as needed in later patches when SM3 introduced. Signed-off-by: Yael Chemla Signed-off-by: Herbert Xu --- drivers/crypto/ccree/cc_hash.c | 18 +++++++++--------- drivers/crypto/ccree/cc_hw_queue_defs.h | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/drivers/crypto/ccree/cc_hash.c b/drivers/crypto/ccree/cc_hash.c index 7af5b61f3f66..adcd9df2046e 100644 --- a/drivers/crypto/ccree/cc_hash.c +++ b/drivers/crypto/ccree/cc_hash.c @@ -322,7 +322,7 @@ static int cc_fin_result(struct cc_hw_desc *desc, struct ahash_request *req, /* Get final MAC result */ hw_desc_init(&desc[idx]); - set_cipher_mode(&desc[idx], ctx->hw_mode); + set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode); /* TODO */ set_dout_dlli(&desc[idx], state->digest_result_dma_addr, digestsize, NS_BIT, 1); @@ -441,7 +441,7 @@ static int cc_hash_digest(struct ahash_request *req) * digest */ hw_desc_init(&desc[idx]); - set_cipher_mode(&desc[idx], ctx->hw_mode); + set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode); if (is_hmac) { set_din_type(&desc[idx], DMA_DLLI, state->digest_buff_dma_addr, ctx->inter_digestsize, NS_BIT); @@ -455,7 +455,7 @@ static int cc_hash_digest(struct ahash_request *req) /* Load the hash current length */ hw_desc_init(&desc[idx]); - set_cipher_mode(&desc[idx], ctx->hw_mode); + set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode); if (is_hmac) { set_din_type(&desc[idx], DMA_DLLI, @@ -505,7 +505,7 @@ static int cc_restore_hash(struct cc_hw_desc *desc, struct cc_hash_ctx *ctx, { /* Restore hash digest */ hw_desc_init(&desc[idx]); - set_cipher_mode(&desc[idx], ctx->hw_mode); + set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode); set_din_type(&desc[idx], DMA_DLLI, state->digest_buff_dma_addr, ctx->inter_digestsize, NS_BIT); set_flow_mode(&desc[idx], S_DIN_to_HASH); @@ -514,7 +514,7 @@ static int cc_restore_hash(struct cc_hw_desc *desc, struct cc_hash_ctx *ctx, /* Restore hash current length */ hw_desc_init(&desc[idx]); - set_cipher_mode(&desc[idx], ctx->hw_mode); + set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode); set_cipher_config1(&desc[idx], HASH_PADDING_DISABLED); set_din_type(&desc[idx], DMA_DLLI, state->digest_bytes_len_dma_addr, ctx->hash_len, NS_BIT); @@ -577,7 +577,7 @@ static int cc_hash_update(struct ahash_request *req) /* store the hash digest result in context */ hw_desc_init(&desc[idx]); - set_cipher_mode(&desc[idx], ctx->hw_mode); + set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode); set_dout_dlli(&desc[idx], state->digest_buff_dma_addr, ctx->inter_digestsize, NS_BIT, 0); set_flow_mode(&desc[idx], S_HASH_to_DOUT); @@ -586,7 +586,7 @@ static int cc_hash_update(struct ahash_request *req) /* store current hash length in context */ hw_desc_init(&desc[idx]); - set_cipher_mode(&desc[idx], ctx->hw_mode); + set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode); set_dout_dlli(&desc[idx], state->digest_bytes_len_dma_addr, ctx->hash_len, NS_BIT, 1); set_queue_last_ind(ctx->drvdata, &desc[idx]); @@ -650,7 +650,7 @@ static int cc_do_finup(struct ahash_request *req, bool update) /* Pad the hash */ hw_desc_init(&desc[idx]); set_cipher_do(&desc[idx], DO_PAD); - set_cipher_mode(&desc[idx], ctx->hw_mode); + set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode); set_dout_dlli(&desc[idx], state->digest_bytes_len_dma_addr, ctx->hash_len, NS_BIT, 0); set_setup_mode(&desc[idx], SETUP_WRITE_STATE1); @@ -2035,7 +2035,7 @@ static void cc_setup_xcbc(struct ahash_request *areq, struct cc_hw_desc desc[], XCBC_MAC_K1_OFFSET), CC_AES_128_BIT_KEY_SIZE, NS_BIT); set_setup_mode(&desc[idx], SETUP_LOAD_KEY0); - set_cipher_mode(&desc[idx], DRV_CIPHER_XCBC_MAC); + set_hash_cipher_mode(&desc[idx], DRV_CIPHER_XCBC_MAC, ctx->hash_mode); set_cipher_config0(&desc[idx], DESC_DIRECTION_ENCRYPT_ENCRYPT); set_key_size_aes(&desc[idx], CC_AES_128_BIT_KEY_SIZE); set_flow_mode(&desc[idx], S_DIN_to_AES); diff --git a/drivers/crypto/ccree/cc_hw_queue_defs.h b/drivers/crypto/ccree/cc_hw_queue_defs.h index f719ff9c0277..6e8438a71478 100644 --- a/drivers/crypto/ccree/cc_hw_queue_defs.h +++ b/drivers/crypto/ccree/cc_hw_queue_defs.h @@ -457,6 +457,20 @@ static inline void set_cipher_mode(struct cc_hw_desc *pdesc, int mode) pdesc->word[4] |= FIELD_PREP(WORD4_CIPHER_MODE, mode); } +/* + * Set the cipher mode for hash algorithms. + * + * @pdesc: pointer HW descriptor struct + * @cipher_mode: Any one of the modes defined in [CC7x-DESC] + * @hash_mode: specifies which hash is being handled + */ +static inline void set_hash_cipher_mode(struct cc_hw_desc *pdesc, + enum drv_cipher_mode cipher_mode, + enum drv_hash_mode hash_mode) +{ + set_cipher_mode(pdesc, cipher_mode); +} + /* * Set the cipher configuration fields. * -- cgit v1.2.3-59-g8ed1b From 927574e0e85da61f84dcda15d5b6a2baa06cda46 Mon Sep 17 00:00:00 2001 From: Yael Chemla Date: Thu, 18 Oct 2018 13:59:59 +0100 Subject: crypto: ccree - add SM3 support Add support for SM3 cipher in CryptoCell 713. Signed-off-by: Yael Chemla Signed-off-by: Herbert Xu --- drivers/crypto/Kconfig | 1 + drivers/crypto/ccree/cc_crypto_ctx.h | 4 +- drivers/crypto/ccree/cc_hash.c | 119 ++++++++++++++++++++++++++------ drivers/crypto/ccree/cc_hw_queue_defs.h | 13 ++++ 4 files changed, 113 insertions(+), 24 deletions(-) diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 1de998d5bdb5..9f1baaec385f 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -763,6 +763,7 @@ config CRYPTO_DEV_CCREE select CRYPTO_CTR select CRYPTO_XTS select CRYPTO_SM4 + select CRYPTO_SM3 help Say 'Y' to enable a driver for the REE interface of the Arm TrustZone CryptoCell family of processors. Currently the diff --git a/drivers/crypto/ccree/cc_crypto_ctx.h b/drivers/crypto/ccree/cc_crypto_ctx.h index e032544f4e31..c8dac273c563 100644 --- a/drivers/crypto/ccree/cc_crypto_ctx.h +++ b/drivers/crypto/ccree/cc_crypto_ctx.h @@ -115,7 +115,8 @@ enum drv_hash_mode { DRV_HASH_CBC_MAC = 6, DRV_HASH_XCBC_MAC = 7, DRV_HASH_CMAC = 8, - DRV_HASH_MODE_NUM = 9, + DRV_HASH_SM3 = 9, + DRV_HASH_MODE_NUM = 10, DRV_HASH_RESERVE32B = S32_MAX }; @@ -127,6 +128,7 @@ enum drv_hash_hw_mode { DRV_HASH_HW_SHA512 = 4, DRV_HASH_HW_SHA384 = 12, DRV_HASH_HW_GHASH = 6, + DRV_HASH_HW_SM3 = 14, DRV_HASH_HW_RESERVE32B = S32_MAX }; diff --git a/drivers/crypto/ccree/cc_hash.c b/drivers/crypto/ccree/cc_hash.c index adcd9df2046e..c80c9ae555d0 100644 --- a/drivers/crypto/ccree/cc_hash.c +++ b/drivers/crypto/ccree/cc_hash.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "cc_driver.h" @@ -16,6 +17,7 @@ #define CC_MAX_HASH_SEQ_LEN 12 #define CC_MAX_OPAD_KEYS_SIZE CC_MAX_HASH_BLCK_SIZE +#define CC_SM3_HASH_LEN_SIZE 8 struct cc_hash_handle { cc_sram_addr_t digest_len_sram_addr; /* const value in SRAM*/ @@ -43,6 +45,9 @@ static u64 sha384_init[] = { static u64 sha512_init[] = { SHA512_H7, SHA512_H6, SHA512_H5, SHA512_H4, SHA512_H3, SHA512_H2, SHA512_H1, SHA512_H0 }; +static const u32 sm3_init[] = { + SM3_IVH, SM3_IVG, SM3_IVF, SM3_IVE, + SM3_IVD, SM3_IVC, SM3_IVB, SM3_IVA }; static void cc_setup_xcbc(struct ahash_request *areq, struct cc_hw_desc desc[], unsigned int *seq_size); @@ -1074,7 +1079,10 @@ static int cc_get_hash_len(struct crypto_tfm *tfm) { struct cc_hash_ctx *ctx = crypto_tfm_ctx(tfm); - return cc_get_default_hash_len(ctx->drvdata); + if (ctx->hash_mode == DRV_HASH_SM3) + return CC_SM3_HASH_LEN_SIZE; + else + return cc_get_default_hash_len(ctx->drvdata); } static int cc_cra_init(struct crypto_tfm *tfm) @@ -1523,6 +1531,7 @@ struct cc_hash_template { char mac_name[CRYPTO_MAX_ALG_NAME]; char mac_driver_name[CRYPTO_MAX_ALG_NAME]; unsigned int blocksize; + bool is_mac; bool synchronize; struct ahash_alg template_ahash; int hash_mode; @@ -1544,6 +1553,7 @@ static struct cc_hash_template driver_hash[] = { .mac_name = "hmac(sha1)", .mac_driver_name = "hmac-sha1-ccree", .blocksize = SHA1_BLOCK_SIZE, + .is_mac = true, .synchronize = false, .template_ahash = { .init = cc_hash_init, @@ -1570,6 +1580,7 @@ static struct cc_hash_template driver_hash[] = { .mac_name = "hmac(sha256)", .mac_driver_name = "hmac-sha256-ccree", .blocksize = SHA256_BLOCK_SIZE, + .is_mac = true, .template_ahash = { .init = cc_hash_init, .update = cc_hash_update, @@ -1595,6 +1606,7 @@ static struct cc_hash_template driver_hash[] = { .mac_name = "hmac(sha224)", .mac_driver_name = "hmac-sha224-ccree", .blocksize = SHA224_BLOCK_SIZE, + .is_mac = true, .template_ahash = { .init = cc_hash_init, .update = cc_hash_update, @@ -1620,6 +1632,7 @@ static struct cc_hash_template driver_hash[] = { .mac_name = "hmac(sha384)", .mac_driver_name = "hmac-sha384-ccree", .blocksize = SHA384_BLOCK_SIZE, + .is_mac = true, .template_ahash = { .init = cc_hash_init, .update = cc_hash_update, @@ -1645,6 +1658,7 @@ static struct cc_hash_template driver_hash[] = { .mac_name = "hmac(sha512)", .mac_driver_name = "hmac-sha512-ccree", .blocksize = SHA512_BLOCK_SIZE, + .is_mac = true, .template_ahash = { .init = cc_hash_init, .update = cc_hash_update, @@ -1670,6 +1684,7 @@ static struct cc_hash_template driver_hash[] = { .mac_name = "hmac(md5)", .mac_driver_name = "hmac-md5-ccree", .blocksize = MD5_HMAC_BLOCK_SIZE, + .is_mac = true, .template_ahash = { .init = cc_hash_init, .update = cc_hash_update, @@ -1689,10 +1704,35 @@ static struct cc_hash_template driver_hash[] = { .inter_digestsize = MD5_DIGEST_SIZE, .min_hw_rev = CC_HW_REV_630, }, + { + .name = "sm3", + .driver_name = "sm3-ccree", + .blocksize = SM3_BLOCK_SIZE, + .is_mac = false, + .template_ahash = { + .init = cc_hash_init, + .update = cc_hash_update, + .final = cc_hash_final, + .finup = cc_hash_finup, + .digest = cc_hash_digest, + .export = cc_hash_export, + .import = cc_hash_import, + .setkey = cc_hash_setkey, + .halg = { + .digestsize = SM3_DIGEST_SIZE, + .statesize = CC_STATE_SIZE(SM3_DIGEST_SIZE), + }, + }, + .hash_mode = DRV_HASH_SM3, + .hw_mode = DRV_HASH_HW_SM3, + .inter_digestsize = SM3_DIGEST_SIZE, + .min_hw_rev = CC_HW_REV_713, + }, { .mac_name = "xcbc(aes)", .mac_driver_name = "xcbc-aes-ccree", .blocksize = AES_BLOCK_SIZE, + .is_mac = true, .template_ahash = { .init = cc_hash_init, .update = cc_mac_update, @@ -1716,6 +1756,7 @@ static struct cc_hash_template driver_hash[] = { .mac_name = "cmac(aes)", .mac_driver_name = "cmac-aes-ccree", .blocksize = AES_BLOCK_SIZE, + .is_mac = true, .template_ahash = { .init = cc_hash_init, .update = cc_mac_update, @@ -1788,6 +1829,7 @@ int cc_init_hash_sram(struct cc_drvdata *drvdata) unsigned int larval_seq_len = 0; struct cc_hw_desc larval_seq[CC_DIGEST_SIZE_MAX / sizeof(u32)]; bool large_sha_supported = (drvdata->hw_rev >= CC_HW_REV_712); + bool sm3_supported = (drvdata->hw_rev >= CC_HW_REV_713); int rc = 0; /* Copy-to-sram digest-len */ @@ -1853,6 +1895,17 @@ int cc_init_hash_sram(struct cc_drvdata *drvdata) sram_buff_ofs += sizeof(sha256_init); larval_seq_len = 0; + if (sm3_supported) { + cc_set_sram_desc(sm3_init, sram_buff_ofs, + ARRAY_SIZE(sm3_init), larval_seq, + &larval_seq_len); + rc = send_request_init(drvdata, larval_seq, larval_seq_len); + if (rc) + goto init_digest_const_err; + sram_buff_ofs += sizeof(sm3_init); + larval_seq_len = 0; + } + if (large_sha_supported) { cc_set_sram_desc((u32 *)sha384_init, sram_buff_ofs, (ARRAY_SIZE(sha384_init) * 2), larval_seq, @@ -1919,6 +1972,9 @@ int cc_hash_alloc(struct cc_drvdata *drvdata) sizeof(sha224_init) + sizeof(sha256_init); + if (drvdata->hw_rev >= CC_HW_REV_713) + sram_size_to_alloc += sizeof(sm3_init); + if (drvdata->hw_rev >= CC_HW_REV_712) sram_size_to_alloc += sizeof(digest_len_sha512_init) + sizeof(sha384_init) + sizeof(sha512_init); @@ -1948,27 +2004,28 @@ int cc_hash_alloc(struct cc_drvdata *drvdata) /* We either support both HASH and MAC or none */ if (driver_hash[alg].min_hw_rev > drvdata->hw_rev) continue; - - /* register hmac version */ - t_alg = cc_alloc_hash_alg(&driver_hash[alg], dev, true); - if (IS_ERR(t_alg)) { - rc = PTR_ERR(t_alg); - dev_err(dev, "%s alg allocation failed\n", - driver_hash[alg].driver_name); - goto fail; - } - t_alg->drvdata = drvdata; - - rc = crypto_register_ahash(&t_alg->ahash_alg); - if (rc) { - dev_err(dev, "%s alg registration failed\n", - driver_hash[alg].driver_name); - kfree(t_alg); - goto fail; - } else { - list_add_tail(&t_alg->entry, &hash_handle->hash_list); + if (driver_hash[alg].is_mac) { + /* register hmac version */ + t_alg = cc_alloc_hash_alg(&driver_hash[alg], dev, true); + if (IS_ERR(t_alg)) { + rc = PTR_ERR(t_alg); + dev_err(dev, "%s alg allocation failed\n", + driver_hash[alg].driver_name); + goto fail; + } + t_alg->drvdata = drvdata; + + rc = crypto_register_ahash(&t_alg->ahash_alg); + if (rc) { + dev_err(dev, "%s alg registration failed\n", + driver_hash[alg].driver_name); + kfree(t_alg); + goto fail; + } else { + list_add_tail(&t_alg->entry, + &hash_handle->hash_list); + } } - if (hw_mode == DRV_CIPHER_XCBC_MAC || hw_mode == DRV_CIPHER_CMAC) continue; @@ -2170,6 +2227,8 @@ static const void *cc_larval_digest(struct device *dev, u32 mode) return sha384_init; case DRV_HASH_SHA512: return sha512_init; + case DRV_HASH_SM3: + return sm3_init; default: dev_err(dev, "Invalid hash mode (%d)\n", mode); return md5_init; @@ -2190,6 +2249,8 @@ cc_sram_addr_t cc_larval_digest_addr(void *drvdata, u32 mode) struct cc_drvdata *_drvdata = (struct cc_drvdata *)drvdata; struct cc_hash_handle *hash_handle = _drvdata->hash_handle; struct device *dev = drvdata_to_dev(_drvdata); + bool sm3_supported = (_drvdata->hw_rev >= CC_HW_REV_713); + cc_sram_addr_t addr; switch (mode) { case DRV_HASH_NULL: @@ -2208,19 +2269,31 @@ cc_sram_addr_t cc_larval_digest_addr(void *drvdata, u32 mode) sizeof(md5_init) + sizeof(sha1_init) + sizeof(sha224_init)); - case DRV_HASH_SHA384: + case DRV_HASH_SM3: return (hash_handle->larval_digest_sram_addr + sizeof(md5_init) + sizeof(sha1_init) + sizeof(sha224_init) + sizeof(sha256_init)); + case DRV_HASH_SHA384: + addr = (hash_handle->larval_digest_sram_addr + + sizeof(md5_init) + + sizeof(sha1_init) + + sizeof(sha224_init) + + sizeof(sha256_init)); + if (sm3_supported) + addr += sizeof(sm3_init); + return addr; case DRV_HASH_SHA512: - return (hash_handle->larval_digest_sram_addr + + addr = (hash_handle->larval_digest_sram_addr + sizeof(md5_init) + sizeof(sha1_init) + sizeof(sha224_init) + sizeof(sha256_init) + sizeof(sha384_init)); + if (sm3_supported) + addr += sizeof(sm3_init); + return addr; default: dev_err(dev, "Invalid hash mode (%d)\n", mode); } diff --git a/drivers/crypto/ccree/cc_hw_queue_defs.h b/drivers/crypto/ccree/cc_hw_queue_defs.h index 6e8438a71478..7a9b90db7db7 100644 --- a/drivers/crypto/ccree/cc_hw_queue_defs.h +++ b/drivers/crypto/ccree/cc_hw_queue_defs.h @@ -42,6 +42,7 @@ #define WORD3_QUEUE_LAST_IND CC_GENMASK(3, QUEUE_LAST_IND) #define WORD4_ACK_NEEDED CC_GENMASK(4, ACK_NEEDED) #define WORD4_AES_SEL_N_HASH CC_GENMASK(4, AES_SEL_N_HASH) +#define WORD4_AES_XOR_CRYPTO_KEY CC_GENMASK(4, AES_XOR_CRYPTO_KEY) #define WORD4_BYTES_SWAP CC_GENMASK(4, BYTES_SWAP) #define WORD4_CIPHER_CONF0 CC_GENMASK(4, CIPHER_CONF0) #define WORD4_CIPHER_CONF1 CC_GENMASK(4, CIPHER_CONF1) @@ -396,6 +397,16 @@ static inline void set_aes_not_hash_mode(struct cc_hw_desc *pdesc) pdesc->word[4] |= FIELD_PREP(WORD4_AES_SEL_N_HASH, 1); } +/* + * Set aes xor crypto key, this in some secenrios select SM3 engine + * + * @pdesc: pointer HW descriptor struct + */ +static inline void set_aes_xor_crypto_key(struct cc_hw_desc *pdesc) +{ + pdesc->word[4] |= FIELD_PREP(WORD4_AES_XOR_CRYPTO_KEY, 1); +} + /* * Set the DOUT field of a HW descriptors to SRAM mode * Note: No need to check SRAM alignment since host requests do not use SRAM and @@ -469,6 +480,8 @@ static inline void set_hash_cipher_mode(struct cc_hw_desc *pdesc, enum drv_hash_mode hash_mode) { set_cipher_mode(pdesc, cipher_mode); + if (hash_mode == DRV_HASH_SM3) + set_aes_xor_crypto_key(pdesc); } /* -- cgit v1.2.3-59-g8ed1b From 7172122be6a4712d699da4d261f92aa5ab3a78b8 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Thu, 18 Oct 2018 19:50:43 -0500 Subject: crypto: cavium/nitrox - fix a DMA pool free failure In crypto_alloc_context(), a DMA pool is allocated through dma_pool_alloc() to hold the crypto context. The meta data of the DMA pool, including the pool used for the allocation 'ndev->ctx_pool' and the base address of the DMA pool used by the device 'dma', are then stored to the beginning of the pool. These meta data are eventually used in crypto_free_context() to free the DMA pool through dma_pool_free(). However, given that the DMA pool can also be accessed by the device, a malicious device can modify these meta data, especially when the device is controlled to deploy an attack. This can cause an unexpected DMA pool free failure. To avoid the above issue, this patch introduces a new structure crypto_ctx_hdr and a new field chdr in the structure nitrox_crypto_ctx hold the meta data information of the DMA pool after the allocation. Note that the original structure ctx_hdr is not changed to ensure the compatibility. Cc: Signed-off-by: Wenwen Wang Signed-off-by: Herbert Xu --- drivers/crypto/cavium/nitrox/nitrox_algs.c | 12 +++++++----- drivers/crypto/cavium/nitrox/nitrox_lib.c | 22 +++++++++++++++++----- drivers/crypto/cavium/nitrox/nitrox_req.h | 7 +++++++ 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/drivers/crypto/cavium/nitrox/nitrox_algs.c b/drivers/crypto/cavium/nitrox/nitrox_algs.c index 2ae6124e5da6..5d54ebc20cb3 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_algs.c +++ b/drivers/crypto/cavium/nitrox/nitrox_algs.c @@ -73,7 +73,7 @@ static int flexi_aes_keylen(int keylen) static int nitrox_skcipher_init(struct crypto_skcipher *tfm) { struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(tfm); - void *fctx; + struct crypto_ctx_hdr *chdr; /* get the first device */ nctx->ndev = nitrox_get_first_device(); @@ -81,12 +81,14 @@ static int nitrox_skcipher_init(struct crypto_skcipher *tfm) return -ENODEV; /* allocate nitrox crypto context */ - fctx = crypto_alloc_context(nctx->ndev); - if (!fctx) { + chdr = crypto_alloc_context(nctx->ndev); + if (!chdr) { nitrox_put_device(nctx->ndev); return -ENOMEM; } - nctx->u.ctx_handle = (uintptr_t)fctx; + nctx->chdr = chdr; + nctx->u.ctx_handle = (uintptr_t)((u8 *)chdr->vaddr + + sizeof(struct ctx_hdr)); crypto_skcipher_set_reqsize(tfm, crypto_skcipher_reqsize(tfm) + sizeof(struct nitrox_kcrypt_request)); return 0; @@ -102,7 +104,7 @@ static void nitrox_skcipher_exit(struct crypto_skcipher *tfm) memset(&fctx->crypto, 0, sizeof(struct crypto_keys)); memset(&fctx->auth, 0, sizeof(struct auth_keys)); - crypto_free_context((void *)fctx); + crypto_free_context((void *)nctx->chdr); } nitrox_put_device(nctx->ndev); diff --git a/drivers/crypto/cavium/nitrox/nitrox_lib.c b/drivers/crypto/cavium/nitrox/nitrox_lib.c index 2260efa42308..9138bae12521 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_lib.c +++ b/drivers/crypto/cavium/nitrox/nitrox_lib.c @@ -158,12 +158,19 @@ static void destroy_crypto_dma_pool(struct nitrox_device *ndev) void *crypto_alloc_context(struct nitrox_device *ndev) { struct ctx_hdr *ctx; + struct crypto_ctx_hdr *chdr; void *vaddr; dma_addr_t dma; + chdr = kmalloc(sizeof(*chdr), GFP_KERNEL); + if (!chdr) + return NULL; + vaddr = dma_pool_zalloc(ndev->ctx_pool, GFP_KERNEL, &dma); - if (!vaddr) + if (!vaddr) { + kfree(chdr); return NULL; + } /* fill meta data */ ctx = vaddr; @@ -171,7 +178,11 @@ void *crypto_alloc_context(struct nitrox_device *ndev) ctx->dma = dma; ctx->ctx_dma = dma + sizeof(struct ctx_hdr); - return ((u8 *)vaddr + sizeof(struct ctx_hdr)); + chdr->pool = ndev->ctx_pool; + chdr->dma = dma; + chdr->vaddr = vaddr; + + return chdr; } /** @@ -180,13 +191,14 @@ void *crypto_alloc_context(struct nitrox_device *ndev) */ void crypto_free_context(void *ctx) { - struct ctx_hdr *ctxp; + struct crypto_ctx_hdr *ctxp; if (!ctx) return; - ctxp = (struct ctx_hdr *)((u8 *)ctx - sizeof(struct ctx_hdr)); - dma_pool_free(ctxp->pool, ctxp, ctxp->dma); + ctxp = ctx; + dma_pool_free(ctxp->pool, ctxp->vaddr, ctxp->dma); + kfree(ctxp); } /** diff --git a/drivers/crypto/cavium/nitrox/nitrox_req.h b/drivers/crypto/cavium/nitrox/nitrox_req.h index d091b6f5f5dd..19f0a20e3bb3 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_req.h +++ b/drivers/crypto/cavium/nitrox/nitrox_req.h @@ -181,12 +181,19 @@ struct flexi_crypto_context { struct auth_keys auth; }; +struct crypto_ctx_hdr { + struct dma_pool *pool; + dma_addr_t dma; + void *vaddr; +}; + struct nitrox_crypto_ctx { struct nitrox_device *ndev; union { u64 ctx_handle; struct flexi_crypto_context *fctx; } u; + struct crypto_ctx_hdr *chdr; }; struct nitrox_kcrypt_request { -- cgit v1.2.3-59-g8ed1b From fa4600734b74f74d9169c3015946d4722f8bcf79 Mon Sep 17 00:00:00 2001 From: Dmitry Eremin-Solenikov Date: Sat, 20 Oct 2018 02:01:52 +0300 Subject: crypto: cfb - fix decryption crypto_cfb_decrypt_segment() incorrectly XOR'ed generated keystream with IV, rather than with data stream, resulting in incorrect decryption. Test vectors will be added in the next patch. Signed-off-by: Dmitry Eremin-Solenikov Cc: stable@vger.kernel.org Signed-off-by: Herbert Xu --- crypto/cfb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/cfb.c b/crypto/cfb.c index a0d68c09e1b9..fd4e8500e121 100644 --- a/crypto/cfb.c +++ b/crypto/cfb.c @@ -144,7 +144,7 @@ static int crypto_cfb_decrypt_segment(struct skcipher_walk *walk, do { crypto_cfb_encrypt_one(tfm, iv, dst); - crypto_xor(dst, iv, bsize); + crypto_xor(dst, src, bsize); iv = src; src += bsize; -- cgit v1.2.3-59-g8ed1b From 7da66670775d201f633577f5b15a4bbeebaaa2b0 Mon Sep 17 00:00:00 2001 From: Dmitry Eremin-Solenikov Date: Sat, 20 Oct 2018 02:01:53 +0300 Subject: crypto: testmgr - add AES-CFB tests Add AES128/192/256-CFB testvectors from NIST SP800-38A. Signed-off-by: Dmitry Eremin-Solenikov Cc: stable@vger.kernel.org Signed-off-by: Dmitry Eremin-Solenikov Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 5 ++++ crypto/testmgr.c | 7 ++++++ crypto/testmgr.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index c20c9f5c18f2..1026173d721a 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -1736,6 +1736,7 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) ret += tcrypt_test("ctr(aes)"); ret += tcrypt_test("rfc3686(ctr(aes))"); ret += tcrypt_test("ofb(aes)"); + ret += tcrypt_test("cfb(aes)"); break; case 11: @@ -2060,6 +2061,10 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) speed_template_16_24_32); test_cipher_speed("ctr(aes)", DECRYPT, sec, NULL, 0, speed_template_16_24_32); + test_cipher_speed("cfb(aes)", ENCRYPT, sec, NULL, 0, + speed_template_16_24_32); + test_cipher_speed("cfb(aes)", DECRYPT, sec, NULL, 0, + speed_template_16_24_32); break; case 201: diff --git a/crypto/testmgr.c b/crypto/testmgr.c index b1f79c6bf409..84937ceb4bd8 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -2690,6 +2690,13 @@ static const struct alg_test_desc alg_test_descs[] = { .dec = __VECS(aes_ccm_dec_tv_template) } } + }, { + .alg = "cfb(aes)", + .test = alg_test_skcipher, + .fips_allowed = 1, + .suite = { + .cipher = __VECS(aes_cfb_tv_template) + }, }, { .alg = "chacha20", .test = alg_test_skcipher, diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 1fe7b97ba03f..b5b0d29761ce 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -11449,6 +11449,82 @@ static const struct cipher_testvec aes_cbc_tv_template[] = { }, }; +static const struct cipher_testvec aes_cfb_tv_template[] = { + { /* From NIST SP800-38A */ + .key = "\x2b\x7e\x15\x16\x28\xae\xd2\xa6" + "\xab\xf7\x15\x88\x09\xcf\x4f\x3c", + .klen = 16, + .iv = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + .ptext = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96" + "\xe9\x3d\x7e\x11\x73\x93\x17\x2a" + "\xae\x2d\x8a\x57\x1e\x03\xac\x9c" + "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51" + "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11" + "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef" + "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17" + "\xad\x2b\x41\x7b\xe6\x6c\x37\x10", + .ctext = "\x3b\x3f\xd9\x2e\xb7\x2d\xad\x20" + "\x33\x34\x49\xf8\xe8\x3c\xfb\x4a" + "\xc8\xa6\x45\x37\xa0\xb3\xa9\x3f" + "\xcd\xe3\xcd\xad\x9f\x1c\xe5\x8b" + "\x26\x75\x1f\x67\xa3\xcb\xb1\x40" + "\xb1\x80\x8c\xf1\x87\xa4\xf4\xdf" + "\xc0\x4b\x05\x35\x7c\x5d\x1c\x0e" + "\xea\xc4\xc6\x6f\x9f\xf7\xf2\xe6", + .len = 64, + }, { + .key = "\x8e\x73\xb0\xf7\xda\x0e\x64\x52" + "\xc8\x10\xf3\x2b\x80\x90\x79\xe5" + "\x62\xf8\xea\xd2\x52\x2c\x6b\x7b", + .klen = 24, + .iv = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + .ptext = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96" + "\xe9\x3d\x7e\x11\x73\x93\x17\x2a" + "\xae\x2d\x8a\x57\x1e\x03\xac\x9c" + "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51" + "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11" + "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef" + "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17" + "\xad\x2b\x41\x7b\xe6\x6c\x37\x10", + .ctext = "\xcd\xc8\x0d\x6f\xdd\xf1\x8c\xab" + "\x34\xc2\x59\x09\xc9\x9a\x41\x74" + "\x67\xce\x7f\x7f\x81\x17\x36\x21" + "\x96\x1a\x2b\x70\x17\x1d\x3d\x7a" + "\x2e\x1e\x8a\x1d\xd5\x9b\x88\xb1" + "\xc8\xe6\x0f\xed\x1e\xfa\xc4\xc9" + "\xc0\x5f\x9f\x9c\xa9\x83\x4f\xa0" + "\x42\xae\x8f\xba\x58\x4b\x09\xff", + .len = 64, + }, { + .key = "\x60\x3d\xeb\x10\x15\xca\x71\xbe" + "\x2b\x73\xae\xf0\x85\x7d\x77\x81" + "\x1f\x35\x2c\x07\x3b\x61\x08\xd7" + "\x2d\x98\x10\xa3\x09\x14\xdf\xf4", + .klen = 32, + .iv = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + .ptext = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96" + "\xe9\x3d\x7e\x11\x73\x93\x17\x2a" + "\xae\x2d\x8a\x57\x1e\x03\xac\x9c" + "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51" + "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11" + "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef" + "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17" + "\xad\x2b\x41\x7b\xe6\x6c\x37\x10", + .ctext = "\xdc\x7e\x84\xbf\xda\x79\x16\x4b" + "\x7e\xcd\x84\x86\x98\x5d\x38\x60" + "\x39\xff\xed\x14\x3b\x28\xb1\xc8" + "\x32\x11\x3c\x63\x31\xe5\x40\x7b" + "\xdf\x10\x13\x24\x15\xe5\x4b\x92" + "\xa1\x3e\xd0\xa8\x26\x7a\xe2\xf9" + "\x75\xa3\x85\x74\x1a\xb9\xce\xf8" + "\x20\x31\x62\x3d\x55\xb1\xe4\x71", + .len = 64, + }, +}; + static const struct aead_testvec hmac_md5_ecb_cipher_null_enc_tv_template[] = { { /* Input data from RFC 2410 Case 1 */ #ifdef __LITTLE_ENDIAN -- cgit v1.2.3-59-g8ed1b From b1e3874c75ab15288f573b3532e507c37e8e7656 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 27 Oct 2018 15:49:26 +0100 Subject: pcrypt: use format specifier in kobject_add Passing string 'name' as the format specifier is potentially hazardous because name could (although very unlikely to) have a format specifier embedded in it causing issues when parsing the non-existent arguments to these. Follow best practice by using the "%s" format string for the string 'name'. Cleans up clang warning: crypto/pcrypt.c:397:40: warning: format string is not a string literal (potentially insecure) [-Wformat-security] Fixes: a3fb1e330dd2 ("pcrypt: Added sysfs interface to pcrypt") Signed-off-by: Colin Ian King Signed-off-by: Herbert Xu --- crypto/pcrypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c index f8ec3d4ba4a8..a5718c0a3dc4 100644 --- a/crypto/pcrypt.c +++ b/crypto/pcrypt.c @@ -394,7 +394,7 @@ static int pcrypt_sysfs_add(struct padata_instance *pinst, const char *name) int ret; pinst->kobj.kset = pcrypt_kset; - ret = kobject_add(&pinst->kobj, NULL, name); + ret = kobject_add(&pinst->kobj, NULL, "%s", name); if (!ret) kobject_uevent(&pinst->kobj, KOBJ_ADD); -- cgit v1.2.3-59-g8ed1b From fc6176a240ae93850be445f355c1dba769fe8467 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 30 Oct 2018 12:01:58 +0000 Subject: crypto: chelsio - clean up various indentation issues Trivial fix to clean up varous indentation issue Signed-off-by: Colin Ian King Signed-off-by: Herbert Xu --- drivers/crypto/chelsio/chcr_algo.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index db203f8be429..497c57803de7 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -1311,8 +1311,8 @@ static int chcr_aes_decrypt(struct ablkcipher_request *req) return -ENOSPC; } - err = process_cipher(req, u_ctx->lldi.rxq_ids[c_ctx(tfm)->rx_qidx], - &skb, CHCR_DECRYPT_OP); + err = process_cipher(req, u_ctx->lldi.rxq_ids[c_ctx(tfm)->rx_qidx], + &skb, CHCR_DECRYPT_OP); if (err || !skb) return err; skb->dev = u_ctx->lldi.ports[0]; @@ -2008,7 +2008,7 @@ static int chcr_ahash_export(struct ahash_request *areq, void *out) memcpy(state->partial_hash, req_ctx->partial_hash, CHCR_HASH_MAX_DIGEST_SIZE); chcr_init_hctx_per_wr(state); - return 0; + return 0; } static int chcr_ahash_import(struct ahash_request *areq, const void *in) @@ -2249,7 +2249,7 @@ static int chcr_aead_fallback(struct aead_request *req, unsigned short op_type) req->base.complete, req->base.data); aead_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, req->iv); - aead_request_set_ad(subreq, req->assoclen); + aead_request_set_ad(subreq, req->assoclen); return op_type ? crypto_aead_decrypt(subreq) : crypto_aead_encrypt(subreq); } @@ -3118,12 +3118,12 @@ static int chcr_gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) aeadctx->mayverify = VERIFY_HW; break; case ICV_12: - aeadctx->hmac_ctrl = CHCR_SCMD_HMAC_CTRL_IPSEC_96BIT; - aeadctx->mayverify = VERIFY_HW; + aeadctx->hmac_ctrl = CHCR_SCMD_HMAC_CTRL_IPSEC_96BIT; + aeadctx->mayverify = VERIFY_HW; break; case ICV_14: - aeadctx->hmac_ctrl = CHCR_SCMD_HMAC_CTRL_PL3; - aeadctx->mayverify = VERIFY_HW; + aeadctx->hmac_ctrl = CHCR_SCMD_HMAC_CTRL_PL3; + aeadctx->mayverify = VERIFY_HW; break; case ICV_16: aeadctx->hmac_ctrl = CHCR_SCMD_HMAC_CTRL_NO_TRUNC; -- cgit v1.2.3-59-g8ed1b From ed848b652cc6a7a3e7f5e539edc3c2009366093e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 3 Nov 2018 14:56:02 -0700 Subject: crypto: user - remove redundant reporting functions The acomp, akcipher, and kpp algorithm types already have .report methods defined, so there's no need to duplicate this functionality in crypto_user itself; the duplicate functions are actually never executed. Remove the unused code. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/crypto_user_base.c | 59 ----------------------------------------------- 1 file changed, 59 deletions(-) diff --git a/crypto/crypto_user_base.c b/crypto/crypto_user_base.c index 784748dbb19f..1ecd9b08e38f 100644 --- a/crypto/crypto_user_base.c +++ b/crypto/crypto_user_base.c @@ -113,51 +113,6 @@ nla_put_failure: return -EMSGSIZE; } -static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg) -{ - struct crypto_report_acomp racomp; - - strncpy(racomp.type, "acomp", sizeof(racomp.type)); - - if (nla_put(skb, CRYPTOCFGA_REPORT_ACOMP, - sizeof(struct crypto_report_acomp), &racomp)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg) -{ - struct crypto_report_akcipher rakcipher; - - strncpy(rakcipher.type, "akcipher", sizeof(rakcipher.type)); - - if (nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER, - sizeof(struct crypto_report_akcipher), &rakcipher)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg) -{ - struct crypto_report_kpp rkpp; - - strncpy(rkpp.type, "kpp", sizeof(rkpp.type)); - - if (nla_put(skb, CRYPTOCFGA_REPORT_KPP, - sizeof(struct crypto_report_kpp), &rkpp)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - static int crypto_report_one(struct crypto_alg *alg, struct crypto_user_alg *ualg, struct sk_buff *skb) { @@ -202,20 +157,6 @@ static int crypto_report_one(struct crypto_alg *alg, goto nla_put_failure; break; - case CRYPTO_ALG_TYPE_ACOMPRESS: - if (crypto_report_acomp(skb, alg)) - goto nla_put_failure; - - break; - case CRYPTO_ALG_TYPE_AKCIPHER: - if (crypto_report_akcipher(skb, alg)) - goto nla_put_failure; - - break; - case CRYPTO_ALG_TYPE_KPP: - if (crypto_report_kpp(skb, alg)) - goto nla_put_failure; - break; } out: -- cgit v1.2.3-59-g8ed1b From 37db69e0b4923bff331820ee6969681937d8b065 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 3 Nov 2018 14:56:03 -0700 Subject: crypto: user - clean up report structure copying There have been a pretty ridiculous number of issues with initializing the report structures that are copied to userspace by NETLINK_CRYPTO. Commit 4473710df1f8 ("crypto: user - Prepare for CRYPTO_MAX_ALG_NAME expansion") replaced some strncpy()s with strlcpy()s, thereby introducing information leaks. Later two other people tried to replace other strncpy()s with strlcpy() too, which would have introduced even more information leaks: - https://lore.kernel.org/patchwork/patch/954991/ - https://patchwork.kernel.org/patch/10434351/ Commit cac5818c25d0 ("crypto: user - Implement a generic crypto statistics") also uses the buggy strlcpy() approach and therefore leaks uninitialized memory to userspace. A fix was proposed, but it was originally incomplete. Seeing as how apparently no one can get this right with the current approach, change all the reporting functions to: - Start by memsetting the report structure to 0. This guarantees it's always initialized, regardless of what happens later. - Initialize all strings using strscpy(). This is safe after the memset, ensures null termination of long strings, avoids unnecessary work, and avoids the -Wstringop-truncation warnings from gcc. - Use sizeof(var) instead of sizeof(type). This is more robust against copy+paste errors. For simplicity, also reuse the -EMSGSIZE return value from nla_put(). Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/ablkcipher.c | 32 ++++++--------- crypto/acompress.c | 10 ++--- crypto/aead.c | 14 +++---- crypto/ahash.c | 12 ++---- crypto/akcipher.c | 11 ++--- crypto/blkcipher.c | 16 +++----- crypto/crypto_user_base.c | 38 ++++++++--------- crypto/crypto_user_stat.c | 102 +++++++++++----------------------------------- crypto/kpp.c | 10 ++--- crypto/rng.c | 12 ++---- crypto/scompress.c | 11 ++--- crypto/shash.c | 12 ++---- crypto/skcipher.c | 15 +++---- 13 files changed, 96 insertions(+), 199 deletions(-) diff --git a/crypto/ablkcipher.c b/crypto/ablkcipher.c index 8882e90e868e..b5e9ce19d324 100644 --- a/crypto/ablkcipher.c +++ b/crypto/ablkcipher.c @@ -365,23 +365,19 @@ static int crypto_ablkcipher_report(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_blkcipher rblkcipher; - strncpy(rblkcipher.type, "ablkcipher", sizeof(rblkcipher.type)); - strncpy(rblkcipher.geniv, alg->cra_ablkcipher.geniv ?: "", + memset(&rblkcipher, 0, sizeof(rblkcipher)); + + strscpy(rblkcipher.type, "ablkcipher", sizeof(rblkcipher.type)); + strscpy(rblkcipher.geniv, alg->cra_ablkcipher.geniv ?: "", sizeof(rblkcipher.geniv)); - rblkcipher.geniv[sizeof(rblkcipher.geniv) - 1] = '\0'; rblkcipher.blocksize = alg->cra_blocksize; rblkcipher.min_keysize = alg->cra_ablkcipher.min_keysize; rblkcipher.max_keysize = alg->cra_ablkcipher.max_keysize; rblkcipher.ivsize = alg->cra_ablkcipher.ivsize; - if (nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, - sizeof(struct crypto_report_blkcipher), &rblkcipher)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, + sizeof(rblkcipher), &rblkcipher); } #else static int crypto_ablkcipher_report(struct sk_buff *skb, struct crypto_alg *alg) @@ -440,23 +436,19 @@ static int crypto_givcipher_report(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_blkcipher rblkcipher; - strncpy(rblkcipher.type, "givcipher", sizeof(rblkcipher.type)); - strncpy(rblkcipher.geniv, alg->cra_ablkcipher.geniv ?: "", + memset(&rblkcipher, 0, sizeof(rblkcipher)); + + strscpy(rblkcipher.type, "givcipher", sizeof(rblkcipher.type)); + strscpy(rblkcipher.geniv, alg->cra_ablkcipher.geniv ?: "", sizeof(rblkcipher.geniv)); - rblkcipher.geniv[sizeof(rblkcipher.geniv) - 1] = '\0'; rblkcipher.blocksize = alg->cra_blocksize; rblkcipher.min_keysize = alg->cra_ablkcipher.min_keysize; rblkcipher.max_keysize = alg->cra_ablkcipher.max_keysize; rblkcipher.ivsize = alg->cra_ablkcipher.ivsize; - if (nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, - sizeof(struct crypto_report_blkcipher), &rblkcipher)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, + sizeof(rblkcipher), &rblkcipher); } #else static int crypto_givcipher_report(struct sk_buff *skb, struct crypto_alg *alg) diff --git a/crypto/acompress.c b/crypto/acompress.c index 1544b7c057fb..0c5bedd06e70 100644 --- a/crypto/acompress.c +++ b/crypto/acompress.c @@ -33,15 +33,11 @@ static int crypto_acomp_report(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_acomp racomp; - strncpy(racomp.type, "acomp", sizeof(racomp.type)); + memset(&racomp, 0, sizeof(racomp)); - if (nla_put(skb, CRYPTOCFGA_REPORT_ACOMP, - sizeof(struct crypto_report_acomp), &racomp)) - goto nla_put_failure; - return 0; + strscpy(racomp.type, "acomp", sizeof(racomp.type)); -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_REPORT_ACOMP, sizeof(racomp), &racomp); } #else static int crypto_acomp_report(struct sk_buff *skb, struct crypto_alg *alg) diff --git a/crypto/aead.c b/crypto/aead.c index 60b3bbe973e7..189c52d1f63a 100644 --- a/crypto/aead.c +++ b/crypto/aead.c @@ -119,20 +119,16 @@ static int crypto_aead_report(struct sk_buff *skb, struct crypto_alg *alg) struct crypto_report_aead raead; struct aead_alg *aead = container_of(alg, struct aead_alg, base); - strncpy(raead.type, "aead", sizeof(raead.type)); - strncpy(raead.geniv, "", sizeof(raead.geniv)); + memset(&raead, 0, sizeof(raead)); + + strscpy(raead.type, "aead", sizeof(raead.type)); + strscpy(raead.geniv, "", sizeof(raead.geniv)); raead.blocksize = alg->cra_blocksize; raead.maxauthsize = aead->maxauthsize; raead.ivsize = aead->ivsize; - if (nla_put(skb, CRYPTOCFGA_REPORT_AEAD, - sizeof(struct crypto_report_aead), &raead)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_REPORT_AEAD, sizeof(raead), &raead); } #else static int crypto_aead_report(struct sk_buff *skb, struct crypto_alg *alg) diff --git a/crypto/ahash.c b/crypto/ahash.c index e21667b4e10a..3a348fbcf8f9 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -498,18 +498,14 @@ static int crypto_ahash_report(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_hash rhash; - strncpy(rhash.type, "ahash", sizeof(rhash.type)); + memset(&rhash, 0, sizeof(rhash)); + + strscpy(rhash.type, "ahash", sizeof(rhash.type)); rhash.blocksize = alg->cra_blocksize; rhash.digestsize = __crypto_hash_alg_common(alg)->digestsize; - if (nla_put(skb, CRYPTOCFGA_REPORT_HASH, - sizeof(struct crypto_report_hash), &rhash)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash); } #else static int crypto_ahash_report(struct sk_buff *skb, struct crypto_alg *alg) diff --git a/crypto/akcipher.c b/crypto/akcipher.c index cfbdb06d8ca8..0cbeae137e0a 100644 --- a/crypto/akcipher.c +++ b/crypto/akcipher.c @@ -30,15 +30,12 @@ static int crypto_akcipher_report(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_akcipher rakcipher; - strncpy(rakcipher.type, "akcipher", sizeof(rakcipher.type)); + memset(&rakcipher, 0, sizeof(rakcipher)); - if (nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER, - sizeof(struct crypto_report_akcipher), &rakcipher)) - goto nla_put_failure; - return 0; + strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type)); -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER, + sizeof(rakcipher), &rakcipher); } #else static int crypto_akcipher_report(struct sk_buff *skb, struct crypto_alg *alg) diff --git a/crypto/blkcipher.c b/crypto/blkcipher.c index f93abf13b5d4..193237514e90 100644 --- a/crypto/blkcipher.c +++ b/crypto/blkcipher.c @@ -507,23 +507,19 @@ static int crypto_blkcipher_report(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_blkcipher rblkcipher; - strncpy(rblkcipher.type, "blkcipher", sizeof(rblkcipher.type)); - strncpy(rblkcipher.geniv, alg->cra_blkcipher.geniv ?: "", + memset(&rblkcipher, 0, sizeof(rblkcipher)); + + strscpy(rblkcipher.type, "blkcipher", sizeof(rblkcipher.type)); + strscpy(rblkcipher.geniv, alg->cra_blkcipher.geniv ?: "", sizeof(rblkcipher.geniv)); - rblkcipher.geniv[sizeof(rblkcipher.geniv) - 1] = '\0'; rblkcipher.blocksize = alg->cra_blocksize; rblkcipher.min_keysize = alg->cra_blkcipher.min_keysize; rblkcipher.max_keysize = alg->cra_blkcipher.max_keysize; rblkcipher.ivsize = alg->cra_blkcipher.ivsize; - if (nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, - sizeof(struct crypto_report_blkcipher), &rblkcipher)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, + sizeof(rblkcipher), &rblkcipher); } #else static int crypto_blkcipher_report(struct sk_buff *skb, struct crypto_alg *alg) diff --git a/crypto/crypto_user_base.c b/crypto/crypto_user_base.c index 1ecd9b08e38f..7021efbb35a1 100644 --- a/crypto/crypto_user_base.c +++ b/crypto/crypto_user_base.c @@ -84,42 +84,38 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_cipher rcipher; - strncpy(rcipher.type, "cipher", sizeof(rcipher.type)); + memset(&rcipher, 0, sizeof(rcipher)); + + strscpy(rcipher.type, "cipher", sizeof(rcipher.type)); rcipher.blocksize = alg->cra_blocksize; rcipher.min_keysize = alg->cra_cipher.cia_min_keysize; rcipher.max_keysize = alg->cra_cipher.cia_max_keysize; - if (nla_put(skb, CRYPTOCFGA_REPORT_CIPHER, - sizeof(struct crypto_report_cipher), &rcipher)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_REPORT_CIPHER, + sizeof(rcipher), &rcipher); } static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_comp rcomp; - strncpy(rcomp.type, "compression", sizeof(rcomp.type)); - if (nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS, - sizeof(struct crypto_report_comp), &rcomp)) - goto nla_put_failure; - return 0; + memset(&rcomp, 0, sizeof(rcomp)); -nla_put_failure: - return -EMSGSIZE; + strscpy(rcomp.type, "compression", sizeof(rcomp.type)); + + return nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS, sizeof(rcomp), &rcomp); } static int crypto_report_one(struct crypto_alg *alg, struct crypto_user_alg *ualg, struct sk_buff *skb) { - strncpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name)); - strncpy(ualg->cru_driver_name, alg->cra_driver_name, + memset(ualg, 0, sizeof(*ualg)); + + strscpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name)); + strscpy(ualg->cru_driver_name, alg->cra_driver_name, sizeof(ualg->cru_driver_name)); - strncpy(ualg->cru_module_name, module_name(alg->cra_module), + strscpy(ualg->cru_module_name, module_name(alg->cra_module), sizeof(ualg->cru_module_name)); ualg->cru_type = 0; @@ -132,9 +128,9 @@ static int crypto_report_one(struct crypto_alg *alg, if (alg->cra_flags & CRYPTO_ALG_LARVAL) { struct crypto_report_larval rl; - strncpy(rl.type, "larval", sizeof(rl.type)); - if (nla_put(skb, CRYPTOCFGA_REPORT_LARVAL, - sizeof(struct crypto_report_larval), &rl)) + memset(&rl, 0, sizeof(rl)); + strscpy(rl.type, "larval", sizeof(rl.type)); + if (nla_put(skb, CRYPTOCFGA_REPORT_LARVAL, sizeof(rl), &rl)) goto nla_put_failure; goto out; } diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c index 1dfaa0ccd555..a6fb2e6f618d 100644 --- a/crypto/crypto_user_stat.c +++ b/crypto/crypto_user_stat.c @@ -39,7 +39,7 @@ static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg) memset(&raead, 0, sizeof(raead)); - strncpy(raead.type, "aead", sizeof(raead.type)); + strscpy(raead.type, "aead", sizeof(raead.type)); v32 = atomic_read(&alg->encrypt_cnt); raead.stat_encrypt_cnt = v32; @@ -52,13 +52,7 @@ static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg) v32 = atomic_read(&alg->aead_err_cnt); raead.stat_aead_err_cnt = v32; - if (nla_put(skb, CRYPTOCFGA_STAT_AEAD, - sizeof(struct crypto_stat), &raead)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_STAT_AEAD, sizeof(raead), &raead); } static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg) @@ -69,7 +63,7 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg) memset(&rcipher, 0, sizeof(rcipher)); - strlcpy(rcipher.type, "cipher", sizeof(rcipher.type)); + strscpy(rcipher.type, "cipher", sizeof(rcipher.type)); v32 = atomic_read(&alg->encrypt_cnt); rcipher.stat_encrypt_cnt = v32; @@ -82,13 +76,7 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg) v32 = atomic_read(&alg->cipher_err_cnt); rcipher.stat_cipher_err_cnt = v32; - if (nla_put(skb, CRYPTOCFGA_STAT_CIPHER, - sizeof(struct crypto_stat), &rcipher)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher); } static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg) @@ -99,7 +87,7 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg) memset(&rcomp, 0, sizeof(rcomp)); - strlcpy(rcomp.type, "compression", sizeof(rcomp.type)); + strscpy(rcomp.type, "compression", sizeof(rcomp.type)); v32 = atomic_read(&alg->compress_cnt); rcomp.stat_compress_cnt = v32; v64 = atomic64_read(&alg->compress_tlen); @@ -111,13 +99,7 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg) v32 = atomic_read(&alg->cipher_err_cnt); rcomp.stat_compress_err_cnt = v32; - if (nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, - sizeof(struct crypto_stat), &rcomp)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp); } static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg) @@ -128,7 +110,7 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg) memset(&racomp, 0, sizeof(racomp)); - strlcpy(racomp.type, "acomp", sizeof(racomp.type)); + strscpy(racomp.type, "acomp", sizeof(racomp.type)); v32 = atomic_read(&alg->compress_cnt); racomp.stat_compress_cnt = v32; v64 = atomic64_read(&alg->compress_tlen); @@ -140,13 +122,7 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg) v32 = atomic_read(&alg->cipher_err_cnt); racomp.stat_compress_err_cnt = v32; - if (nla_put(skb, CRYPTOCFGA_STAT_ACOMP, - sizeof(struct crypto_stat), &racomp)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp); } static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg) @@ -157,7 +133,7 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg) memset(&rakcipher, 0, sizeof(rakcipher)); - strncpy(rakcipher.type, "akcipher", sizeof(rakcipher.type)); + strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type)); v32 = atomic_read(&alg->encrypt_cnt); rakcipher.stat_encrypt_cnt = v32; v64 = atomic64_read(&alg->encrypt_tlen); @@ -173,13 +149,8 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg) v32 = atomic_read(&alg->akcipher_err_cnt); rakcipher.stat_akcipher_err_cnt = v32; - if (nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER, - sizeof(struct crypto_stat), &rakcipher)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER, + sizeof(rakcipher), &rakcipher); } static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg) @@ -189,7 +160,7 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg) memset(&rkpp, 0, sizeof(rkpp)); - strlcpy(rkpp.type, "kpp", sizeof(rkpp.type)); + strscpy(rkpp.type, "kpp", sizeof(rkpp.type)); v = atomic_read(&alg->setsecret_cnt); rkpp.stat_setsecret_cnt = v; @@ -200,13 +171,7 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg) v = atomic_read(&alg->kpp_err_cnt); rkpp.stat_kpp_err_cnt = v; - if (nla_put(skb, CRYPTOCFGA_STAT_KPP, - sizeof(struct crypto_stat), &rkpp)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_STAT_KPP, sizeof(rkpp), &rkpp); } static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg) @@ -217,7 +182,7 @@ static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg) memset(&rhash, 0, sizeof(rhash)); - strncpy(rhash.type, "ahash", sizeof(rhash.type)); + strscpy(rhash.type, "ahash", sizeof(rhash.type)); v32 = atomic_read(&alg->hash_cnt); rhash.stat_hash_cnt = v32; @@ -226,13 +191,7 @@ static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg) v32 = atomic_read(&alg->hash_err_cnt); rhash.stat_hash_err_cnt = v32; - if (nla_put(skb, CRYPTOCFGA_STAT_HASH, - sizeof(struct crypto_stat), &rhash)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash); } static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg) @@ -243,7 +202,7 @@ static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg) memset(&rhash, 0, sizeof(rhash)); - strncpy(rhash.type, "shash", sizeof(rhash.type)); + strscpy(rhash.type, "shash", sizeof(rhash.type)); v32 = atomic_read(&alg->hash_cnt); rhash.stat_hash_cnt = v32; @@ -252,13 +211,7 @@ static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg) v32 = atomic_read(&alg->hash_err_cnt); rhash.stat_hash_err_cnt = v32; - if (nla_put(skb, CRYPTOCFGA_STAT_HASH, - sizeof(struct crypto_stat), &rhash)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash); } static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg) @@ -269,7 +222,7 @@ static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg) memset(&rrng, 0, sizeof(rrng)); - strncpy(rrng.type, "rng", sizeof(rrng.type)); + strscpy(rrng.type, "rng", sizeof(rrng.type)); v32 = atomic_read(&alg->generate_cnt); rrng.stat_generate_cnt = v32; @@ -280,13 +233,7 @@ static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg) v32 = atomic_read(&alg->hash_err_cnt); rrng.stat_rng_err_cnt = v32; - if (nla_put(skb, CRYPTOCFGA_STAT_RNG, - sizeof(struct crypto_stat), &rrng)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng); } static int crypto_reportstat_one(struct crypto_alg *alg, @@ -295,10 +242,10 @@ static int crypto_reportstat_one(struct crypto_alg *alg, { memset(ualg, 0, sizeof(*ualg)); - strlcpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name)); - strlcpy(ualg->cru_driver_name, alg->cra_driver_name, + strscpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name)); + strscpy(ualg->cru_driver_name, alg->cra_driver_name, sizeof(ualg->cru_driver_name)); - strlcpy(ualg->cru_module_name, module_name(alg->cra_module), + strscpy(ualg->cru_module_name, module_name(alg->cra_module), sizeof(ualg->cru_module_name)); ualg->cru_type = 0; @@ -312,9 +259,8 @@ static int crypto_reportstat_one(struct crypto_alg *alg, struct crypto_stat rl; memset(&rl, 0, sizeof(rl)); - strlcpy(rl.type, "larval", sizeof(rl.type)); - if (nla_put(skb, CRYPTOCFGA_STAT_LARVAL, - sizeof(struct crypto_stat), &rl)) + strscpy(rl.type, "larval", sizeof(rl.type)); + if (nla_put(skb, CRYPTOCFGA_STAT_LARVAL, sizeof(rl), &rl)) goto nla_put_failure; goto out; } diff --git a/crypto/kpp.c b/crypto/kpp.c index a90edc27af77..bc2f1006a2f7 100644 --- a/crypto/kpp.c +++ b/crypto/kpp.c @@ -30,15 +30,11 @@ static int crypto_kpp_report(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_kpp rkpp; - strncpy(rkpp.type, "kpp", sizeof(rkpp.type)); + memset(&rkpp, 0, sizeof(rkpp)); - if (nla_put(skb, CRYPTOCFGA_REPORT_KPP, - sizeof(struct crypto_report_kpp), &rkpp)) - goto nla_put_failure; - return 0; + strscpy(rkpp.type, "kpp", sizeof(rkpp.type)); -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_REPORT_KPP, sizeof(rkpp), &rkpp); } #else static int crypto_kpp_report(struct sk_buff *skb, struct crypto_alg *alg) diff --git a/crypto/rng.c b/crypto/rng.c index 547f16ecbfb0..2406501b90b7 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -74,17 +74,13 @@ static int crypto_rng_report(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_rng rrng; - strncpy(rrng.type, "rng", sizeof(rrng.type)); + memset(&rrng, 0, sizeof(rrng)); - rrng.seedsize = seedsize(alg); + strscpy(rrng.type, "rng", sizeof(rrng.type)); - if (nla_put(skb, CRYPTOCFGA_REPORT_RNG, - sizeof(struct crypto_report_rng), &rrng)) - goto nla_put_failure; - return 0; + rrng.seedsize = seedsize(alg); -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_REPORT_RNG, sizeof(rrng), &rrng); } #else static int crypto_rng_report(struct sk_buff *skb, struct crypto_alg *alg) diff --git a/crypto/scompress.c b/crypto/scompress.c index 968bbcf65c94..6f8305f8c300 100644 --- a/crypto/scompress.c +++ b/crypto/scompress.c @@ -40,15 +40,12 @@ static int crypto_scomp_report(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_comp rscomp; - strncpy(rscomp.type, "scomp", sizeof(rscomp.type)); + memset(&rscomp, 0, sizeof(rscomp)); - if (nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS, - sizeof(struct crypto_report_comp), &rscomp)) - goto nla_put_failure; - return 0; + strscpy(rscomp.type, "scomp", sizeof(rscomp.type)); -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS, + sizeof(rscomp), &rscomp); } #else static int crypto_scomp_report(struct sk_buff *skb, struct crypto_alg *alg) diff --git a/crypto/shash.c b/crypto/shash.c index d21f04d70dce..44d297b82a8f 100644 --- a/crypto/shash.c +++ b/crypto/shash.c @@ -408,18 +408,14 @@ static int crypto_shash_report(struct sk_buff *skb, struct crypto_alg *alg) struct crypto_report_hash rhash; struct shash_alg *salg = __crypto_shash_alg(alg); - strncpy(rhash.type, "shash", sizeof(rhash.type)); + memset(&rhash, 0, sizeof(rhash)); + + strscpy(rhash.type, "shash", sizeof(rhash.type)); rhash.blocksize = alg->cra_blocksize; rhash.digestsize = salg->digestsize; - if (nla_put(skb, CRYPTOCFGA_REPORT_HASH, - sizeof(struct crypto_report_hash), &rhash)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash); } #else static int crypto_shash_report(struct sk_buff *skb, struct crypto_alg *alg) diff --git a/crypto/skcipher.c b/crypto/skcipher.c index 4caab81d2d02..17be8d9c714e 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -897,21 +897,18 @@ static int crypto_skcipher_report(struct sk_buff *skb, struct crypto_alg *alg) struct skcipher_alg *skcipher = container_of(alg, struct skcipher_alg, base); - strncpy(rblkcipher.type, "skcipher", sizeof(rblkcipher.type)); - strncpy(rblkcipher.geniv, "", sizeof(rblkcipher.geniv)); + memset(&rblkcipher, 0, sizeof(rblkcipher)); + + strscpy(rblkcipher.type, "skcipher", sizeof(rblkcipher.type)); + strscpy(rblkcipher.geniv, "", sizeof(rblkcipher.geniv)); rblkcipher.blocksize = alg->cra_blocksize; rblkcipher.min_keysize = skcipher->min_keysize; rblkcipher.max_keysize = skcipher->max_keysize; rblkcipher.ivsize = skcipher->ivsize; - if (nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, - sizeof(struct crypto_report_blkcipher), &rblkcipher)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; + return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, + sizeof(rblkcipher), &rblkcipher); } #else static int crypto_skcipher_report(struct sk_buff *skb, struct crypto_alg *alg) -- cgit v1.2.3-59-g8ed1b From 196ad6043e9fe93c4ae3dac02b5c8fd337f58c2d Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Sun, 4 Nov 2018 10:05:24 +0000 Subject: crypto: testmgr - mark cts(cbc(aes)) as FIPS allowed As per Sp800-38A addendum from Oct 2010[1], cts(cbc(aes)) is allowed as a FIPS mode algorithm. Mark it as such. [1] https://csrc.nist.gov/publications/detail/sp/800-38a/addendum/final Signed-off-by: Gilad Ben-Yossef Reviewed-by: Stephan Mueller Signed-off-by: Herbert Xu --- crypto/testmgr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 84937ceb4bd8..512ebf697f7e 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -2812,6 +2812,7 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "cts(cbc(aes))", .test = alg_test_skcipher, + .fips_allowed = 1, .suite = { .cipher = __VECS(cts_mode_tv_template) } -- cgit v1.2.3-59-g8ed1b From 2eb4942b6609d35a4e835644a33203b0aef7443d Mon Sep 17 00:00:00 2001 From: Vitaly Chikunov Date: Mon, 5 Nov 2018 11:36:18 +0300 Subject: crypto: ecc - check for invalid values in the key verification test Currently used scalar multiplication algorithm (Matthieu Rivain, 2011) have invalid values for scalar == 1, n-1, and for regularized version n-2, which was previously not checked. Verify that they are not used as private keys. Signed-off-by: Vitaly Chikunov Signed-off-by: Herbert Xu --- crypto/ecc.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/crypto/ecc.c b/crypto/ecc.c index 8facafd67802..9d24e6522730 100644 --- a/crypto/ecc.c +++ b/crypto/ecc.c @@ -904,30 +904,43 @@ static inline void ecc_swap_digits(const u64 *in, u64 *out, out[i] = __swab64(in[ndigits - 1 - i]); } -int ecc_is_key_valid(unsigned int curve_id, unsigned int ndigits, - const u64 *private_key, unsigned int private_key_len) +static int __ecc_is_key_valid(const struct ecc_curve *curve, + const u64 *private_key, unsigned int ndigits) { - int nbytes; - const struct ecc_curve *curve = ecc_get_curve(curve_id); + u64 one[ECC_MAX_DIGITS] = { 1, }; + u64 res[ECC_MAX_DIGITS]; if (!private_key) return -EINVAL; - nbytes = ndigits << ECC_DIGITS_TO_BYTES_SHIFT; - - if (private_key_len != nbytes) + if (curve->g.ndigits != ndigits) return -EINVAL; - if (vli_is_zero(private_key, ndigits)) + /* Make sure the private key is in the range [2, n-3]. */ + if (vli_cmp(one, private_key, ndigits) != -1) return -EINVAL; - - /* Make sure the private key is in the range [1, n-1]. */ - if (vli_cmp(curve->n, private_key, ndigits) != 1) + vli_sub(res, curve->n, one, ndigits); + vli_sub(res, res, one, ndigits); + if (vli_cmp(res, private_key, ndigits) != 1) return -EINVAL; return 0; } +int ecc_is_key_valid(unsigned int curve_id, unsigned int ndigits, + const u64 *private_key, unsigned int private_key_len) +{ + int nbytes; + const struct ecc_curve *curve = ecc_get_curve(curve_id); + + nbytes = ndigits << ECC_DIGITS_TO_BYTES_SHIFT; + + if (private_key_len != nbytes) + return -EINVAL; + + return __ecc_is_key_valid(curve, private_key, ndigits); +} + /* * ECC private keys are generated using the method of extra random bits, * equivalent to that described in FIPS 186-4, Appendix B.4.1. @@ -971,11 +984,8 @@ int ecc_gen_privkey(unsigned int curve_id, unsigned int ndigits, u64 *privkey) if (err) return err; - if (vli_is_zero(priv, ndigits)) - return -EINVAL; - - /* Make sure the private key is in the range [1, n-1]. */ - if (vli_cmp(curve->n, priv, ndigits) != 1) + /* Make sure the private key is in the valid range. */ + if (__ecc_is_key_valid(curve, priv, ndigits)) return -EINVAL; ecc_swap_digits(priv, privkey, ndigits); -- cgit v1.2.3-59-g8ed1b From ecd6d5c9cba5fc6053ba21e3f8a4c536f65ea27a Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Mon, 5 Nov 2018 12:05:01 +0000 Subject: crypto: cts - document NIST standard status cts(cbc(aes)) as used in the kernel has been added to NIST standard as CBC-CS3. Document it as such. Signed-off-by: Gilad Ben-Yossef Suggested-by: Stephan Mueller Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/Kconfig | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index 752005201013..06eb23cade43 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -430,11 +430,14 @@ config CRYPTO_CTS help CTS: Cipher Text Stealing This is the Cipher Text Stealing mode as described by - Section 8 of rfc2040 and referenced by rfc3962. - (rfc3962 includes errata information in its Appendix A) + Section 8 of rfc2040 and referenced by rfc3962 + (rfc3962 includes errata information in its Appendix A) or + CBC-CS3 as defined by NIST in Sp800-38A addendum from Oct 2010. This mode is required for Kerberos gss mechanism support for AES encryption. + See: https://csrc.nist.gov/publications/detail/sp/800-38a/addendum/final + config CRYPTO_ECB tristate "ECB support" select CRYPTO_BLKCIPHER -- cgit v1.2.3-59-g8ed1b From 4f0129d13e69bad0363fd75553fb22897b32c379 Mon Sep 17 00:00:00 2001 From: Raveendra Padasalagi Date: Tue, 6 Nov 2018 13:58:58 +0530 Subject: crypto: bcm - fix normal/non key hash algorithm failure Remove setkey() callback handler for normal/non key hash algorithms and keep it for AES-CBC/CMAC which needs key. Fixes: 9d12ba86f818 ("crypto: brcm - Add Broadcom SPU driver") Signed-off-by: Raveendra Padasalagi Signed-off-by: Herbert Xu --- drivers/crypto/bcm/cipher.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/bcm/cipher.c b/drivers/crypto/bcm/cipher.c index 2d1f1db9f807..4d67e22a4664 100644 --- a/drivers/crypto/bcm/cipher.c +++ b/drivers/crypto/bcm/cipher.c @@ -4652,12 +4652,16 @@ static int spu_register_ahash(struct iproc_alg_s *driver_alg) hash->halg.statesize = sizeof(struct spu_hash_export_s); if (driver_alg->auth_info.mode != HASH_MODE_HMAC) { - hash->setkey = ahash_setkey; hash->init = ahash_init; hash->update = ahash_update; hash->final = ahash_final; hash->finup = ahash_finup; hash->digest = ahash_digest; + if ((driver_alg->auth_info.alg == HASH_ALG_AES) && + ((driver_alg->auth_info.mode == HASH_MODE_XCBC) || + (driver_alg->auth_info.mode == HASH_MODE_CMAC))) { + hash->setkey = ahash_setkey; + } } else { hash->setkey = ahash_hmac_setkey; hash->init = ahash_hmac_init; -- cgit v1.2.3-59-g8ed1b From d65ddecbea3c07f9f93af9d32680e650f20aa102 Mon Sep 17 00:00:00 2001 From: Brajeswar Ghosh Date: Tue, 6 Nov 2018 17:46:14 +0530 Subject: crypto: aes-ce - Remove duplicate header Remove asm/hwcap.h which is included more than once Signed-off-by: Brajeswar Ghosh Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/aes-ce-glue.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c index d0a9cec73707..5affb8482379 100644 --- a/arch/arm/crypto/aes-ce-glue.c +++ b/arch/arm/crypto/aes-ce-glue.c @@ -10,7 +10,6 @@ #include #include -#include #include #include #include -- cgit v1.2.3-59-g8ed1b From fe18957e8e87403a9d4be8e8a62352ef107def99 Mon Sep 17 00:00:00 2001 From: Vitaly Chikunov Date: Wed, 7 Nov 2018 00:00:01 +0300 Subject: crypto: streebog - add Streebog hash function Add GOST/IETF Streebog hash function (GOST R 34.11-2012, RFC 6986) generic hash transformation. Cc: linux-integrity@vger.kernel.org Signed-off-by: Vitaly Chikunov Reviewed-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/Kconfig | 12 + crypto/Makefile | 1 + crypto/streebog_generic.c | 1140 +++++++++++++++++++++++++++++++++++++++++++++ include/crypto/streebog.h | 34 ++ 4 files changed, 1187 insertions(+) create mode 100644 crypto/streebog_generic.c create mode 100644 include/crypto/streebog.h diff --git a/crypto/Kconfig b/crypto/Kconfig index 06eb23cade43..62dbd1a99fa3 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -939,6 +939,18 @@ config CRYPTO_SM3 http://www.oscca.gov.cn/UpFile/20101222141857786.pdf https://datatracker.ietf.org/doc/html/draft-shen-sm3-hash +config CRYPTO_STREEBOG + tristate "Streebog Hash Function" + select CRYPTO_HASH + help + Streebog Hash Function (GOST R 34.11-2012, RFC 6986) is one of the Russian + cryptographic standard algorithms (called GOST algorithms). + This setting enables two hash algorithms with 256 and 512 bits output. + + References: + https://tc26.ru/upload/iblock/fed/feddbb4d26b685903faa2ba11aea43f6.pdf + https://tools.ietf.org/html/rfc6986 + config CRYPTO_TGR192 tristate "Tiger digest algorithms" select CRYPTO_HASH diff --git a/crypto/Makefile b/crypto/Makefile index 5c207c76abf7..abbd86fdbad2 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -71,6 +71,7 @@ obj-$(CONFIG_CRYPTO_SHA256) += sha256_generic.o obj-$(CONFIG_CRYPTO_SHA512) += sha512_generic.o obj-$(CONFIG_CRYPTO_SHA3) += sha3_generic.o obj-$(CONFIG_CRYPTO_SM3) += sm3_generic.o +obj-$(CONFIG_CRYPTO_STREEBOG) += streebog_generic.o obj-$(CONFIG_CRYPTO_WP512) += wp512.o CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149 obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o diff --git a/crypto/streebog_generic.c b/crypto/streebog_generic.c new file mode 100644 index 000000000000..03272a22afce --- /dev/null +++ b/crypto/streebog_generic.c @@ -0,0 +1,1140 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-2-Clause +/* + * Streebog hash function as specified by GOST R 34.11-2012 and + * described at https://tools.ietf.org/html/rfc6986 + * + * Copyright (c) 2013 Alexey Degtyarev + * Copyright (c) 2018 Vitaly Chikunov + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include +#include +#include +#include + +static const struct streebog_uint512 buffer0 = { { + 0, 0, 0, 0, 0, 0, 0, 0 +} }; + +static const struct streebog_uint512 buffer512 = { { + cpu_to_le64(0x200), 0, 0, 0, 0, 0, 0, 0 +} }; + +static const struct streebog_uint512 C[12] = { + { { + cpu_to_le64(0xdd806559f2a64507ULL), + cpu_to_le64(0x05767436cc744d23ULL), + cpu_to_le64(0xa2422a08a460d315ULL), + cpu_to_le64(0x4b7ce09192676901ULL), + cpu_to_le64(0x714eb88d7585c4fcULL), + cpu_to_le64(0x2f6a76432e45d016ULL), + cpu_to_le64(0xebcb2f81c0657c1fULL), + cpu_to_le64(0xb1085bda1ecadae9ULL) + } }, + { { + cpu_to_le64(0xe679047021b19bb7ULL), + cpu_to_le64(0x55dda21bd7cbcd56ULL), + cpu_to_le64(0x5cb561c2db0aa7caULL), + cpu_to_le64(0x9ab5176b12d69958ULL), + cpu_to_le64(0x61d55e0f16b50131ULL), + cpu_to_le64(0xf3feea720a232b98ULL), + cpu_to_le64(0x4fe39d460f70b5d7ULL), + cpu_to_le64(0x6fa3b58aa99d2f1aULL) + } }, + { { + cpu_to_le64(0x991e96f50aba0ab2ULL), + cpu_to_le64(0xc2b6f443867adb31ULL), + cpu_to_le64(0xc1c93a376062db09ULL), + cpu_to_le64(0xd3e20fe490359eb1ULL), + cpu_to_le64(0xf2ea7514b1297b7bULL), + cpu_to_le64(0x06f15e5f529c1f8bULL), + cpu_to_le64(0x0a39fc286a3d8435ULL), + cpu_to_le64(0xf574dcac2bce2fc7ULL) + } }, + { { + cpu_to_le64(0x220cbebc84e3d12eULL), + cpu_to_le64(0x3453eaa193e837f1ULL), + cpu_to_le64(0xd8b71333935203beULL), + cpu_to_le64(0xa9d72c82ed03d675ULL), + cpu_to_le64(0x9d721cad685e353fULL), + cpu_to_le64(0x488e857e335c3c7dULL), + cpu_to_le64(0xf948e1a05d71e4ddULL), + cpu_to_le64(0xef1fdfb3e81566d2ULL) + } }, + { { + cpu_to_le64(0x601758fd7c6cfe57ULL), + cpu_to_le64(0x7a56a27ea9ea63f5ULL), + cpu_to_le64(0xdfff00b723271a16ULL), + cpu_to_le64(0xbfcd1747253af5a3ULL), + cpu_to_le64(0x359e35d7800fffbdULL), + cpu_to_le64(0x7f151c1f1686104aULL), + cpu_to_le64(0x9a3f410c6ca92363ULL), + cpu_to_le64(0x4bea6bacad474799ULL) + } }, + { { + cpu_to_le64(0xfa68407a46647d6eULL), + cpu_to_le64(0xbf71c57236904f35ULL), + cpu_to_le64(0x0af21f66c2bec6b6ULL), + cpu_to_le64(0xcffaa6b71c9ab7b4ULL), + cpu_to_le64(0x187f9ab49af08ec6ULL), + cpu_to_le64(0x2d66c4f95142a46cULL), + cpu_to_le64(0x6fa4c33b7a3039c0ULL), + cpu_to_le64(0xae4faeae1d3ad3d9ULL) + } }, + { { + cpu_to_le64(0x8886564d3a14d493ULL), + cpu_to_le64(0x3517454ca23c4af3ULL), + cpu_to_le64(0x06476983284a0504ULL), + cpu_to_le64(0x0992abc52d822c37ULL), + cpu_to_le64(0xd3473e33197a93c9ULL), + cpu_to_le64(0x399ec6c7e6bf87c9ULL), + cpu_to_le64(0x51ac86febf240954ULL), + cpu_to_le64(0xf4c70e16eeaac5ecULL) + } }, + { { + cpu_to_le64(0xa47f0dd4bf02e71eULL), + cpu_to_le64(0x36acc2355951a8d9ULL), + cpu_to_le64(0x69d18d2bd1a5c42fULL), + cpu_to_le64(0xf4892bcb929b0690ULL), + cpu_to_le64(0x89b4443b4ddbc49aULL), + cpu_to_le64(0x4eb7f8719c36de1eULL), + cpu_to_le64(0x03e7aa020c6e4141ULL), + cpu_to_le64(0x9b1f5b424d93c9a7ULL) + } }, + { { + cpu_to_le64(0x7261445183235adbULL), + cpu_to_le64(0x0e38dc92cb1f2a60ULL), + cpu_to_le64(0x7b2b8a9aa6079c54ULL), + cpu_to_le64(0x800a440bdbb2ceb1ULL), + cpu_to_le64(0x3cd955b7e00d0984ULL), + cpu_to_le64(0x3a7d3a1b25894224ULL), + cpu_to_le64(0x944c9ad8ec165fdeULL), + cpu_to_le64(0x378f5a541631229bULL) + } }, + { { + cpu_to_le64(0x74b4c7fb98459cedULL), + cpu_to_le64(0x3698fad1153bb6c3ULL), + cpu_to_le64(0x7a1e6c303b7652f4ULL), + cpu_to_le64(0x9fe76702af69334bULL), + cpu_to_le64(0x1fffe18a1b336103ULL), + cpu_to_le64(0x8941e71cff8a78dbULL), + cpu_to_le64(0x382ae548b2e4f3f3ULL), + cpu_to_le64(0xabbedea680056f52ULL) + } }, + { { + cpu_to_le64(0x6bcaa4cd81f32d1bULL), + cpu_to_le64(0xdea2594ac06fd85dULL), + cpu_to_le64(0xefbacd1d7d476e98ULL), + cpu_to_le64(0x8a1d71efea48b9caULL), + cpu_to_le64(0x2001802114846679ULL), + cpu_to_le64(0xd8fa6bbbebab0761ULL), + cpu_to_le64(0x3002c6cd635afe94ULL), + cpu_to_le64(0x7bcd9ed0efc889fbULL) + } }, + { { + cpu_to_le64(0x48bc924af11bd720ULL), + cpu_to_le64(0xfaf417d5d9b21b99ULL), + cpu_to_le64(0xe71da4aa88e12852ULL), + cpu_to_le64(0x5d80ef9d1891cc86ULL), + cpu_to_le64(0xf82012d430219f9bULL), + cpu_to_le64(0xcda43c32bcdf1d77ULL), + cpu_to_le64(0xd21380b00449b17aULL), + cpu_to_le64(0x378ee767f11631baULL) + } } +}; + +static const u8 Tau[64] = { + 0, 8, 16, 24, 32, 40, 48, 56, + 1, 9, 17, 25, 33, 41, 49, 57, + 2, 10, 18, 26, 34, 42, 50, 58, + 3, 11, 19, 27, 35, 43, 51, 59, + 4, 12, 20, 28, 36, 44, 52, 60, + 5, 13, 21, 29, 37, 45, 53, 61, + 6, 14, 22, 30, 38, 46, 54, 62, + 7, 15, 23, 31, 39, 47, 55, 63 +}; + +static const u8 Pi[256] = { + 252, 238, 221, 17, 207, 110, 49, 22, + 251, 196, 250, 218, 35, 197, 4, 77, + 233, 119, 240, 219, 147, 46, 153, 186, + 23, 54, 241, 187, 20, 205, 95, 193, + 249, 24, 101, 90, 226, 92, 239, 33, + 129, 28, 60, 66, 139, 1, 142, 79, + 5, 132, 2, 174, 227, 106, 143, 160, + 6, 11, 237, 152, 127, 212, 211, 31, + 235, 52, 44, 81, 234, 200, 72, 171, + 242, 42, 104, 162, 253, 58, 206, 204, + 181, 112, 14, 86, 8, 12, 118, 18, + 191, 114, 19, 71, 156, 183, 93, 135, + 21, 161, 150, 41, 16, 123, 154, 199, + 243, 145, 120, 111, 157, 158, 178, 177, + 50, 117, 25, 61, 255, 53, 138, 126, + 109, 84, 198, 128, 195, 189, 13, 87, + 223, 245, 36, 169, 62, 168, 67, 201, + 215, 121, 214, 246, 124, 34, 185, 3, + 224, 15, 236, 222, 122, 148, 176, 188, + 220, 232, 40, 80, 78, 51, 10, 74, + 167, 151, 96, 115, 30, 0, 98, 68, + 26, 184, 56, 130, 100, 159, 38, 65, + 173, 69, 70, 146, 39, 94, 85, 47, + 140, 163, 165, 125, 105, 213, 149, 59, + 7, 88, 179, 64, 134, 172, 29, 247, + 48, 55, 107, 228, 136, 217, 231, 137, + 225, 27, 131, 73, 76, 63, 248, 254, + 141, 83, 170, 144, 202, 216, 133, 97, + 32, 113, 103, 164, 45, 43, 9, 91, + 203, 155, 37, 208, 190, 229, 108, 82, + 89, 166, 116, 210, 230, 244, 180, 192, + 209, 102, 175, 194, 57, 75, 99, 182 +}; + +static const unsigned long long Ax[8][256] = { + { + 0xd01f715b5c7ef8e6ULL, 0x16fa240980778325ULL, 0xa8a42e857ee049c8ULL, + 0x6ac1068fa186465bULL, 0x6e417bd7a2e9320bULL, 0x665c8167a437daabULL, + 0x7666681aa89617f6ULL, 0x4b959163700bdcf5ULL, 0xf14be6b78df36248ULL, + 0xc585bd689a625cffULL, 0x9557d7fca67d82cbULL, 0x89f0b969af6dd366ULL, + 0xb0833d48749f6c35ULL, 0xa1998c23b1ecbc7cULL, 0x8d70c431ac02a736ULL, + 0xd6dfbc2fd0a8b69eULL, 0x37aeb3e551fa198bULL, 0x0b7d128a40b5cf9cULL, + 0x5a8f2008b5780cbcULL, 0xedec882284e333e5ULL, 0xd25fc177d3c7c2ceULL, + 0x5e0f5d50b61778ecULL, 0x1d873683c0c24cb9ULL, 0xad040bcbb45d208cULL, + 0x2f89a0285b853c76ULL, 0x5732fff6791b8d58ULL, 0x3e9311439ef6ec3fULL, + 0xc9183a809fd3c00fULL, 0x83adf3f5260a01eeULL, 0xa6791941f4e8ef10ULL, + 0x103ae97d0ca1cd5dULL, 0x2ce948121dee1b4aULL, 0x39738421dbf2bf53ULL, + 0x093da2a6cf0cf5b4ULL, 0xcd9847d89cbcb45fULL, 0xf9561c078b2d8ae8ULL, + 0x9c6a755a6971777fULL, 0xbc1ebaa0712ef0c5ULL, 0x72e61542abf963a6ULL, + 0x78bb5fde229eb12eULL, 0x14ba94250fceb90dULL, 0x844d6697630e5282ULL, + 0x98ea08026a1e032fULL, 0xf06bbea144217f5cULL, 0xdb6263d11ccb377aULL, + 0x641c314b2b8ee083ULL, 0x320e96ab9b4770cfULL, 0x1ee7deb986a96b85ULL, + 0xe96cf57a878c47b5ULL, 0xfdd6615f8842feb8ULL, 0xc83862965601dd1bULL, + 0x2ea9f83e92572162ULL, 0xf876441142ff97fcULL, 0xeb2c455608357d9dULL, + 0x5612a7e0b0c9904cULL, 0x6c01cbfb2d500823ULL, 0x4548a6a7fa037a2dULL, + 0xabc4c6bf388b6ef4ULL, 0xbade77d4fdf8bebdULL, 0x799b07c8eb4cac3aULL, + 0x0c9d87e805b19cf0ULL, 0xcb588aac106afa27ULL, 0xea0c1d40c1e76089ULL, + 0x2869354a1e816f1aULL, 0xff96d17307fbc490ULL, 0x9f0a9d602f1a5043ULL, + 0x96373fc6e016a5f7ULL, 0x5292dab8b3a6e41cULL, 0x9b8ae0382c752413ULL, + 0x4f15ec3b7364a8a5ULL, 0x3fb349555724f12bULL, 0xc7c50d4415db66d7ULL, + 0x92b7429ee379d1a7ULL, 0xd37f99611a15dfdaULL, 0x231427c05e34a086ULL, + 0xa439a96d7b51d538ULL, 0xb403401077f01865ULL, 0xdda2aea5901d7902ULL, + 0x0a5d4a9c8967d288ULL, 0xc265280adf660f93ULL, 0x8bb0094520d4e94eULL, + 0x2a29856691385532ULL, 0x42a833c5bf072941ULL, 0x73c64d54622b7eb2ULL, + 0x07e095624504536cULL, 0x8a905153e906f45aULL, 0x6f6123c16b3b2f1fULL, + 0xc6e55552dc097bc3ULL, 0x4468feb133d16739ULL, 0xe211e7f0c7398829ULL, + 0xa2f96419f7879b40ULL, 0x19074bdbc3ad38e9ULL, 0xf4ebc3f9474e0b0cULL, + 0x43886bd376d53455ULL, 0xd8028beb5aa01046ULL, 0x51f23282f5cdc320ULL, + 0xe7b1c2be0d84e16dULL, 0x081dfab006dee8a0ULL, 0x3b33340d544b857bULL, + 0x7f5bcabc679ae242ULL, 0x0edd37c48a08a6d8ULL, 0x81ed43d9a9b33bc6ULL, + 0xb1a3655ebd4d7121ULL, 0x69a1eeb5e7ed6167ULL, 0xf6ab73d5c8f73124ULL, + 0x1a67a3e185c61fd5ULL, 0x2dc91004d43c065eULL, 0x0240b02c8fb93a28ULL, + 0x90f7f2b26cc0eb8fULL, 0x3cd3a16f114fd617ULL, 0xaae49ea9f15973e0ULL, + 0x06c0cd748cd64e78ULL, 0xda423bc7d5192a6eULL, 0xc345701c16b41287ULL, + 0x6d2193ede4821537ULL, 0xfcf639494190e3acULL, 0x7c3b228621f1c57eULL, + 0xfb16ac2b0494b0c0ULL, 0xbf7e529a3745d7f9ULL, 0x6881b6a32e3f7c73ULL, + 0xca78d2bad9b8e733ULL, 0xbbfe2fc2342aa3a9ULL, 0x0dbddffecc6381e4ULL, + 0x70a6a56e2440598eULL, 0xe4d12a844befc651ULL, 0x8c509c2765d0ba22ULL, + 0xee8c6018c28814d9ULL, 0x17da7c1f49a59e31ULL, 0x609c4c1328e194d3ULL, + 0xb3e3d57232f44b09ULL, 0x91d7aaa4a512f69bULL, 0x0ffd6fd243dabbccULL, + 0x50d26a943c1fde34ULL, 0x6be15e9968545b4fULL, 0x94778fea6faf9fdfULL, + 0x2b09dd7058ea4826ULL, 0x677cd9716de5c7bfULL, 0x49d5214fffb2e6ddULL, + 0x0360e83a466b273cULL, 0x1fc786af4f7b7691ULL, 0xa0b9d435783ea168ULL, + 0xd49f0c035f118cb6ULL, 0x01205816c9d21d14ULL, 0xac2453dd7d8f3d98ULL, + 0x545217cc3f70aa64ULL, 0x26b4028e9489c9c2ULL, 0xdec2469fd6765e3eULL, + 0x04807d58036f7450ULL, 0xe5f17292823ddb45ULL, 0xf30b569b024a5860ULL, + 0x62dcfc3fa758aefbULL, 0xe84cad6c4e5e5aa1ULL, 0xccb81fce556ea94bULL, + 0x53b282ae7a74f908ULL, 0x1b47fbf74c1402c1ULL, 0x368eebf39828049fULL, + 0x7afbeff2ad278b06ULL, 0xbe5e0a8cfe97caedULL, 0xcfd8f7f413058e77ULL, + 0xf78b2bc301252c30ULL, 0x4d555c17fcdd928dULL, 0x5f2f05467fc565f8ULL, + 0x24f4b2a21b30f3eaULL, 0x860dd6bbecb768aaULL, 0x4c750401350f8f99ULL, + 0x0000000000000000ULL, 0xecccd0344d312ef1ULL, 0xb5231806be220571ULL, + 0xc105c030990d28afULL, 0x653c695de25cfd97ULL, 0x159acc33c61ca419ULL, + 0xb89ec7f872418495ULL, 0xa9847693b73254dcULL, 0x58cf90243ac13694ULL, + 0x59efc832f3132b80ULL, 0x5c4fed7c39ae42c4ULL, 0x828dabe3efd81cfaULL, + 0xd13f294d95ace5f2ULL, 0x7d1b7a90e823d86aULL, 0xb643f03cf849224dULL, + 0x3df3f979d89dcb03ULL, 0x7426d836272f2ddeULL, 0xdfe21e891fa4432aULL, + 0x3a136c1b9d99986fULL, 0xfa36f43dcd46add4ULL, 0xc025982650df35bbULL, + 0x856d3e81aadc4f96ULL, 0xc4a5e57e53b041ebULL, 0x4708168b75ba4005ULL, + 0xaf44bbe73be41aa4ULL, 0x971767d029c4b8e3ULL, 0xb9be9feebb939981ULL, + 0x215497ecd18d9aaeULL, 0x316e7e91dd2c57f3ULL, 0xcef8afe2dad79363ULL, + 0x3853dc371220a247ULL, 0x35ee03c9de4323a3ULL, 0xe6919aa8c456fc79ULL, + 0xe05157dc4880b201ULL, 0x7bdbb7e464f59612ULL, 0x127a59518318f775ULL, + 0x332ecebd52956ddbULL, 0x8f30741d23bb9d1eULL, 0xd922d3fd93720d52ULL, + 0x7746300c61440ae2ULL, 0x25d4eab4d2e2eefeULL, 0x75068020eefd30caULL, + 0x135a01474acaea61ULL, 0x304e268714fe4ae7ULL, 0xa519f17bb283c82cULL, + 0xdc82f6b359cf6416ULL, 0x5baf781e7caa11a8ULL, 0xb2c38d64fb26561dULL, + 0x34ce5bdf17913eb7ULL, 0x5d6fb56af07c5fd0ULL, 0x182713cd0a7f25fdULL, + 0x9e2ac576e6c84d57ULL, 0x9aaab82ee5a73907ULL, 0xa3d93c0f3e558654ULL, + 0x7e7b92aaae48ff56ULL, 0x872d8ead256575beULL, 0x41c8dbfff96c0e7dULL, + 0x99ca5014a3cc1e3bULL, 0x40e883e930be1369ULL, 0x1ca76e95091051adULL, + 0x4e35b42dbab6b5b1ULL, 0x05a0254ecabd6944ULL, 0xe1710fca8152af15ULL, + 0xf22b0e8dcb984574ULL, 0xb763a82a319b3f59ULL, 0x63fca4296e8ab3efULL, + 0x9d4a2d4ca0a36a6bULL, 0xe331bfe60eeb953dULL, 0xd5bf541596c391a2ULL, + 0xf5cb9bef8e9c1618ULL, 0x46284e9dbc685d11ULL, 0x2074cffa185f87baULL, + 0xbd3ee2b6b8fcedd1ULL, 0xae64e3f1f23607b0ULL, 0xfeb68965ce29d984ULL, + 0x55724fdaf6a2b770ULL, 0x29496d5cd753720eULL, 0xa75941573d3af204ULL, + 0x8e102c0bea69800aULL, 0x111ab16bc573d049ULL, 0xd7ffe439197aab8aULL, + 0xefac380e0b5a09cdULL, 0x48f579593660fbc9ULL, 0x22347fd697e6bd92ULL, + 0x61bc1405e13389c7ULL, 0x4ab5c975b9d9c1e1ULL, 0x80cd1bcf606126d2ULL, + 0x7186fd78ed92449aULL, 0x93971a882aabccb3ULL, 0x88d0e17f66bfce72ULL, + 0x27945a985d5bd4d6ULL + }, { + 0xde553f8c05a811c8ULL, 0x1906b59631b4f565ULL, 0x436e70d6b1964ff7ULL, + 0x36d343cb8b1e9d85ULL, 0x843dfacc858aab5aULL, 0xfdfc95c299bfc7f9ULL, + 0x0f634bdea1d51fa2ULL, 0x6d458b3b76efb3cdULL, 0x85c3f77cf8593f80ULL, + 0x3c91315fbe737cb2ULL, 0x2148b03366ace398ULL, 0x18f8b8264c6761bfULL, + 0xc830c1c495c9fb0fULL, 0x981a76102086a0aaULL, 0xaa16012142f35760ULL, + 0x35cc54060c763cf6ULL, 0x42907d66cc45db2dULL, 0x8203d44b965af4bcULL, + 0x3d6f3cefc3a0e868ULL, 0xbc73ff69d292bda7ULL, 0x8722ed0102e20a29ULL, + 0x8f8185e8cd34deb7ULL, 0x9b0561dda7ee01d9ULL, 0x5335a0193227fad6ULL, + 0xc9cecc74e81a6fd5ULL, 0x54f5832e5c2431eaULL, 0x99e47ba05d553470ULL, + 0xf7bee756acd226ceULL, 0x384e05a5571816fdULL, 0xd1367452a47d0e6aULL, + 0xf29fde1c386ad85bULL, 0x320c77316275f7caULL, 0xd0c879e2d9ae9ab0ULL, + 0xdb7406c69110ef5dULL, 0x45505e51a2461011ULL, 0xfc029872e46c5323ULL, + 0xfa3cb6f5f7bc0cc5ULL, 0x031f17cd8768a173ULL, 0xbd8df2d9af41297dULL, + 0x9d3b4f5ab43e5e3fULL, 0x4071671b36feee84ULL, 0x716207e7d3e3b83dULL, + 0x48d20ff2f9283a1aULL, 0x27769eb4757cbc7eULL, 0x5c56ebc793f2e574ULL, + 0xa48b474f9ef5dc18ULL, 0x52cbada94ff46e0cULL, 0x60c7da982d8199c6ULL, + 0x0e9d466edc068b78ULL, 0x4eec2175eaf865fcULL, 0x550b8e9e21f7a530ULL, + 0x6b7ba5bc653fec2bULL, 0x5eb7f1ba6949d0ddULL, 0x57ea94e3db4c9099ULL, + 0xf640eae6d101b214ULL, 0xdd4a284182c0b0bbULL, 0xff1d8fbf6304f250ULL, + 0xb8accb933bf9d7e8ULL, 0xe8867c478eb68c4dULL, 0x3f8e2692391bddc1ULL, + 0xcb2fd60912a15a7cULL, 0xaec935dbab983d2fULL, 0xf55ffd2b56691367ULL, + 0x80e2ce366ce1c115ULL, 0x179bf3f8edb27e1dULL, 0x01fe0db07dd394daULL, + 0xda8a0b76ecc37b87ULL, 0x44ae53e1df9584cbULL, 0xb310b4b77347a205ULL, + 0xdfab323c787b8512ULL, 0x3b511268d070b78eULL, 0x65e6e3d2b9396753ULL, + 0x6864b271e2574d58ULL, 0x259784c98fc789d7ULL, 0x02e11a7dfabb35a9ULL, + 0x8841a6dfa337158bULL, 0x7ade78c39b5dcdd0ULL, 0xb7cf804d9a2cc84aULL, + 0x20b6bd831b7f7742ULL, 0x75bd331d3a88d272ULL, 0x418f6aab4b2d7a5eULL, + 0xd9951cbb6babdaf4ULL, 0xb6318dfde7ff5c90ULL, 0x1f389b112264aa83ULL, + 0x492c024284fbaec0ULL, 0xe33a0363c608f9a0ULL, 0x2688930408af28a4ULL, + 0xc7538a1a341ce4adULL, 0x5da8e677ee2171aeULL, 0x8c9e92254a5c7fc4ULL, + 0x63d8cd55aae938b5ULL, 0x29ebd8daa97a3706ULL, 0x959827b37be88aa1ULL, + 0x1484e4356adadf6eULL, 0xa7945082199d7d6bULL, 0xbf6ce8a455fa1cd4ULL, + 0x9cc542eac9edcae5ULL, 0x79c16f0e1c356ca3ULL, 0x89bfab6fdee48151ULL, + 0xd4174d1830c5f0ffULL, 0x9258048415eb419dULL, 0x6139d72850520d1cULL, + 0x6a85a80c18ec78f1ULL, 0xcd11f88e0171059aULL, 0xcceff53e7ca29140ULL, + 0xd229639f2315af19ULL, 0x90b91ef9ef507434ULL, 0x5977d28d074a1be1ULL, + 0x311360fce51d56b9ULL, 0xc093a92d5a1f2f91ULL, 0x1a19a25bb6dc5416ULL, + 0xeb996b8a09de2d3eULL, 0xfee3820f1ed7668aULL, 0xd7085ad5b7ad518cULL, + 0x7fff41890fe53345ULL, 0xec5948bd67dde602ULL, 0x2fd5f65dbaaa68e0ULL, + 0xa5754affe32648c2ULL, 0xf8ddac880d07396cULL, 0x6fa491468c548664ULL, + 0x0c7c5c1326bdbed1ULL, 0x4a33158f03930fb3ULL, 0x699abfc19f84d982ULL, + 0xe4fa2054a80b329cULL, 0x6707f9af438252faULL, 0x08a368e9cfd6d49eULL, + 0x47b1442c58fd25b8ULL, 0xbbb3dc5ebc91769bULL, 0x1665fe489061eac7ULL, + 0x33f27a811fa66310ULL, 0x93a609346838d547ULL, 0x30ed6d4c98cec263ULL, + 0x1dd9816cd8df9f2aULL, 0x94662a03063b1e7bULL, 0x83fdd9fbeb896066ULL, + 0x7b207573e68e590aULL, 0x5f49fc0a149a4407ULL, 0x343259b671a5a82cULL, + 0xfbc2bb458a6f981fULL, 0xc272b350a0a41a38ULL, 0x3aaf1fd8ada32354ULL, + 0x6cbb868b0b3c2717ULL, 0xa2b569c88d2583feULL, 0xf180c9d1bf027928ULL, + 0xaf37386bd64ba9f5ULL, 0x12bacab2790a8088ULL, 0x4c0d3b0810435055ULL, + 0xb2eeb9070e9436dfULL, 0xc5b29067cea7d104ULL, 0xdcb425f1ff132461ULL, + 0x4f122cc5972bf126ULL, 0xac282fa651230886ULL, 0xe7e537992f6393efULL, + 0xe61b3a2952b00735ULL, 0x709c0a57ae302ce7ULL, 0xe02514ae416058d3ULL, + 0xc44c9dd7b37445deULL, 0x5a68c5408022ba92ULL, 0x1c278cdca50c0bf0ULL, + 0x6e5a9cf6f18712beULL, 0x86dce0b17f319ef3ULL, 0x2d34ec2040115d49ULL, + 0x4bcd183f7e409b69ULL, 0x2815d56ad4a9a3dcULL, 0x24698979f2141d0dULL, + 0x0000000000000000ULL, 0x1ec696a15fb73e59ULL, 0xd86b110b16784e2eULL, + 0x8e7f8858b0e74a6dULL, 0x063e2e8713d05fe6ULL, 0xe2c40ed3bbdb6d7aULL, + 0xb1f1aeca89fc97acULL, 0xe1db191e3cb3cc09ULL, 0x6418ee62c4eaf389ULL, + 0xc6ad87aa49cf7077ULL, 0xd6f65765ca7ec556ULL, 0x9afb6c6dda3d9503ULL, + 0x7ce05644888d9236ULL, 0x8d609f95378feb1eULL, 0x23a9aa4e9c17d631ULL, + 0x6226c0e5d73aac6fULL, 0x56149953a69f0443ULL, 0xeeb852c09d66d3abULL, + 0x2b0ac2a753c102afULL, 0x07c023376e03cb3cULL, 0x2ccae1903dc2c993ULL, + 0xd3d76e2f5ec63bc3ULL, 0x9e2458973356ff4cULL, 0xa66a5d32644ee9b1ULL, + 0x0a427294356de137ULL, 0x783f62be61e6f879ULL, 0x1344c70204d91452ULL, + 0x5b96c8f0fdf12e48ULL, 0xa90916ecc59bf613ULL, 0xbe92e5142829880eULL, + 0x727d102a548b194eULL, 0x1be7afebcb0fc0ccULL, 0x3e702b2244c8491bULL, + 0xd5e940a84d166425ULL, 0x66f9f41f3e51c620ULL, 0xabe80c913f20c3baULL, + 0xf07ec461c2d1edf2ULL, 0xf361d3ac45b94c81ULL, 0x0521394a94b8fe95ULL, + 0xadd622162cf09c5cULL, 0xe97871f7f3651897ULL, 0xf4a1f09b2bba87bdULL, + 0x095d6559b2054044ULL, 0x0bbc7f2448be75edULL, 0x2af4cf172e129675ULL, + 0x157ae98517094bb4ULL, 0x9fda55274e856b96ULL, 0x914713499283e0eeULL, + 0xb952c623462a4332ULL, 0x74433ead475b46a8ULL, 0x8b5eb112245fb4f8ULL, + 0xa34b6478f0f61724ULL, 0x11a5dd7ffe6221fbULL, 0xc16da49d27ccbb4bULL, + 0x76a224d0bde07301ULL, 0x8aa0bca2598c2022ULL, 0x4df336b86d90c48fULL, + 0xea67663a740db9e4ULL, 0xef465f70e0b54771ULL, 0x39b008152acb8227ULL, + 0x7d1e5bf4f55e06ecULL, 0x105bd0cf83b1b521ULL, 0x775c2960c033e7dbULL, + 0x7e014c397236a79fULL, 0x811cc386113255cfULL, 0xeda7450d1a0e72d8ULL, + 0x5889df3d7a998f3bULL, 0x2e2bfbedc779fc3aULL, 0xce0eef438619a4e9ULL, + 0x372d4e7bf6cd095fULL, 0x04df34fae96b6a4fULL, 0xf923a13870d4adb6ULL, + 0xa1aa7e050a4d228dULL, 0xa8f71b5cb84862c9ULL, 0xb52e9a306097fde3ULL, + 0x0d8251a35b6e2a0bULL, 0x2257a7fee1c442ebULL, 0x73831d9a29588d94ULL, + 0x51d4ba64c89ccf7fULL, 0x502ab7d4b54f5ba5ULL, 0x97793dce8153bf08ULL, + 0xe5042de4d5d8a646ULL, 0x9687307efc802bd2ULL, 0xa05473b5779eb657ULL, + 0xb4d097801d446939ULL, 0xcff0e2f3fbca3033ULL, 0xc38cbee0dd778ee2ULL, + 0x464f499c252eb162ULL, 0xcad1dbb96f72cea6ULL, 0xba4dd1eec142e241ULL, + 0xb00fa37af42f0376ULL + }, { + 0xcce4cd3aa968b245ULL, 0x089d5484e80b7fafULL, 0x638246c1b3548304ULL, + 0xd2fe0ec8c2355492ULL, 0xa7fbdf7ff2374eeeULL, 0x4df1600c92337a16ULL, + 0x84e503ea523b12fbULL, 0x0790bbfd53ab0c4aULL, 0x198a780f38f6ea9dULL, + 0x2ab30c8f55ec48cbULL, 0xe0f7fed6b2c49db5ULL, 0xb6ecf3f422cadbdcULL, + 0x409c9a541358df11ULL, 0xd3ce8a56dfde3fe3ULL, 0xc3e9224312c8c1a0ULL, + 0x0d6dfa58816ba507ULL, 0xddf3e1b179952777ULL, 0x04c02a42748bb1d9ULL, + 0x94c2abff9f2decb8ULL, 0x4f91752da8f8acf4ULL, 0x78682befb169bf7bULL, + 0xe1c77a48af2ff6c4ULL, 0x0c5d7ec69c80ce76ULL, 0x4cc1e4928fd81167ULL, + 0xfeed3d24d9997b62ULL, 0x518bb6dfc3a54a23ULL, 0x6dbf2d26151f9b90ULL, + 0xb5bc624b05ea664fULL, 0xe86aaa525acfe21aULL, 0x4801ced0fb53a0beULL, + 0xc91463e6c00868edULL, 0x1027a815cd16fe43ULL, 0xf67069a0319204cdULL, + 0xb04ccc976c8abce7ULL, 0xc0b9b3fc35e87c33ULL, 0xf380c77c58f2de65ULL, + 0x50bb3241de4e2152ULL, 0xdf93f490435ef195ULL, 0xf1e0d25d62390887ULL, + 0xaf668bfb1a3c3141ULL, 0xbc11b251f00a7291ULL, 0x73a5eed47e427d47ULL, + 0x25bee3f6ee4c3b2eULL, 0x43cc0beb34786282ULL, 0xc824e778dde3039cULL, + 0xf97d86d98a327728ULL, 0xf2b043e24519b514ULL, 0xe297ebf7880f4b57ULL, + 0x3a94a49a98fab688ULL, 0x868516cb68f0c419ULL, 0xeffa11af0964ee50ULL, + 0xa4ab4ec0d517f37dULL, 0xa9c6b498547c567aULL, 0x8e18424f80fbbbb6ULL, + 0x0bcdc53bcf2bc23cULL, 0x137739aaea3643d0ULL, 0x2c1333ec1bac2ff0ULL, + 0x8d48d3f0a7db0625ULL, 0x1e1ac3f26b5de6d7ULL, 0xf520f81f16b2b95eULL, + 0x9f0f6ec450062e84ULL, 0x0130849e1deb6b71ULL, 0xd45e31ab8c7533a9ULL, + 0x652279a2fd14e43fULL, 0x3209f01e70f1c927ULL, 0xbe71a770cac1a473ULL, + 0x0e3d6be7a64b1894ULL, 0x7ec8148cff29d840ULL, 0xcb7476c7fac3be0fULL, + 0x72956a4a63a91636ULL, 0x37f95ec21991138fULL, 0x9e3fea5a4ded45f5ULL, + 0x7b38ba50964902e8ULL, 0x222e580bbde73764ULL, 0x61e253e0899f55e6ULL, + 0xfc8d2805e352ad80ULL, 0x35994be3235ac56dULL, 0x09add01af5e014deULL, + 0x5e8659a6780539c6ULL, 0xb17c48097161d796ULL, 0x026015213acbd6e2ULL, + 0xd1ae9f77e515e901ULL, 0xb7dc776a3f21b0adULL, 0xaba6a1b96eb78098ULL, + 0x9bcf4486248d9f5dULL, 0x582666c536455efdULL, 0xfdbdac9bfeb9c6f1ULL, + 0xc47999be4163cdeaULL, 0x765540081722a7efULL, 0x3e548ed8ec710751ULL, + 0x3d041f67cb51bac2ULL, 0x7958af71ac82d40aULL, 0x36c9da5c047a78feULL, + 0xed9a048e33af38b2ULL, 0x26ee7249c96c86bdULL, 0x900281bdeba65d61ULL, + 0x11172c8bd0fd9532ULL, 0xea0abf73600434f8ULL, 0x42fc8f75299309f3ULL, + 0x34a9cf7d3eb1ae1cULL, 0x2b838811480723baULL, 0x5ce64c8742ceef24ULL, + 0x1adae9b01fd6570eULL, 0x3c349bf9d6bad1b3ULL, 0x82453c891c7b75c0ULL, + 0x97923a40b80d512bULL, 0x4a61dbf1c198765cULL, 0xb48ce6d518010d3eULL, + 0xcfb45c858e480fd6ULL, 0xd933cbf30d1e96aeULL, 0xd70ea014ab558e3aULL, + 0xc189376228031742ULL, 0x9262949cd16d8b83ULL, 0xeb3a3bed7def5f89ULL, + 0x49314a4ee6b8cbcfULL, 0xdcc3652f647e4c06ULL, 0xda635a4c2a3e2b3dULL, + 0x470c21a940f3d35bULL, 0x315961a157d174b4ULL, 0x6672e81dda3459acULL, + 0x5b76f77a1165e36eULL, 0x445cb01667d36ec8ULL, 0xc5491d205c88a69bULL, + 0x456c34887a3805b9ULL, 0xffddb9bac4721013ULL, 0x99af51a71e4649bfULL, + 0xa15be01cbc7729d5ULL, 0x52db2760e485f7b0ULL, 0x8c78576eba306d54ULL, + 0xae560f6507d75a30ULL, 0x95f22f6182c687c9ULL, 0x71c5fbf54489aba5ULL, + 0xca44f259e728d57eULL, 0x88b87d2ccebbdc8dULL, 0xbab18d32be4a15aaULL, + 0x8be8ec93e99b611eULL, 0x17b713e89ebdf209ULL, 0xb31c5d284baa0174ULL, + 0xeeca9531148f8521ULL, 0xb8d198138481c348ULL, 0x8988f9b2d350b7fcULL, + 0xb9e11c8d996aa839ULL, 0x5a4673e40c8e881fULL, 0x1687977683569978ULL, + 0xbf4123eed72acf02ULL, 0x4ea1f1b3b513c785ULL, 0xe767452be16f91ffULL, + 0x7505d1b730021a7cULL, 0xa59bca5ec8fc980cULL, 0xad069eda20f7e7a3ULL, + 0x38f4b1bba231606aULL, 0x60d2d77e94743e97ULL, 0x9affc0183966f42cULL, + 0x248e6768f3a7505fULL, 0xcdd449a4b483d934ULL, 0x87b59255751baf68ULL, + 0x1bea6d2e023d3c7fULL, 0x6b1f12455b5ffcabULL, 0x743555292de9710dULL, + 0xd8034f6d10f5fddfULL, 0xc6198c9f7ba81b08ULL, 0xbb8109aca3a17edbULL, + 0xfa2d1766ad12cabbULL, 0xc729080166437079ULL, 0x9c5fff7b77269317ULL, + 0x0000000000000000ULL, 0x15d706c9a47624ebULL, 0x6fdf38072fd44d72ULL, + 0x5fb6dd3865ee52b7ULL, 0xa33bf53d86bcff37ULL, 0xe657c1b5fc84fa8eULL, + 0xaa962527735cebe9ULL, 0x39c43525bfda0b1bULL, 0x204e4d2a872ce186ULL, + 0x7a083ece8ba26999ULL, 0x554b9c9db72efbfaULL, 0xb22cd9b656416a05ULL, + 0x96a2bedea5e63a5aULL, 0x802529a826b0a322ULL, 0x8115ad363b5bc853ULL, + 0x8375b81701901eb1ULL, 0x3069e53f4a3a1fc5ULL, 0xbd2136cfede119e0ULL, + 0x18bafc91251d81ecULL, 0x1d4a524d4c7d5b44ULL, 0x05f0aedc6960daa8ULL, + 0x29e39d3072ccf558ULL, 0x70f57f6b5962c0d4ULL, 0x989fd53903ad22ceULL, + 0xf84d024797d91c59ULL, 0x547b1803aac5908bULL, 0xf0d056c37fd263f6ULL, + 0xd56eb535919e58d8ULL, 0x1c7ad6d351963035ULL, 0x2e7326cd2167f912ULL, + 0xac361a443d1c8cd2ULL, 0x697f076461942a49ULL, 0x4b515f6fdc731d2dULL, + 0x8ad8680df4700a6fULL, 0x41ac1eca0eb3b460ULL, 0x7d988533d80965d3ULL, + 0xa8f6300649973d0bULL, 0x7765c4960ac9cc9eULL, 0x7ca801adc5e20ea2ULL, + 0xdea3700e5eb59ae4ULL, 0xa06b6482a19c42a4ULL, 0x6a2f96db46b497daULL, + 0x27def6d7d487edccULL, 0x463ca5375d18b82aULL, 0xa6cb5be1efdc259fULL, + 0x53eba3fef96e9cc1ULL, 0xce84d81b93a364a7ULL, 0xf4107c810b59d22fULL, + 0x333974806d1aa256ULL, 0x0f0def79bba073e5ULL, 0x231edc95a00c5c15ULL, + 0xe437d494c64f2c6cULL, 0x91320523f64d3610ULL, 0x67426c83c7df32ddULL, + 0x6eefbc99323f2603ULL, 0x9d6f7be56acdf866ULL, 0x5916e25b2bae358cULL, + 0x7ff89012e2c2b331ULL, 0x035091bf2720bd93ULL, 0x561b0d22900e4669ULL, + 0x28d319ae6f279e29ULL, 0x2f43a2533c8c9263ULL, 0xd09e1be9f8fe8270ULL, + 0xf740ed3e2c796fbcULL, 0xdb53ded237d5404cULL, 0x62b2c25faebfe875ULL, + 0x0afd41a5d2c0a94dULL, 0x6412fd3ce0ff8f4eULL, 0xe3a76f6995e42026ULL, + 0x6c8fa9b808f4f0e1ULL, 0xc2d9a6dd0f23aad1ULL, 0x8f28c6d19d10d0c7ULL, + 0x85d587744fd0798aULL, 0xa20b71a39b579446ULL, 0x684f83fa7c7f4138ULL, + 0xe507500adba4471dULL, 0x3f640a46f19a6c20ULL, 0x1247bd34f7dd28a1ULL, + 0x2d23b77206474481ULL, 0x93521002cc86e0f2ULL, 0x572b89bc8de52d18ULL, + 0xfb1d93f8b0f9a1caULL, 0xe95a2ecc4724896bULL, 0x3ba420048511ddf9ULL, + 0xd63e248ab6bee54bULL, 0x5dd6c8195f258455ULL, 0x06a03f634e40673bULL, + 0x1f2a476c76b68da6ULL, 0x217ec9b49ac78af7ULL, 0xecaa80102e4453c3ULL, + 0x14e78257b99d4f9aULL + }, { + 0x20329b2cc87bba05ULL, 0x4f5eb6f86546a531ULL, 0xd4f44775f751b6b1ULL, + 0x8266a47b850dfa8bULL, 0xbb986aa15a6ca985ULL, 0xc979eb08f9ae0f99ULL, + 0x2da6f447a2375ea1ULL, 0x1e74275dcd7d8576ULL, 0xbc20180a800bc5f8ULL, + 0xb4a2f701b2dc65beULL, 0xe726946f981b6d66ULL, 0x48e6c453bf21c94cULL, + 0x42cad9930f0a4195ULL, 0xefa47b64aacccd20ULL, 0x71180a8960409a42ULL, + 0x8bb3329bf6a44e0cULL, 0xd34c35de2d36daccULL, 0xa92f5b7cbc23dc96ULL, + 0xb31a85aa68bb09c3ULL, 0x13e04836a73161d2ULL, 0xb24dfc4129c51d02ULL, + 0x8ae44b70b7da5acdULL, 0xe671ed84d96579a7ULL, 0xa4bb3417d66f3832ULL, + 0x4572ab38d56d2de8ULL, 0xb1b47761ea47215cULL, 0xe81c09cf70aba15dULL, + 0xffbdb872ce7f90acULL, 0xa8782297fd5dc857ULL, 0x0d946f6b6a4ce4a4ULL, + 0xe4df1f4f5b995138ULL, 0x9ebc71edca8c5762ULL, 0x0a2c1dc0b02b88d9ULL, + 0x3b503c115d9d7b91ULL, 0xc64376a8111ec3a2ULL, 0xcec199a323c963e4ULL, + 0xdc76a87ec58616f7ULL, 0x09d596e073a9b487ULL, 0x14583a9d7d560dafULL, + 0xf4c6dc593f2a0cb4ULL, 0xdd21d19584f80236ULL, 0x4a4836983ddde1d3ULL, + 0xe58866a41ae745f9ULL, 0xf591a5b27e541875ULL, 0x891dc05074586693ULL, + 0x5b068c651810a89eULL, 0xa30346bc0c08544fULL, 0x3dbf3751c684032dULL, + 0x2a1e86ec785032dcULL, 0xf73f5779fca830eaULL, 0xb60c05ca30204d21ULL, + 0x0cc316802b32f065ULL, 0x8770241bdd96be69ULL, 0xb861e18199ee95dbULL, + 0xf805cad91418fcd1ULL, 0x29e70dccbbd20e82ULL, 0xc7140f435060d763ULL, + 0x0f3a9da0e8b0cc3bULL, 0xa2543f574d76408eULL, 0xbd7761e1c175d139ULL, + 0x4b1f4f737ca3f512ULL, 0x6dc2df1f2fc137abULL, 0xf1d05c3967b14856ULL, + 0xa742bf3715ed046cULL, 0x654030141d1697edULL, 0x07b872abda676c7dULL, + 0x3ce84eba87fa17ecULL, 0xc1fb0403cb79afdfULL, 0x3e46bc7105063f73ULL, + 0x278ae987121cd678ULL, 0xa1adb4778ef47cd0ULL, 0x26dd906c5362c2b9ULL, + 0x05168060589b44e2ULL, 0xfbfc41f9d79ac08fULL, 0x0e6de44ba9ced8faULL, + 0x9feb08068bf243a3ULL, 0x7b341749d06b129bULL, 0x229c69e74a87929aULL, + 0xe09ee6c4427c011bULL, 0x5692e30e725c4c3aULL, 0xda99a33e5e9f6e4bULL, + 0x353dd85af453a36bULL, 0x25241b4c90e0fee7ULL, 0x5de987258309d022ULL, + 0xe230140fc0802984ULL, 0x93281e86a0c0b3c6ULL, 0xf229d719a4337408ULL, + 0x6f6c2dd4ad3d1f34ULL, 0x8ea5b2fbae3f0aeeULL, 0x8331dd90c473ee4aULL, + 0x346aa1b1b52db7aaULL, 0xdf8f235e06042aa9ULL, 0xcc6f6b68a1354b7bULL, + 0x6c95a6f46ebf236aULL, 0x52d31a856bb91c19ULL, 0x1a35ded6d498d555ULL, + 0xf37eaef2e54d60c9ULL, 0x72e181a9a3c2a61cULL, 0x98537aad51952fdeULL, + 0x16f6c856ffaa2530ULL, 0xd960281e9d1d5215ULL, 0x3a0745fa1ce36f50ULL, + 0x0b7b642bf1559c18ULL, 0x59a87eae9aec8001ULL, 0x5e100c05408bec7cULL, + 0x0441f98b19e55023ULL, 0xd70dcc5534d38aefULL, 0x927f676de1bea707ULL, + 0x9769e70db925e3e5ULL, 0x7a636ea29115065aULL, 0x468b201816ef11b6ULL, + 0xab81a9b73edff409ULL, 0xc0ac7de88a07bb1eULL, 0x1f235eb68c0391b7ULL, + 0x6056b074458dd30fULL, 0xbe8eeac102f7ed67ULL, 0xcd381283e04b5fbaULL, + 0x5cbefecec277c4e3ULL, 0xd21b4c356c48ce0dULL, 0x1019c31664b35d8cULL, + 0x247362a7d19eea26ULL, 0xebe582efb3299d03ULL, 0x02aef2cb82fc289fULL, + 0x86275df09ce8aaa8ULL, 0x28b07427faac1a43ULL, 0x38a9b7319e1f47cfULL, + 0xc82e92e3b8d01b58ULL, 0x06ef0b409b1978bcULL, 0x62f842bfc771fb90ULL, + 0x9904034610eb3b1fULL, 0xded85ab5477a3e68ULL, 0x90d195a663428f98ULL, + 0x5384636e2ac708d8ULL, 0xcbd719c37b522706ULL, 0xae9729d76644b0ebULL, + 0x7c8c65e20a0c7ee6ULL, 0x80c856b007f1d214ULL, 0x8c0b40302cc32271ULL, + 0xdbcedad51fe17a8aULL, 0x740e8ae938dbdea0ULL, 0xa615c6dc549310adULL, + 0x19cc55f6171ae90bULL, 0x49b1bdb8fe5fdd8dULL, 0xed0a89af2830e5bfULL, + 0x6a7aadb4f5a65bd6ULL, 0x7e22972988f05679ULL, 0xf952b3325566e810ULL, + 0x39fecedadf61530eULL, 0x6101c99f04f3c7ceULL, 0x2e5f7f6761b562ffULL, + 0xf08725d226cf5c97ULL, 0x63af3b54860fef51ULL, 0x8ff2cb10ef411e2fULL, + 0x884ab9bb35267252ULL, 0x4df04433e7ba8daeULL, 0x9afd8866d3690741ULL, + 0x66b9bb34de94abb3ULL, 0x9baaf18d92171380ULL, 0x543c11c5f0a064a5ULL, + 0x17a1b1bdbed431f1ULL, 0xb5f58eeaf3a2717fULL, 0xc355f6c849858740ULL, + 0xec5df044694ef17eULL, 0xd83751f5dc6346d4ULL, 0xfc4433520dfdacf2ULL, + 0x0000000000000000ULL, 0x5a51f58e596ebc5fULL, 0x3285aaf12e34cf16ULL, + 0x8d5c39db6dbd36b0ULL, 0x12b731dde64f7513ULL, 0x94906c2d7aa7dfbbULL, + 0x302b583aacc8e789ULL, 0x9d45facd090e6b3cULL, 0x2165e2c78905aec4ULL, + 0x68d45f7f775a7349ULL, 0x189b2c1d5664fdcaULL, 0xe1c99f2f030215daULL, + 0x6983269436246788ULL, 0x8489af3b1e148237ULL, 0xe94b702431d5b59cULL, + 0x33d2d31a6f4adbd7ULL, 0xbfd9932a4389f9a6ULL, 0xb0e30e8aab39359dULL, + 0xd1e2c715afcaf253ULL, 0x150f43763c28196eULL, 0xc4ed846393e2eb3dULL, + 0x03f98b20c3823c5eULL, 0xfd134ab94c83b833ULL, 0x556b682eb1de7064ULL, + 0x36c4537a37d19f35ULL, 0x7559f30279a5ca61ULL, 0x799ae58252973a04ULL, + 0x9c12832648707ffdULL, 0x78cd9c6913e92ec5ULL, 0x1d8dac7d0effb928ULL, + 0x439da0784e745554ULL, 0x413352b3cc887dcbULL, 0xbacf134a1b12bd44ULL, + 0x114ebafd25cd494dULL, 0x2f08068c20cb763eULL, 0x76a07822ba27f63fULL, + 0xeab2fb04f25789c2ULL, 0xe3676de481fe3d45ULL, 0x1b62a73d95e6c194ULL, + 0x641749ff5c68832cULL, 0xa5ec4dfc97112cf3ULL, 0xf6682e92bdd6242bULL, + 0x3f11c59a44782bb2ULL, 0x317c21d1edb6f348ULL, 0xd65ab5be75ad9e2eULL, + 0x6b2dd45fb4d84f17ULL, 0xfaab381296e4d44eULL, 0xd0b5befeeeb4e692ULL, + 0x0882ef0b32d7a046ULL, 0x512a91a5a83b2047ULL, 0x963e9ee6f85bf724ULL, + 0x4e09cf132438b1f0ULL, 0x77f701c9fb59e2feULL, 0x7ddb1c094b726a27ULL, + 0x5f4775ee01f5f8bdULL, 0x9186ec4d223c9b59ULL, 0xfeeac1998f01846dULL, + 0xac39db1ce4b89874ULL, 0xb75b7c21715e59e0ULL, 0xafc0503c273aa42aULL, + 0x6e3b543fec430bf5ULL, 0x704f7362213e8e83ULL, 0x58ff0745db9294c0ULL, + 0x67eec2df9feabf72ULL, 0xa0facd9ccf8a6811ULL, 0xb936986ad890811aULL, + 0x95c715c63bd9cb7aULL, 0xca8060283a2c33c7ULL, 0x507de84ee9453486ULL, + 0x85ded6d05f6a96f6ULL, 0x1cdad5964f81ade9ULL, 0xd5a33e9eb62fa270ULL, + 0x40642b588df6690aULL, 0x7f75eec2c98e42b8ULL, 0x2cf18dace3494a60ULL, + 0x23cb100c0bf9865bULL, 0xeef3028febb2d9e1ULL, 0x4425d2d394133929ULL, + 0xaad6d05c7fa1e0c8ULL, 0xad6ea2f7a5c68cb5ULL, 0xc2028f2308fb9381ULL, + 0x819f2f5b468fc6d5ULL, 0xc5bafd88d29cfffcULL, 0x47dc59f357910577ULL, + 0x2b49ff07392e261dULL, 0x57c59ae5332258fbULL, 0x73b6f842e2bcb2ddULL, + 0xcf96e04862b77725ULL, 0x4ca73dd8a6c4996fULL, 0x015779eb417e14c1ULL, + 0x37932a9176af8bf4ULL + }, { + 0x190a2c9b249df23eULL, 0x2f62f8b62263e1e9ULL, 0x7a7f754740993655ULL, + 0x330b7ba4d5564d9fULL, 0x4c17a16a46672582ULL, 0xb22f08eb7d05f5b8ULL, + 0x535f47f40bc148ccULL, 0x3aec5d27d4883037ULL, 0x10ed0a1825438f96ULL, + 0x516101f72c233d17ULL, 0x13cc6f949fd04eaeULL, 0x739853c441474bfdULL, + 0x653793d90d3f5b1bULL, 0x5240647b96b0fc2fULL, 0x0c84890ad27623e0ULL, + 0xd7189b32703aaea3ULL, 0x2685de3523bd9c41ULL, 0x99317c5b11bffefaULL, + 0x0d9baa854f079703ULL, 0x70b93648fbd48ac5ULL, 0xa80441fce30bc6beULL, + 0x7287704bdc36ff1eULL, 0xb65384ed33dc1f13ULL, 0xd36417343ee34408ULL, + 0x39cd38ab6e1bf10fULL, 0x5ab861770a1f3564ULL, 0x0ebacf09f594563bULL, + 0xd04572b884708530ULL, 0x3cae9722bdb3af47ULL, 0x4a556b6f2f5cbaf2ULL, + 0xe1704f1f76c4bd74ULL, 0x5ec4ed7144c6dfcfULL, 0x16afc01d4c7810e6ULL, + 0x283f113cd629ca7aULL, 0xaf59a8761741ed2dULL, 0xeed5a3991e215facULL, + 0x3bf37ea849f984d4ULL, 0xe413e096a56ce33cULL, 0x2c439d3a98f020d1ULL, + 0x637559dc6404c46bULL, 0x9e6c95d1e5f5d569ULL, 0x24bb9836045fe99aULL, + 0x44efa466dac8ecc9ULL, 0xc6eab2a5c80895d6ULL, 0x803b50c035220cc4ULL, + 0x0321658cba93c138ULL, 0x8f9ebc465dc7ee1cULL, 0xd15a5137190131d3ULL, + 0x0fa5ec8668e5e2d8ULL, 0x91c979578d1037b1ULL, 0x0642ca05693b9f70ULL, + 0xefca80168350eb4fULL, 0x38d21b24f36a45ecULL, 0xbeab81e1af73d658ULL, + 0x8cbfd9cae7542f24ULL, 0xfd19cc0d81f11102ULL, 0x0ac6430fbb4dbc90ULL, + 0x1d76a09d6a441895ULL, 0x2a01573ff1cbbfa1ULL, 0xb572e161894fde2bULL, + 0x8124734fa853b827ULL, 0x614b1fdf43e6b1b0ULL, 0x68ac395c4238cc18ULL, + 0x21d837bfd7f7b7d2ULL, 0x20c714304a860331ULL, 0x5cfaab726324aa14ULL, + 0x74c5ba4eb50d606eULL, 0xf3a3030474654739ULL, 0x23e671bcf015c209ULL, + 0x45f087e947b9582aULL, 0xd8bd77b418df4c7bULL, 0xe06f6c90ebb50997ULL, + 0x0bd96080263c0873ULL, 0x7e03f9410e40dcfeULL, 0xb8e94be4c6484928ULL, + 0xfb5b0608e8ca8e72ULL, 0x1a2b49179e0e3306ULL, 0x4e29e76961855059ULL, + 0x4f36c4e6fcf4e4baULL, 0x49740ee395cf7bcaULL, 0xc2963ea386d17f7dULL, + 0x90d65ad810618352ULL, 0x12d34c1b02a1fa4dULL, 0xfa44258775bb3a91ULL, + 0x18150f14b9ec46ddULL, 0x1491861e6b9a653dULL, 0x9a1019d7ab2c3fc2ULL, + 0x3668d42d06fe13d7ULL, 0xdcc1fbb25606a6d0ULL, 0x969490dd795a1c22ULL, + 0x3549b1a1bc6dd2efULL, 0xc94f5e23a0ed770eULL, 0xb9f6686b5b39fdcbULL, + 0xc4d4f4a6efeae00dULL, 0xe732851a1fff2204ULL, 0x94aad6de5eb869f9ULL, + 0x3f8ff2ae07206e7fULL, 0xfe38a9813b62d03aULL, 0xa7a1ad7a8bee2466ULL, + 0x7b6056c8dde882b6ULL, 0x302a1e286fc58ca7ULL, 0x8da0fa457a259bc7ULL, + 0xb3302b64e074415bULL, 0x5402ae7eff8b635fULL, 0x08f8050c9cafc94bULL, + 0xae468bf98a3059ceULL, 0x88c355cca98dc58fULL, 0xb10e6d67c7963480ULL, + 0xbad70de7e1aa3cf3ULL, 0xbfb4a26e320262bbULL, 0xcb711820870f02d5ULL, + 0xce12b7a954a75c9dULL, 0x563ce87dd8691684ULL, 0x9f73b65e7884618aULL, + 0x2b1e74b06cba0b42ULL, 0x47cec1ea605b2df1ULL, 0x1c698312f735ac76ULL, + 0x5fdbcefed9b76b2cULL, 0x831a354c8fb1cdfcULL, 0x820516c312c0791fULL, + 0xb74ca762aeadabf0ULL, 0xfc06ef821c80a5e1ULL, 0x5723cbf24518a267ULL, + 0x9d4df05d5f661451ULL, 0x588627742dfd40bfULL, 0xda8331b73f3d39a0ULL, + 0x17b0e392d109a405ULL, 0xf965400bcf28fba9ULL, 0x7c3dbf4229a2a925ULL, + 0x023e460327e275dbULL, 0x6cd0b55a0ce126b3ULL, 0xe62da695828e96e7ULL, + 0x42ad6e63b3f373b9ULL, 0xe50cc319381d57dfULL, 0xc5cbd729729b54eeULL, + 0x46d1e265fd2a9912ULL, 0x6428b056904eeff8ULL, 0x8be23040131e04b7ULL, + 0x6709d5da2add2ec0ULL, 0x075de98af44a2b93ULL, 0x8447dcc67bfbe66fULL, + 0x6616f655b7ac9a23ULL, 0xd607b8bded4b1a40ULL, 0x0563af89d3a85e48ULL, + 0x3db1b4ad20c21ba4ULL, 0x11f22997b8323b75ULL, 0x292032b34b587e99ULL, + 0x7f1cdace9331681dULL, 0x8e819fc9c0b65affULL, 0xa1e3677fe2d5bb16ULL, + 0xcd33d225ee349da5ULL, 0xd9a2543b85aef898ULL, 0x795e10cbfa0af76dULL, + 0x25a4bbb9992e5d79ULL, 0x78413344677b438eULL, 0xf0826688cef68601ULL, + 0xd27b34bba392f0ebULL, 0x551d8df162fad7bcULL, 0x1e57c511d0d7d9adULL, + 0xdeffbdb171e4d30bULL, 0xf4feea8e802f6caaULL, 0xa480c8f6317de55eULL, + 0xa0fc44f07fa40ff5ULL, 0x95b5f551c3c9dd1aULL, 0x22f952336d6476eaULL, + 0x0000000000000000ULL, 0xa6be8ef5169f9085ULL, 0xcc2cf1aa73452946ULL, + 0x2e7ddb39bf12550aULL, 0xd526dd3157d8db78ULL, 0x486b2d6c08becf29ULL, + 0x9b0f3a58365d8b21ULL, 0xac78cdfaadd22c15ULL, 0xbc95c7e28891a383ULL, + 0x6a927f5f65dab9c3ULL, 0xc3891d2c1ba0cb9eULL, 0xeaa92f9f50f8b507ULL, + 0xcf0d9426c9d6e87eULL, 0xca6e3baf1a7eb636ULL, 0xab25247059980786ULL, + 0x69b31ad3df4978fbULL, 0xe2512a93cc577c4cULL, 0xff278a0ea61364d9ULL, + 0x71a615c766a53e26ULL, 0x89dc764334fc716cULL, 0xf87a638452594f4aULL, + 0xf2bc208be914f3daULL, 0x8766b94ac1682757ULL, 0xbbc82e687cdb8810ULL, + 0x626a7a53f9757088ULL, 0xa2c202f358467a2eULL, 0x4d0882e5db169161ULL, + 0x09e7268301de7da8ULL, 0xe897699c771ac0dcULL, 0xc8507dac3d9cc3edULL, + 0xc0a878a0a1330aa6ULL, 0x978bb352e42ba8c1ULL, 0xe9884a13ea6b743fULL, + 0x279afdbabecc28a2ULL, 0x047c8c064ed9eaabULL, 0x507e2278b15289f4ULL, + 0x599904fbb08cf45cULL, 0xbd8ae46d15e01760ULL, 0x31353da7f2b43844ULL, + 0x8558ff49e68a528cULL, 0x76fbfc4d92ef15b5ULL, 0x3456922e211c660cULL, + 0x86799ac55c1993b4ULL, 0x3e90d1219a51da9cULL, 0x2d5cbeb505819432ULL, + 0x982e5fd48cce4a19ULL, 0xdb9c1238a24c8d43ULL, 0xd439febecaa96f9bULL, + 0x418c0bef0960b281ULL, 0x158ea591f6ebd1deULL, 0x1f48e69e4da66d4eULL, + 0x8afd13cf8e6fb054ULL, 0xf5e1c9011d5ed849ULL, 0xe34e091c5126c8afULL, + 0xad67ee7530a398f6ULL, 0x43b24dec2e82c75aULL, 0x75da99c1287cd48dULL, + 0x92e81cdb3783f689ULL, 0xa3dd217cc537cecdULL, 0x60543c50de970553ULL, + 0x93f73f54aaf2426aULL, 0xa91b62737e7a725dULL, 0xf19d4507538732e2ULL, + 0x77e4dfc20f9ea156ULL, 0x7d229ccdb4d31dc6ULL, 0x1b346a98037f87e5ULL, + 0xedf4c615a4b29e94ULL, 0x4093286094110662ULL, 0xb0114ee85ae78063ULL, + 0x6ff1d0d6b672e78bULL, 0x6dcf96d591909250ULL, 0xdfe09e3eec9567e8ULL, + 0x3214582b4827f97cULL, 0xb46dc2ee143e6ac8ULL, 0xf6c0ac8da7cd1971ULL, + 0xebb60c10cd8901e4ULL, 0xf7df8f023abcad92ULL, 0x9c52d3d2c217a0b2ULL, + 0x6b8d5cd0f8ab0d20ULL, 0x3777f7a29b8fa734ULL, 0x011f238f9d71b4e3ULL, + 0xc1b75b2f3c42be45ULL, 0x5de588fdfe551ef7ULL, 0x6eeef3592b035368ULL, + 0xaa3a07ffc4e9b365ULL, 0xecebe59a39c32a77ULL, 0x5ba742f8976e8187ULL, + 0x4b4a48e0b22d0e11ULL, 0xddded83dcb771233ULL, 0xa59feb79ac0c51bdULL, + 0xc7f5912a55792135ULL + }, { + 0x6d6ae04668a9b08aULL, 0x3ab3f04b0be8c743ULL, 0xe51e166b54b3c908ULL, + 0xbe90a9eb35c2f139ULL, 0xb2c7066637f2bec1ULL, 0xaa6945613392202cULL, + 0x9a28c36f3b5201ebULL, 0xddce5a93ab536994ULL, 0x0e34133ef6382827ULL, + 0x52a02ba1ec55048bULL, 0xa2f88f97c4b2a177ULL, 0x8640e513ca2251a5ULL, + 0xcdf1d36258137622ULL, 0xfe6cb708dedf8ddbULL, 0x8a174a9ec8121e5dULL, + 0x679896036b81560eULL, 0x59ed033395795feeULL, 0x1dd778ab8b74edafULL, + 0xee533ef92d9f926dULL, 0x2a8c79baf8a8d8f5ULL, 0x6bcf398e69b119f6ULL, + 0xe20491742fafdd95ULL, 0x276488e0809c2aecULL, 0xea955b82d88f5cceULL, + 0x7102c63a99d9e0c4ULL, 0xf9763017a5c39946ULL, 0x429fa2501f151b3dULL, + 0x4659c72bea05d59eULL, 0x984b7fdccf5a6634ULL, 0xf742232953fbb161ULL, + 0x3041860e08c021c7ULL, 0x747bfd9616cd9386ULL, 0x4bb1367192312787ULL, + 0x1b72a1638a6c44d3ULL, 0x4a0e68a6e8359a66ULL, 0x169a5039f258b6caULL, + 0xb98a2ef44edee5a4ULL, 0xd9083fe85e43a737ULL, 0x967f6ce239624e13ULL, + 0x8874f62d3c1a7982ULL, 0x3c1629830af06e3fULL, 0x9165ebfd427e5a8eULL, + 0xb5dd81794ceeaa5cULL, 0x0de8f15a7834f219ULL, 0x70bd98ede3dd5d25ULL, + 0xaccc9ca9328a8950ULL, 0x56664eda1945ca28ULL, 0x221db34c0f8859aeULL, + 0x26dbd637fa98970dULL, 0x1acdffb4f068f932ULL, 0x4585254f64090fa0ULL, + 0x72de245e17d53afaULL, 0x1546b25d7c546cf4ULL, 0x207e0ffffb803e71ULL, + 0xfaaad2732bcf4378ULL, 0xb462dfae36ea17bdULL, 0xcf926fd1ac1b11fdULL, + 0xe0672dc7dba7ba4aULL, 0xd3fa49ad5d6b41b3ULL, 0x8ba81449b216a3bcULL, + 0x14f9ec8a0650d115ULL, 0x40fc1ee3eb1d7ce2ULL, 0x23a2ed9b758ce44fULL, + 0x782c521b14fddc7eULL, 0x1c68267cf170504eULL, 0xbcf31558c1ca96e6ULL, + 0xa781b43b4ba6d235ULL, 0xf6fd7dfe29ff0c80ULL, 0xb0a4bad5c3fad91eULL, + 0xd199f51ea963266cULL, 0x414340349119c103ULL, 0x5405f269ed4dadf7ULL, + 0xabd61bb649969dcdULL, 0x6813dbeae7bdc3c8ULL, 0x65fb2ab09f8931d1ULL, + 0xf1e7fae152e3181dULL, 0xc1a67cef5a2339daULL, 0x7a4feea8e0f5bba1ULL, + 0x1e0b9acf05783791ULL, 0x5b8ebf8061713831ULL, 0x80e53cdbcb3af8d9ULL, + 0x7e898bd315e57502ULL, 0xc6bcfbf0213f2d47ULL, 0x95a38e86b76e942dULL, + 0x092e94218d243cbaULL, 0x8339debf453622e7ULL, 0xb11be402b9fe64ffULL, + 0x57d9100d634177c9ULL, 0xcc4e8db52217cbc3ULL, 0x3b0cae9c71ec7aa2ULL, + 0xfb158ca451cbfe99ULL, 0x2b33276d82ac6514ULL, 0x01bf5ed77a04bde1ULL, + 0xc5601994af33f779ULL, 0x75c4a3416cc92e67ULL, 0xf3844652a6eb7fc2ULL, + 0x3487e375fdd0ef64ULL, 0x18ae430704609eedULL, 0x4d14efb993298efbULL, + 0x815a620cb13e4538ULL, 0x125c354207487869ULL, 0x9eeea614ce42cf48ULL, + 0xce2d3106d61fac1cULL, 0xbbe99247bad6827bULL, 0x071a871f7b1c149dULL, + 0x2e4a1cc10db81656ULL, 0x77a71ff298c149b8ULL, 0x06a5d9c80118a97cULL, + 0xad73c27e488e34b1ULL, 0x443a7b981e0db241ULL, 0xe3bbcfa355ab6074ULL, + 0x0af276450328e684ULL, 0x73617a896dd1871bULL, 0x58525de4ef7de20fULL, + 0xb7be3dcab8e6cd83ULL, 0x19111dd07e64230cULL, 0x842359a03e2a367aULL, + 0x103f89f1f3401fb6ULL, 0xdc710444d157d475ULL, 0xb835702334da5845ULL, + 0x4320fc876511a6dcULL, 0xd026abc9d3679b8dULL, 0x17250eee885c0b2bULL, + 0x90dab52a387ae76fULL, 0x31fed8d972c49c26ULL, 0x89cba8fa461ec463ULL, + 0x2ff5421677bcabb7ULL, 0x396f122f85e41d7dULL, 0xa09b332430bac6a8ULL, + 0xc888e8ced7070560ULL, 0xaeaf201ac682ee8fULL, 0x1180d7268944a257ULL, + 0xf058a43628e7a5fcULL, 0xbd4c4b8fbbce2b07ULL, 0xa1246df34abe7b49ULL, + 0x7d5569b79be9af3cULL, 0xa9b5a705bd9efa12ULL, 0xdb6b835baa4bc0e8ULL, + 0x05793bac8f147342ULL, 0x21c1512881848390ULL, 0xfdb0556c50d357e5ULL, + 0x613d4fcb6a99ff72ULL, 0x03dce2648e0cda3eULL, 0xe949b9e6568386f0ULL, + 0xfc0f0bbb2ad7ea04ULL, 0x6a70675913b5a417ULL, 0x7f36d5046fe1c8e3ULL, + 0x0c57af8d02304ff8ULL, 0x32223abdfcc84618ULL, 0x0891caf6f720815bULL, + 0xa63eeaec31a26fd4ULL, 0x2507345374944d33ULL, 0x49d28ac266394058ULL, + 0xf5219f9aa7f3d6beULL, 0x2d96fea583b4cc68ULL, 0x5a31e1571b7585d0ULL, + 0x8ed12fe53d02d0feULL, 0xdfade6205f5b0e4bULL, 0x4cabb16ee92d331aULL, + 0x04c6657bf510cea3ULL, 0xd73c2cd6a87b8f10ULL, 0xe1d87310a1a307abULL, + 0x6cd5be9112ad0d6bULL, 0x97c032354366f3f2ULL, 0xd4e0ceb22677552eULL, + 0x0000000000000000ULL, 0x29509bde76a402cbULL, 0xc27a9e8bd42fe3e4ULL, + 0x5ef7842cee654b73ULL, 0xaf107ecdbc86536eULL, 0x3fcacbe784fcb401ULL, + 0xd55f90655c73e8cfULL, 0xe6c2f40fdabf1336ULL, 0xe8f6e7312c873b11ULL, + 0xeb2a0555a28be12fULL, 0xe4a148bc2eb774e9ULL, 0x9b979db84156bc0aULL, + 0x6eb60222e6a56ab4ULL, 0x87ffbbc4b026ec44ULL, 0xc703a5275b3b90a6ULL, + 0x47e699fc9001687fULL, 0x9c8d1aa73a4aa897ULL, 0x7cea3760e1ed12ddULL, + 0x4ec80ddd1d2554c5ULL, 0x13e36b957d4cc588ULL, 0x5d2b66486069914dULL, + 0x92b90999cc7280b0ULL, 0x517cc9c56259deb5ULL, 0xc937b619ad03b881ULL, + 0xec30824ad997f5b2ULL, 0xa45d565fc5aa080bULL, 0xd6837201d27f32f1ULL, + 0x635ef3789e9198adULL, 0x531f75769651b96aULL, 0x4f77530a6721e924ULL, + 0x486dd4151c3dfdb9ULL, 0x5f48dafb9461f692ULL, 0x375b011173dc355aULL, + 0x3da9775470f4d3deULL, 0x8d0dcd81b30e0ac0ULL, 0x36e45fc609d888bbULL, + 0x55baacbe97491016ULL, 0x8cb29356c90ab721ULL, 0x76184125e2c5f459ULL, + 0x99f4210bb55edbd5ULL, 0x6f095cf59ca1d755ULL, 0x9f51f8c3b44672a9ULL, + 0x3538bda287d45285ULL, 0x50c39712185d6354ULL, 0xf23b1885dcefc223ULL, + 0x79930ccc6ef9619fULL, 0xed8fdc9da3934853ULL, 0xcb540aaa590bdf5eULL, + 0x5c94389f1a6d2cacULL, 0xe77daad8a0bbaed7ULL, 0x28efc5090ca0bf2aULL, + 0xbf2ff73c4fc64cd8ULL, 0xb37858b14df60320ULL, 0xf8c96ec0dfc724a7ULL, + 0x828680683f329f06ULL, 0x941cd051cd6a29ccULL, 0xc3c5c05cae2b5e05ULL, + 0xb601631dc2e27062ULL, 0xc01922382027843bULL, 0x24b86a840e90f0d2ULL, + 0xd245177a276ffc52ULL, 0x0f8b4de98c3c95c6ULL, 0x3e759530fef809e0ULL, + 0x0b4d2892792c5b65ULL, 0xc4df4743d5374a98ULL, 0xa5e20888bfaeb5eaULL, + 0xba56cc90c0d23f9aULL, 0x38d04cf8ffe0a09cULL, 0x62e1adafe495254cULL, + 0x0263bcb3f40867dfULL, 0xcaeb547d230f62bfULL, 0x6082111c109d4293ULL, + 0xdad4dd8cd04f7d09ULL, 0xefec602e579b2f8cULL, 0x1fb4c4187f7c8a70ULL, + 0xffd3e9dfa4db303aULL, 0x7bf0b07f9af10640ULL, 0xf49ec14dddf76b5fULL, + 0x8f6e713247066d1fULL, 0x339d646a86ccfbf9ULL, 0x64447467e58d8c30ULL, + 0x2c29a072f9b07189ULL, 0xd8b7613f24471ad6ULL, 0x6627c8d41185ebefULL, + 0xa347d140beb61c96ULL, 0xde12b8f7255fb3aaULL, 0x9d324470404e1576ULL, + 0x9306574eb6763d51ULL, 0xa80af9d2c79a47f3ULL, 0x859c0777442e8b9bULL, + 0x69ac853d9db97e29ULL + }, { + 0xc3407dfc2de6377eULL, 0x5b9e93eea4256f77ULL, 0xadb58fdd50c845e0ULL, + 0x5219ff11a75bed86ULL, 0x356b61cfd90b1de9ULL, 0xfb8f406e25abe037ULL, + 0x7a5a0231c0f60796ULL, 0x9d3cd216e1f5020bULL, 0x0c6550fb6b48d8f3ULL, + 0xf57508c427ff1c62ULL, 0x4ad35ffa71cb407dULL, 0x6290a2da1666aa6dULL, + 0xe284ec2349355f9fULL, 0xb3c307c53d7c84ecULL, 0x05e23c0468365a02ULL, + 0x190bac4d6c9ebfa8ULL, 0x94bbbee9e28b80faULL, 0xa34fc777529cb9b5ULL, + 0xcc7b39f095bcd978ULL, 0x2426addb0ce532e3ULL, 0x7e79329312ce4fc7ULL, + 0xab09a72eebec2917ULL, 0xf8d15499f6b9d6c2ULL, 0x1a55b8babf8c895dULL, + 0xdb8add17fb769a85ULL, 0xb57f2f368658e81bULL, 0x8acd36f18f3f41f6ULL, + 0x5ce3b7bba50f11d3ULL, 0x114dcc14d5ee2f0aULL, 0xb91a7fcded1030e8ULL, + 0x81d5425fe55de7a1ULL, 0xb6213bc1554adeeeULL, 0x80144ef95f53f5f2ULL, + 0x1e7688186db4c10cULL, 0x3b912965db5fe1bcULL, 0xc281715a97e8252dULL, + 0x54a5d7e21c7f8171ULL, 0x4b12535ccbc5522eULL, 0x1d289cefbea6f7f9ULL, + 0x6ef5f2217d2e729eULL, 0xe6a7dc819b0d17ceULL, 0x1b94b41c05829b0eULL, + 0x33d7493c622f711eULL, 0xdcf7f942fa5ce421ULL, 0x600fba8b7f7a8ecbULL, + 0x46b60f011a83988eULL, 0x235b898e0dcf4c47ULL, 0x957ab24f588592a9ULL, + 0x4354330572b5c28cULL, 0xa5f3ef84e9b8d542ULL, 0x8c711e02341b2d01ULL, + 0x0b1874ae6a62a657ULL, 0x1213d8e306fc19ffULL, 0xfe6d7c6a4d9dba35ULL, + 0x65ed868f174cd4c9ULL, 0x88522ea0e6236550ULL, 0x899322065c2d7703ULL, + 0xc01e690bfef4018bULL, 0x915982ed8abddaf8ULL, 0xbe675b98ec3a4e4cULL, + 0xa996bf7f82f00db1ULL, 0xe1daf8d49a27696aULL, 0x2effd5d3dc8986e7ULL, + 0xd153a51f2b1a2e81ULL, 0x18caa0ebd690adfbULL, 0x390e3134b243c51aULL, + 0x2778b92cdff70416ULL, 0x029f1851691c24a6ULL, 0x5e7cafeacc133575ULL, + 0xfa4e4cc89fa5f264ULL, 0x5a5f9f481e2b7d24ULL, 0x484c47ab18d764dbULL, + 0x400a27f2a1a7f479ULL, 0xaeeb9b2a83da7315ULL, 0x721c626879869734ULL, + 0x042330a2d2384851ULL, 0x85f672fd3765aff0ULL, 0xba446b3a3e02061dULL, + 0x73dd6ecec3888567ULL, 0xffac70ccf793a866ULL, 0xdfa9edb5294ed2d4ULL, + 0x6c6aea7014325638ULL, 0x834a5a0e8c41c307ULL, 0xcdba35562fb2cb2bULL, + 0x0ad97808d06cb404ULL, 0x0f3b440cb85aee06ULL, 0xe5f9c876481f213bULL, + 0x98deee1289c35809ULL, 0x59018bbfcd394bd1ULL, 0xe01bf47220297b39ULL, + 0xde68e1139340c087ULL, 0x9fa3ca4788e926adULL, 0xbb85679c840c144eULL, + 0x53d8f3b71d55ffd5ULL, 0x0da45c5dd146caa0ULL, 0x6f34fe87c72060cdULL, + 0x57fbc315cf6db784ULL, 0xcee421a1fca0fddeULL, 0x3d2d0196607b8d4bULL, + 0x642c8a29ad42c69aULL, 0x14aff010bdd87508ULL, 0xac74837beac657b3ULL, + 0x3216459ad821634dULL, 0x3fb219c70967a9edULL, 0x06bc28f3bb246cf7ULL, + 0xf2082c9126d562c6ULL, 0x66b39278c45ee23cULL, 0xbd394f6f3f2878b9ULL, + 0xfd33689d9e8f8cc0ULL, 0x37f4799eb017394fULL, 0x108cc0b26fe03d59ULL, + 0xda4bd1b1417888d6ULL, 0xb09d1332ee6eb219ULL, 0x2f3ed975668794b4ULL, + 0x58c0871977375982ULL, 0x7561463d78ace990ULL, 0x09876cff037e82f1ULL, + 0x7fb83e35a8c05d94ULL, 0x26b9b58a65f91645ULL, 0xef20b07e9873953fULL, + 0x3148516d0b3355b8ULL, 0x41cb2b541ba9e62aULL, 0x790416c613e43163ULL, + 0xa011d380818e8f40ULL, 0x3a5025c36151f3efULL, 0xd57095bdf92266d0ULL, + 0x498d4b0da2d97688ULL, 0x8b0c3a57353153a5ULL, 0x21c491df64d368e1ULL, + 0x8f2f0af5e7091bf4ULL, 0x2da1c1240f9bb012ULL, 0xc43d59a92ccc49daULL, + 0xbfa6573e56345c1fULL, 0x828b56a8364fd154ULL, 0x9a41f643e0df7cafULL, + 0xbcf843c985266aeaULL, 0x2b1de9d7b4bfdce5ULL, 0x20059d79dedd7ab2ULL, + 0x6dabe6d6ae3c446bULL, 0x45e81bf6c991ae7bULL, 0x6351ae7cac68b83eULL, + 0xa432e32253b6c711ULL, 0xd092a9b991143cd2ULL, 0xcac711032e98b58fULL, + 0xd8d4c9e02864ac70ULL, 0xc5fc550f96c25b89ULL, 0xd7ef8dec903e4276ULL, + 0x67729ede7e50f06fULL, 0xeac28c7af045cf3dULL, 0xb15c1f945460a04aULL, + 0x9cfddeb05bfb1058ULL, 0x93c69abce3a1fe5eULL, 0xeb0380dc4a4bdd6eULL, + 0xd20db1e8f8081874ULL, 0x229a8528b7c15e14ULL, 0x44291750739fbc28ULL, + 0xd3ccbd4e42060a27ULL, 0xf62b1c33f4ed2a97ULL, 0x86a8660ae4779905ULL, + 0xd62e814a2a305025ULL, 0x477703a7a08d8addULL, 0x7b9b0e977af815c5ULL, + 0x78c51a60a9ea2330ULL, 0xa6adfb733aaae3b7ULL, 0x97e5aa1e3199b60fULL, + 0x0000000000000000ULL, 0xf4b404629df10e31ULL, 0x5564db44a6719322ULL, + 0x9207961a59afec0dULL, 0x9624a6b88b97a45cULL, 0x363575380a192b1cULL, + 0x2c60cd82b595a241ULL, 0x7d272664c1dc7932ULL, 0x7142769faa94a1c1ULL, + 0xa1d0df263b809d13ULL, 0x1630e841d4c451aeULL, 0xc1df65ad44fa13d8ULL, + 0x13d2d445bcf20bacULL, 0xd915c546926abe23ULL, 0x38cf3d92084dd749ULL, + 0xe766d0272103059dULL, 0xc7634d5effde7f2fULL, 0x077d2455012a7ea4ULL, + 0xedbfa82ff16fb199ULL, 0xaf2a978c39d46146ULL, 0x42953fa3c8bbd0dfULL, + 0xcb061da59496a7dcULL, 0x25e7a17db6eb20b0ULL, 0x34aa6d6963050fbaULL, + 0xa76cf7d580a4f1e4ULL, 0xf7ea10954ee338c4ULL, 0xfcf2643b24819e93ULL, + 0xcf252d0746aeef8dULL, 0x4ef06f58a3f3082cULL, 0x563acfb37563a5d7ULL, + 0x5086e740ce47c920ULL, 0x2982f186dda3f843ULL, 0x87696aac5e798b56ULL, + 0x5d22bb1d1f010380ULL, 0x035e14f7d31236f5ULL, 0x3cec0d30da759f18ULL, + 0xf3c920379cdb7095ULL, 0xb8db736b571e22bbULL, 0xdd36f5e44052f672ULL, + 0xaac8ab8851e23b44ULL, 0xa857b3d938fe1fe2ULL, 0x17f1e4e76eca43fdULL, + 0xec7ea4894b61a3caULL, 0x9e62c6e132e734feULL, 0xd4b1991b432c7483ULL, + 0x6ad6c283af163acfULL, 0x1ce9904904a8e5aaULL, 0x5fbda34c761d2726ULL, + 0xf910583f4cb7c491ULL, 0xc6a241f845d06d7cULL, 0x4f3163fe19fd1a7fULL, + 0xe99c988d2357f9c8ULL, 0x8eee06535d0709a7ULL, 0x0efa48aa0254fc55ULL, + 0xb4be23903c56fa48ULL, 0x763f52caabbedf65ULL, 0xeee1bcd8227d876cULL, + 0xe345e085f33b4dccULL, 0x3e731561b369bbbeULL, 0x2843fd2067adea10ULL, + 0x2adce5710eb1ceb6ULL, 0xb7e03767ef44ccbdULL, 0x8db012a48e153f52ULL, + 0x61ceb62dc5749c98ULL, 0xe85d942b9959eb9bULL, 0x4c6f7709caef2c8aULL, + 0x84377e5b8d6bbda3ULL, 0x30895dcbb13d47ebULL, 0x74a04a9bc2a2fbc3ULL, + 0x6b17ce251518289cULL, 0xe438c4d0f2113368ULL, 0x1fb784bed7bad35fULL, + 0x9b80fae55ad16efcULL, 0x77fe5e6c11b0cd36ULL, 0xc858095247849129ULL, + 0x08466059b97090a2ULL, 0x01c10ca6ba0e1253ULL, 0x6988d6747c040c3aULL, + 0x6849dad2c60a1e69ULL, 0x5147ebe67449db73ULL, 0xc99905f4fd8a837aULL, + 0x991fe2b433cd4a5aULL, 0xf09734c04fc94660ULL, 0xa28ecbd1e892abe6ULL, + 0xf1563866f5c75433ULL, 0x4dae7baf70e13ed9ULL, 0x7ce62ac27bd26b61ULL, + 0x70837a39109ab392ULL, 0x90988e4b30b3c8abULL, 0xb2020b63877296bfULL, + 0x156efcb607d6675bULL + }, { + 0xe63f55ce97c331d0ULL, 0x25b506b0015bba16ULL, 0xc8706e29e6ad9ba8ULL, + 0x5b43d3775d521f6aULL, 0x0bfa3d577035106eULL, 0xab95fc172afb0e66ULL, + 0xf64b63979e7a3276ULL, 0xf58b4562649dad4bULL, 0x48f7c3dbae0c83f1ULL, + 0xff31916642f5c8c5ULL, 0xcbb048dc1c4a0495ULL, 0x66b8f83cdf622989ULL, + 0x35c130e908e2b9b0ULL, 0x7c761a61f0b34fa1ULL, 0x3601161cf205268dULL, + 0x9e54ccfe2219b7d6ULL, 0x8b7d90a538940837ULL, 0x9cd403588ea35d0bULL, + 0xbc3c6fea9ccc5b5aULL, 0xe5ff733b6d24aeedULL, 0xceed22de0f7eb8d2ULL, + 0xec8581cab1ab545eULL, 0xb96105e88ff8e71dULL, 0x8ca03501871a5eadULL, + 0x76ccce65d6db2a2fULL, 0x5883f582a7b58057ULL, 0x3f7be4ed2e8adc3eULL, + 0x0fe7be06355cd9c9ULL, 0xee054e6c1d11be83ULL, 0x1074365909b903a6ULL, + 0x5dde9f80b4813c10ULL, 0x4a770c7d02b6692cULL, 0x5379c8d5d7809039ULL, + 0xb4067448161ed409ULL, 0x5f5e5026183bd6cdULL, 0xe898029bf4c29df9ULL, + 0x7fb63c940a54d09cULL, 0xc5171f897f4ba8bcULL, 0xa6f28db7b31d3d72ULL, + 0x2e4f3be7716eaa78ULL, 0x0d6771a099e63314ULL, 0x82076254e41bf284ULL, + 0x2f0fd2b42733df98ULL, 0x5c9e76d3e2dc49f0ULL, 0x7aeb569619606cdbULL, + 0x83478b07b2468764ULL, 0xcfadcb8d5923cd32ULL, 0x85dac7f05b95a41eULL, + 0xb5469d1b4043a1e9ULL, 0xb821ecbbd9a592fdULL, 0x1b8e0b0e798c13c8ULL, + 0x62a57b6d9a0be02eULL, 0xfcf1b793b81257f8ULL, 0x9d94ea0bd8fe28ebULL, + 0x4cea408aeb654a56ULL, 0x23284a47e888996cULL, 0x2d8f1d128b893545ULL, + 0xf4cbac3132c0d8abULL, 0xbd7c86b9ca912ebaULL, 0x3a268eef3dbe6079ULL, + 0xf0d62f6077a9110cULL, 0x2735c916ade150cbULL, 0x89fd5f03942ee2eaULL, + 0x1acee25d2fd16628ULL, 0x90f39bab41181bffULL, 0x430dfe8cde39939fULL, + 0xf70b8ac4c8274796ULL, 0x1c53aeaac6024552ULL, 0x13b410acf35e9c9bULL, + 0xa532ab4249faa24fULL, 0x2b1251e5625a163fULL, 0xd7e3e676da4841c7ULL, + 0xa7b264e4e5404892ULL, 0xda8497d643ae72d3ULL, 0x861ae105a1723b23ULL, + 0x38a6414991048aa4ULL, 0x6578dec92585b6b4ULL, 0x0280cfa6acbaeaddULL, + 0x88bdb650c273970aULL, 0x9333bd5ebbff84c2ULL, 0x4e6a8f2c47dfa08bULL, + 0x321c954db76cef2aULL, 0x418d312a72837942ULL, 0xb29b38bfffcdf773ULL, + 0x6c022c38f90a4c07ULL, 0x5a033a240b0f6a8aULL, 0x1f93885f3ce5da6fULL, + 0xc38a537e96988bc6ULL, 0x39e6a81ac759ff44ULL, 0x29929e43cee0fce2ULL, + 0x40cdd87924de0ca2ULL, 0xe9d8ebc8a29fe819ULL, 0x0c2798f3cfbb46f4ULL, + 0x55e484223e53b343ULL, 0x4650948ecd0d2fd8ULL, 0x20e86cb2126f0651ULL, + 0x6d42c56baf5739e7ULL, 0xa06fc1405ace1e08ULL, 0x7babbfc54f3d193bULL, + 0x424d17df8864e67fULL, 0xd8045870ef14980eULL, 0xc6d7397c85ac3781ULL, + 0x21a885e1443273b1ULL, 0x67f8116f893f5c69ULL, 0x24f5efe35706cff6ULL, + 0xd56329d076f2ab1aULL, 0x5e1eb9754e66a32dULL, 0x28d2771098bd8902ULL, + 0x8f6013f47dfdc190ULL, 0x17a993fdb637553cULL, 0xe0a219397e1012aaULL, + 0x786b9930b5da8606ULL, 0x6e82e39e55b0a6daULL, 0x875a0856f72f4ec3ULL, + 0x3741ff4fa458536dULL, 0xac4859b3957558fcULL, 0x7ef6d5c75c09a57cULL, + 0xc04a758b6c7f14fbULL, 0xf9acdd91ab26ebbfULL, 0x7391a467c5ef9668ULL, + 0x335c7c1ee1319acaULL, 0xa91533b18641e4bbULL, 0xe4bf9a683b79db0dULL, + 0x8e20faa72ba0b470ULL, 0x51f907737b3a7ae4ULL, 0x2268a314bed5ec8cULL, + 0xd944b123b949edeeULL, 0x31dcb3b84d8b7017ULL, 0xd3fe65279f218860ULL, + 0x097af2f1dc8ffab3ULL, 0x9b09a6fc312d0b91ULL, 0xcc6ded78a3c4520fULL, + 0x3481d9ba5ebfcc50ULL, 0x4f2a667f1182d56bULL, 0xdfd9fdd4509ace94ULL, + 0x26752045fbbc252bULL, 0xbffc491f662bc467ULL, 0xdd593272fc202449ULL, + 0x3cbbc218d46d4303ULL, 0x91b372f817456e1fULL, 0x681faf69bc6385a0ULL, + 0xb686bbeebaa43ed4ULL, 0x1469b5084cd0ca01ULL, 0x98c98009cbca94acULL, + 0x6438379a73d8c354ULL, 0xc2caba2dc0c5fe26ULL, 0x3e3b0dbe78d7a9deULL, + 0x50b9ee202d670f04ULL, 0x4590b27b37eab0e5ULL, 0x6025b4cb36b10af3ULL, + 0xfb2c1237079c0162ULL, 0xa12f28130c936be8ULL, 0x4b37e52e54eb1cccULL, + 0x083a1ba28ad28f53ULL, 0xc10a9cd83a22611bULL, 0x9f1425ad7444c236ULL, + 0x069d4cf7e9d3237aULL, 0xedc56899e7f621beULL, 0x778c273680865fcfULL, + 0x309c5aeb1bd605f7ULL, 0x8de0dc52d1472b4dULL, 0xf8ec34c2fd7b9e5fULL, + 0xea18cd3d58787724ULL, 0xaad515447ca67b86ULL, 0x9989695a9d97e14cULL, + 0x0000000000000000ULL, 0xf196c63321f464ecULL, 0x71116bc169557cb5ULL, + 0xaf887f466f92c7c1ULL, 0x972e3e0ffe964d65ULL, 0x190ec4a8d536f915ULL, + 0x95aef1a9522ca7b8ULL, 0xdc19db21aa7d51a9ULL, 0x94ee18fa0471d258ULL, + 0x8087adf248a11859ULL, 0xc457f6da2916dd5cULL, 0xfa6cfb6451c17482ULL, + 0xf256e0c6db13fbd1ULL, 0x6a9f60cf10d96f7dULL, 0x4daaa9d9bd383fb6ULL, + 0x03c026f5fae79f3dULL, 0xde99148706c7bb74ULL, 0x2a52b8b6340763dfULL, + 0x6fc20acd03edd33aULL, 0xd423c08320afdefaULL, 0xbbe1ca4e23420dc0ULL, + 0x966ed75ca8cb3885ULL, 0xeb58246e0e2502c4ULL, 0x055d6a021334bc47ULL, + 0xa47242111fa7d7afULL, 0xe3623fcc84f78d97ULL, 0x81c744a11efc6db9ULL, + 0xaec8961539cfb221ULL, 0xf31609958d4e8e31ULL, 0x63e5923ecc5695ceULL, + 0x47107ddd9b505a38ULL, 0xa3afe7b5a0298135ULL, 0x792b7063e387f3e6ULL, + 0x0140e953565d75e0ULL, 0x12f4f9ffa503e97bULL, 0x750ce8902c3cb512ULL, + 0xdbc47e8515f30733ULL, 0x1ed3610c6ab8af8fULL, 0x5239218681dde5d9ULL, + 0xe222d69fd2aaf877ULL, 0xfe71783514a8bd25ULL, 0xcaf0a18f4a177175ULL, + 0x61655d9860ec7f13ULL, 0xe77fbc9dc19e4430ULL, 0x2ccff441ddd440a5ULL, + 0x16e97aaee06a20dcULL, 0xa855dae2d01c915bULL, 0x1d1347f9905f30b2ULL, + 0xb7c652bdecf94b34ULL, 0xd03e43d265c6175dULL, 0xfdb15ec0ee4f2218ULL, + 0x57644b8492e9599eULL, 0x07dda5a4bf8e569aULL, 0x54a46d71680ec6a3ULL, + 0x5624a2d7c4b42c7eULL, 0xbebca04c3076b187ULL, 0x7d36f332a6ee3a41ULL, + 0x3b6667bc6be31599ULL, 0x695f463aea3ef040ULL, 0xad08b0e0c3282d1cULL, + 0xb15b1e4a052a684eULL, 0x44d05b2861b7c505ULL, 0x15295c5b1a8dbfe1ULL, + 0x744c01c37a61c0f2ULL, 0x59c31cd1f1e8f5b7ULL, 0xef45a73f4b4ccb63ULL, + 0x6bdf899c46841a9dULL, 0x3dfb2b4b823036e3ULL, 0xa2ef0ee6f674f4d5ULL, + 0x184e2dfb836b8cf5ULL, 0x1134df0a5fe47646ULL, 0xbaa1231d751f7820ULL, + 0xd17eaa81339b62bdULL, 0xb01bf71953771daeULL, 0x849a2ea30dc8d1feULL, + 0x705182923f080955ULL, 0x0ea757556301ac29ULL, 0x041d83514569c9a7ULL, + 0x0abad4042668658eULL, 0x49b72a88f851f611ULL, 0x8a3d79f66ec97dd7ULL, + 0xcd2d042bf59927efULL, 0xc930877ab0f0ee48ULL, 0x9273540deda2f122ULL, + 0xc797d02fd3f14261ULL, 0xe1e2f06a284d674aULL, 0xd2be8c74c97cfd80ULL, + 0x9a494faf67707e71ULL, 0xb3dbd1eca9908293ULL, 0x72d14d3493b2e388ULL, + 0xd6a30f258c153427ULL + } +}; /* Ax */ + +static void streebog_xor(const struct streebog_uint512 *x, + const struct streebog_uint512 *y, + struct streebog_uint512 *z) +{ + z->qword[0] = x->qword[0] ^ y->qword[0]; + z->qword[1] = x->qword[1] ^ y->qword[1]; + z->qword[2] = x->qword[2] ^ y->qword[2]; + z->qword[3] = x->qword[3] ^ y->qword[3]; + z->qword[4] = x->qword[4] ^ y->qword[4]; + z->qword[5] = x->qword[5] ^ y->qword[5]; + z->qword[6] = x->qword[6] ^ y->qword[6]; + z->qword[7] = x->qword[7] ^ y->qword[7]; +} + +static void streebog_xlps(const struct streebog_uint512 *x, + const struct streebog_uint512 *y, + struct streebog_uint512 *data) +{ + u64 r0, r1, r2, r3, r4, r5, r6, r7; + int i; + + r0 = le64_to_cpu(x->qword[0] ^ y->qword[0]); + r1 = le64_to_cpu(x->qword[1] ^ y->qword[1]); + r2 = le64_to_cpu(x->qword[2] ^ y->qword[2]); + r3 = le64_to_cpu(x->qword[3] ^ y->qword[3]); + r4 = le64_to_cpu(x->qword[4] ^ y->qword[4]); + r5 = le64_to_cpu(x->qword[5] ^ y->qword[5]); + r6 = le64_to_cpu(x->qword[6] ^ y->qword[6]); + r7 = le64_to_cpu(x->qword[7] ^ y->qword[7]); + + for (i = 0; i <= 7; i++) { + data->qword[i] = cpu_to_le64(Ax[0][r0 & 0xFF]); + data->qword[i] ^= cpu_to_le64(Ax[1][r1 & 0xFF]); + data->qword[i] ^= cpu_to_le64(Ax[2][r2 & 0xFF]); + data->qword[i] ^= cpu_to_le64(Ax[3][r3 & 0xFF]); + data->qword[i] ^= cpu_to_le64(Ax[4][r4 & 0xFF]); + data->qword[i] ^= cpu_to_le64(Ax[5][r5 & 0xFF]); + data->qword[i] ^= cpu_to_le64(Ax[6][r6 & 0xFF]); + data->qword[i] ^= cpu_to_le64(Ax[7][r7 & 0xFF]); + r0 >>= 8; + r1 >>= 8; + r2 >>= 8; + r3 >>= 8; + r4 >>= 8; + r5 >>= 8; + r6 >>= 8; + r7 >>= 8; + } +} + +static void streebog_round(int i, struct streebog_uint512 *Ki, + struct streebog_uint512 *data) +{ + streebog_xlps(Ki, &C[i], Ki); + streebog_xlps(Ki, data, data); +} + +static int streebog_init(struct shash_desc *desc) +{ + struct streebog_state *ctx = shash_desc_ctx(desc); + unsigned int digest_size = crypto_shash_digestsize(desc->tfm); + unsigned int i; + + memset(ctx, 0, sizeof(struct streebog_state)); + for (i = 0; i < 8; i++) { + if (digest_size == STREEBOG256_DIGEST_SIZE) + ctx->h.qword[i] = 0x0101010101010101ULL; + } + return 0; +} + +static void streebog_pad(struct streebog_state *ctx) +{ + if (ctx->fillsize >= STREEBOG_BLOCK_SIZE) + return; + + memset(ctx->buffer + ctx->fillsize, 0, + sizeof(ctx->buffer) - ctx->fillsize); + + ctx->buffer[ctx->fillsize] = 1; +} + +static void streebog_add512(const struct streebog_uint512 *x, + const struct streebog_uint512 *y, + struct streebog_uint512 *r) +{ + u64 carry = 0; + int i; + + for (i = 0; i < 8; i++) { + const u64 left = le64_to_cpu(x->qword[i]); + u64 sum; + + sum = left + le64_to_cpu(y->qword[i]) + carry; + if (sum != left) + carry = (sum < left); + r->qword[i] = cpu_to_le64(sum); + } +} + +static void streebog_g(struct streebog_uint512 *h, + const struct streebog_uint512 *N, + const u8 *m) +{ + struct streebog_uint512 Ki, data; + unsigned int i; + + streebog_xlps(h, N, &data); + + /* Starting E() */ + Ki = data; + streebog_xlps(&Ki, (const struct streebog_uint512 *)&m[0], &data); + + for (i = 0; i < 11; i++) + streebog_round(i, &Ki, &data); + + streebog_xlps(&Ki, &C[11], &Ki); + streebog_xor(&Ki, &data, &data); + /* E() done */ + + streebog_xor(&data, h, &data); + streebog_xor(&data, (const struct streebog_uint512 *)&m[0], h); +} + +static void streebog_stage2(struct streebog_state *ctx, const u8 *data) +{ + streebog_g(&ctx->h, &ctx->N, data); + + streebog_add512(&ctx->N, &buffer512, &ctx->N); + streebog_add512(&ctx->Sigma, (const struct streebog_uint512 *)data, + &ctx->Sigma); +} + +static void streebog_stage3(struct streebog_state *ctx) +{ + struct streebog_uint512 buf = { { 0 } }; + + buf.qword[0] = cpu_to_le64(ctx->fillsize << 3); + streebog_pad(ctx); + + streebog_g(&ctx->h, &ctx->N, (const u8 *)&ctx->buffer); + streebog_add512(&ctx->N, &buf, &ctx->N); + streebog_add512(&ctx->Sigma, + (const struct streebog_uint512 *)&ctx->buffer[0], + &ctx->Sigma); + streebog_g(&ctx->h, &buffer0, (const u8 *)&ctx->N); + streebog_g(&ctx->h, &buffer0, (const u8 *)&ctx->Sigma); + memcpy(&ctx->hash, &ctx->h, sizeof(struct streebog_uint512)); +} + +static int streebog_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct streebog_state *ctx = shash_desc_ctx(desc); + size_t chunksize; + + if (ctx->fillsize) { + chunksize = STREEBOG_BLOCK_SIZE - ctx->fillsize; + if (chunksize > len) + chunksize = len; + memcpy(&ctx->buffer[ctx->fillsize], data, chunksize); + ctx->fillsize += chunksize; + len -= chunksize; + data += chunksize; + + if (ctx->fillsize == STREEBOG_BLOCK_SIZE) { + streebog_stage2(ctx, ctx->buffer); + ctx->fillsize = 0; + } + } + + while (len >= STREEBOG_BLOCK_SIZE) { + streebog_stage2(ctx, data); + data += STREEBOG_BLOCK_SIZE; + len -= STREEBOG_BLOCK_SIZE; + } + + if (len) { + memcpy(&ctx->buffer, data, len); + ctx->fillsize = len; + } + return 0; +} + +static int streebog_final(struct shash_desc *desc, u8 *digest) +{ + struct streebog_state *ctx = shash_desc_ctx(desc); + + streebog_stage3(ctx); + ctx->fillsize = 0; + if (crypto_shash_digestsize(desc->tfm) == STREEBOG256_DIGEST_SIZE) + memcpy(digest, &ctx->hash.qword[4], STREEBOG256_DIGEST_SIZE); + else + memcpy(digest, &ctx->hash.qword[0], STREEBOG512_DIGEST_SIZE); + return 0; +} + +static struct shash_alg algs[2] = { { + .digestsize = STREEBOG256_DIGEST_SIZE, + .init = streebog_init, + .update = streebog_update, + .final = streebog_final, + .descsize = sizeof(struct streebog_state), + .base = { + .cra_name = "streebog256", + .cra_driver_name = "streebog256-generic", + .cra_blocksize = STREEBOG_BLOCK_SIZE, + .cra_module = THIS_MODULE, + }, +}, { + .digestsize = STREEBOG512_DIGEST_SIZE, + .init = streebog_init, + .update = streebog_update, + .final = streebog_final, + .descsize = sizeof(struct streebog_state), + .base = { + .cra_name = "streebog512", + .cra_driver_name = "streebog512-generic", + .cra_blocksize = STREEBOG_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static int __init streebog_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} + +static void __exit streebog_mod_fini(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} + +module_init(streebog_mod_init); +module_exit(streebog_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Vitaly Chikunov "); +MODULE_DESCRIPTION("Streebog Hash Function"); + +MODULE_ALIAS_CRYPTO("streebog256"); +MODULE_ALIAS_CRYPTO("streebog256-generic"); +MODULE_ALIAS_CRYPTO("streebog512"); +MODULE_ALIAS_CRYPTO("streebog512-generic"); diff --git a/include/crypto/streebog.h b/include/crypto/streebog.h new file mode 100644 index 000000000000..4af119f7e07b --- /dev/null +++ b/include/crypto/streebog.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-2-Clause */ +/* + * Copyright (c) 2013 Alexey Degtyarev + * Copyright (c) 2018 Vitaly Chikunov + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#ifndef _CRYPTO_STREEBOG_H_ +#define _CRYPTO_STREEBOG_H_ + +#include + +#define STREEBOG256_DIGEST_SIZE 32 +#define STREEBOG512_DIGEST_SIZE 64 +#define STREEBOG_BLOCK_SIZE 64 + +struct streebog_uint512 { + u64 qword[8]; +}; + +struct streebog_state { + u8 buffer[STREEBOG_BLOCK_SIZE]; + struct streebog_uint512 hash; + struct streebog_uint512 h; + struct streebog_uint512 N; + struct streebog_uint512 Sigma; + size_t fillsize; +}; + +#endif /* !_CRYPTO_STREEBOG_H_ */ -- cgit v1.2.3-59-g8ed1b From dfdda82e3b84c13601be09f8351ec4f15a4fbe03 Mon Sep 17 00:00:00 2001 From: Vitaly Chikunov Date: Wed, 7 Nov 2018 00:00:02 +0300 Subject: crypto: streebog - register Streebog in hash info for IMA Register Streebog hash function in Hash Info arrays to let IMA use it for its purposes. Cc: linux-integrity@vger.kernel.org Signed-off-by: Vitaly Chikunov Reviewed-by: Mimi Zohar Signed-off-by: Herbert Xu --- crypto/hash_info.c | 4 ++++ include/crypto/hash_info.h | 1 + include/uapi/linux/hash_info.h | 2 ++ 3 files changed, 7 insertions(+) diff --git a/crypto/hash_info.c b/crypto/hash_info.c index 7b1e0b188ce6..1dd095e4b451 100644 --- a/crypto/hash_info.c +++ b/crypto/hash_info.c @@ -32,6 +32,8 @@ const char *const hash_algo_name[HASH_ALGO__LAST] = { [HASH_ALGO_TGR_160] = "tgr160", [HASH_ALGO_TGR_192] = "tgr192", [HASH_ALGO_SM3_256] = "sm3-256", + [HASH_ALGO_STREEBOG_256] = "streebog256", + [HASH_ALGO_STREEBOG_512] = "streebog512", }; EXPORT_SYMBOL_GPL(hash_algo_name); @@ -54,5 +56,7 @@ const int hash_digest_size[HASH_ALGO__LAST] = { [HASH_ALGO_TGR_160] = TGR160_DIGEST_SIZE, [HASH_ALGO_TGR_192] = TGR192_DIGEST_SIZE, [HASH_ALGO_SM3_256] = SM3256_DIGEST_SIZE, + [HASH_ALGO_STREEBOG_256] = STREEBOG256_DIGEST_SIZE, + [HASH_ALGO_STREEBOG_512] = STREEBOG512_DIGEST_SIZE, }; EXPORT_SYMBOL_GPL(hash_digest_size); diff --git a/include/crypto/hash_info.h b/include/crypto/hash_info.h index 56f217d41f12..91786b68dbdb 100644 --- a/include/crypto/hash_info.h +++ b/include/crypto/hash_info.h @@ -15,6 +15,7 @@ #include #include +#include #include diff --git a/include/uapi/linux/hash_info.h b/include/uapi/linux/hash_info.h index eea5d02c58de..74a8609fcb4d 100644 --- a/include/uapi/linux/hash_info.h +++ b/include/uapi/linux/hash_info.h @@ -33,6 +33,8 @@ enum hash_algo { HASH_ALGO_TGR_160, HASH_ALGO_TGR_192, HASH_ALGO_SM3_256, + HASH_ALGO_STREEBOG_256, + HASH_ALGO_STREEBOG_512, HASH_ALGO__LAST }; -- cgit v1.2.3-59-g8ed1b From 25a0b9d4e512ea04d80c84bd5e3b9e2722b92ec1 Mon Sep 17 00:00:00 2001 From: Vitaly Chikunov Date: Wed, 7 Nov 2018 00:00:03 +0300 Subject: crypto: streebog - add Streebog test vectors Add testmgr and tcrypt tests and vectors for Streebog hash function from RFC 6986 and GOST R 34.11-2012, for HMAC-Streebog vectors are from RFC 7836 and R 50.1.113-2016. Cc: linux-integrity@vger.kernel.org Signed-off-by: Vitaly Chikunov Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 40 ++++++++++++++++++- crypto/testmgr.c | 24 ++++++++++++ crypto/testmgr.h | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 179 insertions(+), 1 deletion(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 1026173d721a..5fb120474902 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -76,7 +76,9 @@ static char *check[] = { "cast6", "arc4", "michael_mic", "deflate", "crc32c", "tea", "xtea", "khazad", "wp512", "wp384", "wp256", "tnepres", "xeta", "fcrypt", "camellia", "seed", "salsa20", "rmd128", "rmd160", "rmd256", "rmd320", - "lzo", "cts", "sha3-224", "sha3-256", "sha3-384", "sha3-512", NULL + "lzo", "cts", "sha3-224", "sha3-256", "sha3-384", "sha3-512", + "streebog256", "streebog512", + NULL }; static u32 block_sizes[] = { 16, 64, 256, 1024, 8192, 0 }; @@ -1914,6 +1916,14 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) ret += tcrypt_test("sm3"); break; + case 53: + ret += tcrypt_test("streebog256"); + break; + + case 54: + ret += tcrypt_test("streebog512"); + break; + case 100: ret += tcrypt_test("hmac(md5)"); break; @@ -1970,6 +1980,14 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) ret += tcrypt_test("hmac(sha3-512)"); break; + case 115: + ret += tcrypt_test("hmac(streebog256)"); + break; + + case 116: + ret += tcrypt_test("hmac(streebog512)"); + break; + case 150: ret += tcrypt_test("ansi_cprng"); break; @@ -2412,6 +2430,16 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) test_hash_speed("sm3", sec, generic_hash_speed_template); if (mode > 300 && mode < 400) break; /* fall through */ + case 327: + test_hash_speed("streebog256", sec, + generic_hash_speed_template); + if (mode > 300 && mode < 400) break; + /* fall through */ + case 328: + test_hash_speed("streebog512", sec, + generic_hash_speed_template); + if (mode > 300 && mode < 400) break; + /* fall through */ case 399: break; @@ -2525,6 +2553,16 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) num_mb); if (mode > 400 && mode < 500) break; /* fall through */ + case 426: + test_mb_ahash_speed("streebog256", sec, + generic_hash_speed_template, num_mb); + if (mode > 400 && mode < 500) break; + /* fall through */ + case 427: + test_mb_ahash_speed("streebog512", sec, + generic_hash_speed_template, num_mb); + if (mode > 400 && mode < 500) break; + /* fall through */ case 499: break; diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 512ebf697f7e..379794a259a7 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -3192,6 +3192,18 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .hash = __VECS(hmac_sha512_tv_template) } + }, { + .alg = "hmac(streebog256)", + .test = alg_test_hash, + .suite = { + .hash = __VECS(hmac_streebog256_tv_template) + } + }, { + .alg = "hmac(streebog512)", + .test = alg_test_hash, + .suite = { + .hash = __VECS(hmac_streebog512_tv_template) + } }, { .alg = "jitterentropy_rng", .fips_allowed = 1, @@ -3504,6 +3516,18 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .hash = __VECS(sm3_tv_template) } + }, { + .alg = "streebog256", + .test = alg_test_hash, + .suite = { + .hash = __VECS(streebog256_tv_template) + } + }, { + .alg = "streebog512", + .test = alg_test_hash, + .suite = { + .hash = __VECS(streebog512_tv_template) + } }, { .alg = "tgr128", .test = alg_test_hash, diff --git a/crypto/testmgr.h b/crypto/testmgr.h index b5b0d29761ce..fc1b6c0e9ed2 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -2307,6 +2307,122 @@ static const struct hash_testvec crct10dif_tv_template[] = { } }; +/* + * Streebog test vectors from RFC 6986 and GOST R 34.11-2012 + */ +static const struct hash_testvec streebog256_tv_template[] = { + { /* M1 */ + .plaintext = "012345678901234567890123456789012345678901234567890123456789012", + .psize = 63, + .digest = + "\x9d\x15\x1e\xef\xd8\x59\x0b\x89" + "\xda\xa6\xba\x6c\xb7\x4a\xf9\x27" + "\x5d\xd0\x51\x02\x6b\xb1\x49\xa4" + "\x52\xfd\x84\xe5\xe5\x7b\x55\x00", + }, + { /* M2 */ + .plaintext = + "\xd1\xe5\x20\xe2\xe5\xf2\xf0\xe8" + "\x2c\x20\xd1\xf2\xf0\xe8\xe1\xee" + "\xe6\xe8\x20\xe2\xed\xf3\xf6\xe8" + "\x2c\x20\xe2\xe5\xfe\xf2\xfa\x20" + "\xf1\x20\xec\xee\xf0\xff\x20\xf1" + "\xf2\xf0\xe5\xeb\xe0\xec\xe8\x20" + "\xed\xe0\x20\xf5\xf0\xe0\xe1\xf0" + "\xfb\xff\x20\xef\xeb\xfa\xea\xfb" + "\x20\xc8\xe3\xee\xf0\xe5\xe2\xfb", + .psize = 72, + .digest = + "\x9d\xd2\xfe\x4e\x90\x40\x9e\x5d" + "\xa8\x7f\x53\x97\x6d\x74\x05\xb0" + "\xc0\xca\xc6\x28\xfc\x66\x9a\x74" + "\x1d\x50\x06\x3c\x55\x7e\x8f\x50", + }, +}; + +static const struct hash_testvec streebog512_tv_template[] = { + { /* M1 */ + .plaintext = "012345678901234567890123456789012345678901234567890123456789012", + .psize = 63, + .digest = + "\x1b\x54\xd0\x1a\x4a\xf5\xb9\xd5" + "\xcc\x3d\x86\xd6\x8d\x28\x54\x62" + "\xb1\x9a\xbc\x24\x75\x22\x2f\x35" + "\xc0\x85\x12\x2b\xe4\xba\x1f\xfa" + "\x00\xad\x30\xf8\x76\x7b\x3a\x82" + "\x38\x4c\x65\x74\xf0\x24\xc3\x11" + "\xe2\xa4\x81\x33\x2b\x08\xef\x7f" + "\x41\x79\x78\x91\xc1\x64\x6f\x48", + }, + { /* M2 */ + .plaintext = + "\xd1\xe5\x20\xe2\xe5\xf2\xf0\xe8" + "\x2c\x20\xd1\xf2\xf0\xe8\xe1\xee" + "\xe6\xe8\x20\xe2\xed\xf3\xf6\xe8" + "\x2c\x20\xe2\xe5\xfe\xf2\xfa\x20" + "\xf1\x20\xec\xee\xf0\xff\x20\xf1" + "\xf2\xf0\xe5\xeb\xe0\xec\xe8\x20" + "\xed\xe0\x20\xf5\xf0\xe0\xe1\xf0" + "\xfb\xff\x20\xef\xeb\xfa\xea\xfb" + "\x20\xc8\xe3\xee\xf0\xe5\xe2\xfb", + .psize = 72, + .digest = + "\x1e\x88\xe6\x22\x26\xbf\xca\x6f" + "\x99\x94\xf1\xf2\xd5\x15\x69\xe0" + "\xda\xf8\x47\x5a\x3b\x0f\xe6\x1a" + "\x53\x00\xee\xe4\x6d\x96\x13\x76" + "\x03\x5f\xe8\x35\x49\xad\xa2\xb8" + "\x62\x0f\xcd\x7c\x49\x6c\xe5\xb3" + "\x3f\x0c\xb9\xdd\xdc\x2b\x64\x60" + "\x14\x3b\x03\xda\xba\xc9\xfb\x28", + }, +}; + +/* + * Two HMAC-Streebog test vectors from RFC 7836 and R 50.1.113-2016 A + */ +static const struct hash_testvec hmac_streebog256_tv_template[] = { + { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17" + "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f", + .ksize = 32, + .plaintext = + "\x01\x26\xbd\xb8\x78\x00\xaf\x21" + "\x43\x41\x45\x65\x63\x78\x01\x00", + .psize = 16, + .digest = + "\xa1\xaa\x5f\x7d\xe4\x02\xd7\xb3" + "\xd3\x23\xf2\x99\x1c\x8d\x45\x34" + "\x01\x31\x37\x01\x0a\x83\x75\x4f" + "\xd0\xaf\x6d\x7c\xd4\x92\x2e\xd9", + }, +}; + +static const struct hash_testvec hmac_streebog512_tv_template[] = { + { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17" + "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f", + .ksize = 32, + .plaintext = + "\x01\x26\xbd\xb8\x78\x00\xaf\x21" + "\x43\x41\x45\x65\x63\x78\x01\x00", + .psize = 16, + .digest = + "\xa5\x9b\xab\x22\xec\xae\x19\xc6" + "\x5f\xbd\xe6\xe5\xf4\xe9\xf5\xd8" + "\x54\x9d\x31\xf0\x37\xf9\xdf\x9b" + "\x90\x55\x00\xe1\x71\x92\x3a\x77" + "\x3d\x5f\x15\x30\xf2\xed\x7e\x96" + "\x4c\xb2\xee\xdc\x29\xe9\xad\x2f" + "\x3a\xfe\x93\xb2\x81\x4f\x79\xf5" + "\x00\x0f\xfc\x03\x66\xc2\x51\xe6", + }, +}; + /* Example vectors below taken from * http://www.oscca.gov.cn/UpFile/20101222141857786.pdf * -- cgit v1.2.3-59-g8ed1b From 70db8b79e561a2965927a08ccdb06c796834a1b7 Mon Sep 17 00:00:00 2001 From: Leonard Crestez Date: Wed, 7 Nov 2018 15:33:31 +0000 Subject: dt-bindings: crypto: Mention clocks for mxs-dcp Explicit clock enabling is required on 6sll and 6ull so mention that standard clock bindings are used. Signed-off-by: Leonard Crestez Reviewed-by: Fabio Estevam Reviewed-by: Rob Herring Signed-off-by: Herbert Xu --- Documentation/devicetree/bindings/crypto/fsl-dcp.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/crypto/fsl-dcp.txt b/Documentation/devicetree/bindings/crypto/fsl-dcp.txt index 76a0b4e80e83..4e4d387e38a5 100644 --- a/Documentation/devicetree/bindings/crypto/fsl-dcp.txt +++ b/Documentation/devicetree/bindings/crypto/fsl-dcp.txt @@ -6,6 +6,8 @@ Required properties: - interrupts : Should contain MXS DCP interrupt numbers, VMI IRQ and DCP IRQ must be supplied, optionally Secure IRQ can be present, but is currently not implemented and not used. +- clocks : Clock reference (only required on some SOCs: 6ull and 6sll). +- clock-names : Must be "dcp". Example: -- cgit v1.2.3-59-g8ed1b From 57f002891e08c589abf09409f2d606dfd1164e32 Mon Sep 17 00:00:00 2001 From: Leonard Crestez Date: Wed, 7 Nov 2018 15:33:32 +0000 Subject: crypto: mxs-dcp - Add support for dcp clk On 6ull and 6sll the DCP block has a clock which needs to be explicitly enabled. Add minimal handling for this at probe/remove time. Signed-off-by: Leonard Crestez Reviewed-by: Fabio Estevam Signed-off-by: Herbert Xu --- drivers/crypto/mxs-dcp.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/mxs-dcp.c b/drivers/crypto/mxs-dcp.c index 4e6ff32f8a7e..a2105cf33abb 100644 --- a/drivers/crypto/mxs-dcp.c +++ b/drivers/crypto/mxs-dcp.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -82,6 +83,7 @@ struct dcp { spinlock_t lock[DCP_MAX_CHANS]; struct task_struct *thread[DCP_MAX_CHANS]; struct crypto_queue queue[DCP_MAX_CHANS]; + struct clk *dcp_clk; }; enum dcp_chan { @@ -1053,11 +1055,24 @@ static int mxs_dcp_probe(struct platform_device *pdev) /* Re-align the structure so it fits the DCP constraints. */ sdcp->coh = PTR_ALIGN(sdcp->coh, DCP_ALIGNMENT); - /* Restart the DCP block. */ - ret = stmp_reset_block(sdcp->base); + /* DCP clock is optional, only used on some SOCs */ + sdcp->dcp_clk = devm_clk_get(dev, "dcp"); + if (IS_ERR(sdcp->dcp_clk)) { + if (sdcp->dcp_clk != ERR_PTR(-ENOENT)) + return PTR_ERR(sdcp->dcp_clk); + sdcp->dcp_clk = NULL; + } + ret = clk_prepare_enable(sdcp->dcp_clk); if (ret) return ret; + /* Restart the DCP block. */ + ret = stmp_reset_block(sdcp->base); + if (ret) { + dev_err(dev, "Failed reset\n"); + goto err_disable_unprepare_clk; + } + /* Initialize control register. */ writel(MXS_DCP_CTRL_GATHER_RESIDUAL_WRITES | MXS_DCP_CTRL_ENABLE_CONTEXT_CACHING | 0xf, @@ -1094,7 +1109,8 @@ static int mxs_dcp_probe(struct platform_device *pdev) NULL, "mxs_dcp_chan/sha"); if (IS_ERR(sdcp->thread[DCP_CHAN_HASH_SHA])) { dev_err(dev, "Error starting SHA thread!\n"); - return PTR_ERR(sdcp->thread[DCP_CHAN_HASH_SHA]); + ret = PTR_ERR(sdcp->thread[DCP_CHAN_HASH_SHA]); + goto err_disable_unprepare_clk; } sdcp->thread[DCP_CHAN_CRYPTO] = kthread_run(dcp_chan_thread_aes, @@ -1151,6 +1167,10 @@ err_destroy_aes_thread: err_destroy_sha_thread: kthread_stop(sdcp->thread[DCP_CHAN_HASH_SHA]); + +err_disable_unprepare_clk: + clk_disable_unprepare(sdcp->dcp_clk); + return ret; } @@ -1170,6 +1190,8 @@ static int mxs_dcp_remove(struct platform_device *pdev) kthread_stop(sdcp->thread[DCP_CHAN_HASH_SHA]); kthread_stop(sdcp->thread[DCP_CHAN_CRYPTO]); + clk_disable_unprepare(sdcp->dcp_clk); + platform_set_drvdata(pdev, NULL); global_sdcp = NULL; -- cgit v1.2.3-59-g8ed1b From d239b10d4ceb986d998779a4ed81824368aca831 Mon Sep 17 00:00:00 2001 From: Horia Geantă Date: Thu, 8 Nov 2018 15:36:27 +0200 Subject: crypto: caam - add register map changes cf. Era 10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Era 10 changes the register map. The updates that affect the drivers: -new version registers are added -DBG_DBG[deco_state] field is moved to a new register - DBG_EXEC[19:16] @ 8_0E3Ch. Signed-off-by: Horia Geantă Signed-off-by: Herbert Xu --- drivers/crypto/caam/caamalg.c | 47 +++++++++++++++++-------- drivers/crypto/caam/caamalg_qi.c | 37 +++++++++++++++----- drivers/crypto/caam/caamhash.c | 20 ++++++++--- drivers/crypto/caam/caampkc.c | 10 ++++-- drivers/crypto/caam/caamrng.c | 10 +++++- drivers/crypto/caam/ctrl.c | 28 +++++++++++---- drivers/crypto/caam/desc.h | 7 ++++ drivers/crypto/caam/regs.h | 74 ++++++++++++++++++++++++++++++++++------ 8 files changed, 184 insertions(+), 49 deletions(-) diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c index 869f092432de..9f1414030bc2 100644 --- a/drivers/crypto/caam/caamalg.c +++ b/drivers/crypto/caam/caamalg.c @@ -3135,7 +3135,7 @@ static int __init caam_algapi_init(void) struct device *ctrldev; struct caam_drv_private *priv; int i = 0, err = 0; - u32 cha_vid, cha_inst, des_inst, aes_inst, md_inst; + u32 aes_vid, aes_inst, des_inst, md_vid, md_inst; unsigned int md_limit = SHA512_DIGEST_SIZE; bool registered = false; @@ -3168,14 +3168,34 @@ static int __init caam_algapi_init(void) * Register crypto algorithms the device supports. * First, detect presence and attributes of DES, AES, and MD blocks. */ - cha_vid = rd_reg32(&priv->ctrl->perfmon.cha_id_ls); - cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls); - des_inst = (cha_inst & CHA_ID_LS_DES_MASK) >> CHA_ID_LS_DES_SHIFT; - aes_inst = (cha_inst & CHA_ID_LS_AES_MASK) >> CHA_ID_LS_AES_SHIFT; - md_inst = (cha_inst & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT; + if (priv->era < 10) { + u32 cha_vid, cha_inst; + + cha_vid = rd_reg32(&priv->ctrl->perfmon.cha_id_ls); + aes_vid = cha_vid & CHA_ID_LS_AES_MASK; + md_vid = (cha_vid & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT; + + cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls); + des_inst = (cha_inst & CHA_ID_LS_DES_MASK) >> + CHA_ID_LS_DES_SHIFT; + aes_inst = cha_inst & CHA_ID_LS_AES_MASK; + md_inst = (cha_inst & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT; + } else { + u32 aesa, mdha; + + aesa = rd_reg32(&priv->ctrl->vreg.aesa); + mdha = rd_reg32(&priv->ctrl->vreg.mdha); + + aes_vid = (aesa & CHA_VER_VID_MASK) >> CHA_VER_VID_SHIFT; + md_vid = (mdha & CHA_VER_VID_MASK) >> CHA_VER_VID_SHIFT; + + des_inst = rd_reg32(&priv->ctrl->vreg.desa) & CHA_VER_NUM_MASK; + aes_inst = aesa & CHA_VER_NUM_MASK; + md_inst = mdha & CHA_VER_NUM_MASK; + } /* If MD is present, limit digest size based on LP256 */ - if (md_inst && ((cha_vid & CHA_ID_LS_MD_MASK) == CHA_ID_LS_MD_LP256)) + if (md_inst && md_vid == CHA_VER_VID_MD_LP256) md_limit = SHA256_DIGEST_SIZE; for (i = 0; i < ARRAY_SIZE(driver_algs); i++) { @@ -3196,10 +3216,10 @@ static int __init caam_algapi_init(void) * Check support for AES modes not available * on LP devices. */ - if ((cha_vid & CHA_ID_LS_AES_MASK) == CHA_ID_LS_AES_LP) - if ((t_alg->caam.class1_alg_type & OP_ALG_AAI_MASK) == - OP_ALG_AAI_XTS) - continue; + if (aes_vid == CHA_VER_VID_AES_LP && + (t_alg->caam.class1_alg_type & OP_ALG_AAI_MASK) == + OP_ALG_AAI_XTS) + continue; caam_skcipher_alg_init(t_alg); @@ -3236,9 +3256,8 @@ static int __init caam_algapi_init(void) * Check support for AES algorithms not available * on LP devices. */ - if ((cha_vid & CHA_ID_LS_AES_MASK) == CHA_ID_LS_AES_LP) - if (alg_aai == OP_ALG_AAI_GCM) - continue; + if (aes_vid == CHA_VER_VID_AES_LP && alg_aai == OP_ALG_AAI_GCM) + continue; /* * Skip algorithms requiring message digests diff --git a/drivers/crypto/caam/caamalg_qi.c b/drivers/crypto/caam/caamalg_qi.c index 23c9fc4975f8..c0d55310aade 100644 --- a/drivers/crypto/caam/caamalg_qi.c +++ b/drivers/crypto/caam/caamalg_qi.c @@ -2462,7 +2462,7 @@ static int __init caam_qi_algapi_init(void) struct device *ctrldev; struct caam_drv_private *priv; int i = 0, err = 0; - u32 cha_vid, cha_inst, des_inst, aes_inst, md_inst; + u32 aes_vid, aes_inst, des_inst, md_vid, md_inst; unsigned int md_limit = SHA512_DIGEST_SIZE; bool registered = false; @@ -2497,14 +2497,34 @@ static int __init caam_qi_algapi_init(void) * Register crypto algorithms the device supports. * First, detect presence and attributes of DES, AES, and MD blocks. */ - cha_vid = rd_reg32(&priv->ctrl->perfmon.cha_id_ls); - cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls); - des_inst = (cha_inst & CHA_ID_LS_DES_MASK) >> CHA_ID_LS_DES_SHIFT; - aes_inst = (cha_inst & CHA_ID_LS_AES_MASK) >> CHA_ID_LS_AES_SHIFT; - md_inst = (cha_inst & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT; + if (priv->era < 10) { + u32 cha_vid, cha_inst; + + cha_vid = rd_reg32(&priv->ctrl->perfmon.cha_id_ls); + aes_vid = cha_vid & CHA_ID_LS_AES_MASK; + md_vid = (cha_vid & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT; + + cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls); + des_inst = (cha_inst & CHA_ID_LS_DES_MASK) >> + CHA_ID_LS_DES_SHIFT; + aes_inst = cha_inst & CHA_ID_LS_AES_MASK; + md_inst = (cha_inst & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT; + } else { + u32 aesa, mdha; + + aesa = rd_reg32(&priv->ctrl->vreg.aesa); + mdha = rd_reg32(&priv->ctrl->vreg.mdha); + + aes_vid = (aesa & CHA_VER_VID_MASK) >> CHA_VER_VID_SHIFT; + md_vid = (mdha & CHA_VER_VID_MASK) >> CHA_VER_VID_SHIFT; + + des_inst = rd_reg32(&priv->ctrl->vreg.desa) & CHA_VER_NUM_MASK; + aes_inst = aesa & CHA_VER_NUM_MASK; + md_inst = mdha & CHA_VER_NUM_MASK; + } /* If MD is present, limit digest size based on LP256 */ - if (md_inst && ((cha_vid & CHA_ID_LS_MD_MASK) == CHA_ID_LS_MD_LP256)) + if (md_inst && md_vid == CHA_VER_VID_MD_LP256) md_limit = SHA256_DIGEST_SIZE; for (i = 0; i < ARRAY_SIZE(driver_algs); i++) { @@ -2556,8 +2576,7 @@ static int __init caam_qi_algapi_init(void) * Check support for AES algorithms not available * on LP devices. */ - if (((cha_vid & CHA_ID_LS_AES_MASK) == CHA_ID_LS_AES_LP) && - (alg_aai == OP_ALG_AAI_GCM)) + if (aes_vid == CHA_VER_VID_AES_LP && alg_aai == OP_ALG_AAI_GCM) continue; /* diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c index 46924affa0bd..81712aa5d0f2 100644 --- a/drivers/crypto/caam/caamhash.c +++ b/drivers/crypto/caam/caamhash.c @@ -3,6 +3,7 @@ * caam - Freescale FSL CAAM support for ahash functions of crypto API * * Copyright 2011 Freescale Semiconductor, Inc. + * Copyright 2018 NXP * * Based on caamalg.c crypto API driver. * @@ -1801,7 +1802,7 @@ static int __init caam_algapi_hash_init(void) int i = 0, err = 0; struct caam_drv_private *priv; unsigned int md_limit = SHA512_DIGEST_SIZE; - u32 cha_inst, cha_vid; + u32 md_inst, md_vid; dev_node = of_find_compatible_node(NULL, NULL, "fsl,sec-v4.0"); if (!dev_node) { @@ -1831,18 +1832,27 @@ static int __init caam_algapi_hash_init(void) * Register crypto algorithms the device supports. First, identify * presence and attributes of MD block. */ - cha_vid = rd_reg32(&priv->ctrl->perfmon.cha_id_ls); - cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls); + if (priv->era < 10) { + md_vid = (rd_reg32(&priv->ctrl->perfmon.cha_id_ls) & + CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT; + md_inst = (rd_reg32(&priv->ctrl->perfmon.cha_num_ls) & + CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT; + } else { + u32 mdha = rd_reg32(&priv->ctrl->vreg.mdha); + + md_vid = (mdha & CHA_VER_VID_MASK) >> CHA_VER_VID_SHIFT; + md_inst = mdha & CHA_VER_NUM_MASK; + } /* * Skip registration of any hashing algorithms if MD block * is not present. */ - if (!((cha_inst & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT)) + if (!md_inst) return -ENODEV; /* Limit digest size based on LP256 */ - if ((cha_vid & CHA_ID_LS_MD_MASK) == CHA_ID_LS_MD_LP256) + if (md_vid == CHA_VER_VID_MD_LP256) md_limit = SHA256_DIGEST_SIZE; INIT_LIST_HEAD(&hash_list); diff --git a/drivers/crypto/caam/caampkc.c b/drivers/crypto/caam/caampkc.c index 4fc209cbbeab..77ab28a2811a 100644 --- a/drivers/crypto/caam/caampkc.c +++ b/drivers/crypto/caam/caampkc.c @@ -3,6 +3,7 @@ * caam - Freescale FSL CAAM support for Public Key Cryptography * * Copyright 2016 Freescale Semiconductor, Inc. + * Copyright 2018 NXP * * There is no Shared Descriptor for PKC so that the Job Descriptor must carry * all the desired key parameters, input and output pointers. @@ -1017,7 +1018,7 @@ static int __init caam_pkc_init(void) struct platform_device *pdev; struct device *ctrldev; struct caam_drv_private *priv; - u32 cha_inst, pk_inst; + u32 pk_inst; int err; dev_node = of_find_compatible_node(NULL, NULL, "fsl,sec-v4.0"); @@ -1045,8 +1046,11 @@ static int __init caam_pkc_init(void) return -ENODEV; /* Determine public key hardware accelerator presence. */ - cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls); - pk_inst = (cha_inst & CHA_ID_LS_PK_MASK) >> CHA_ID_LS_PK_SHIFT; + if (priv->era < 10) + pk_inst = (rd_reg32(&priv->ctrl->perfmon.cha_num_ls) & + CHA_ID_LS_PK_MASK) >> CHA_ID_LS_PK_SHIFT; + else + pk_inst = rd_reg32(&priv->ctrl->vreg.pkha) & CHA_VER_NUM_MASK; /* Do not register algorithms if PKHA is not present. */ if (!pk_inst) diff --git a/drivers/crypto/caam/caamrng.c b/drivers/crypto/caam/caamrng.c index 4318b0aa6fb9..a387c8d49a62 100644 --- a/drivers/crypto/caam/caamrng.c +++ b/drivers/crypto/caam/caamrng.c @@ -3,6 +3,7 @@ * caam - Freescale FSL CAAM support for hw_random * * Copyright 2011 Freescale Semiconductor, Inc. + * Copyright 2018 NXP * * Based on caamalg.c crypto API driver. * @@ -309,6 +310,7 @@ static int __init caam_rng_init(void) struct platform_device *pdev; struct device *ctrldev; struct caam_drv_private *priv; + u32 rng_inst; int err; dev_node = of_find_compatible_node(NULL, NULL, "fsl,sec-v4.0"); @@ -336,7 +338,13 @@ static int __init caam_rng_init(void) return -ENODEV; /* Check for an instantiated RNG before registration */ - if (!(rd_reg32(&priv->ctrl->perfmon.cha_num_ls) & CHA_ID_LS_RNG_MASK)) + if (priv->era < 10) + rng_inst = (rd_reg32(&priv->ctrl->perfmon.cha_num_ls) & + CHA_ID_LS_RNG_MASK) >> CHA_ID_LS_RNG_SHIFT; + else + rng_inst = rd_reg32(&priv->ctrl->vreg.rng) & CHA_VER_NUM_MASK; + + if (!rng_inst) return -ENODEV; dev = caam_jr_alloc(); diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c index 3fc793193821..16bbc72f041a 100644 --- a/drivers/crypto/caam/ctrl.c +++ b/drivers/crypto/caam/ctrl.c @@ -3,6 +3,7 @@ * Controller-level driver, kernel property detection, initialization * * Copyright 2008-2012 Freescale Semiconductor, Inc. + * Copyright 2018 NXP */ #include @@ -106,7 +107,7 @@ static inline int run_descriptor_deco0(struct device *ctrldev, u32 *desc, struct caam_ctrl __iomem *ctrl = ctrlpriv->ctrl; struct caam_deco __iomem *deco = ctrlpriv->deco; unsigned int timeout = 100000; - u32 deco_dbg_reg, flags; + u32 deco_dbg_reg, deco_state, flags; int i; @@ -149,13 +150,22 @@ static inline int run_descriptor_deco0(struct device *ctrldev, u32 *desc, timeout = 10000000; do { deco_dbg_reg = rd_reg32(&deco->desc_dbg); + + if (ctrlpriv->era < 10) + deco_state = (deco_dbg_reg & DESC_DBG_DECO_STAT_MASK) >> + DESC_DBG_DECO_STAT_SHIFT; + else + deco_state = (rd_reg32(&deco->dbg_exec) & + DESC_DER_DECO_STAT_MASK) >> + DESC_DER_DECO_STAT_SHIFT; + /* * If an error occured in the descriptor, then * the DECO status field will be set to 0x0D */ - if ((deco_dbg_reg & DESC_DBG_DECO_STAT_MASK) == - DESC_DBG_DECO_STAT_HOST_ERR) + if (deco_state == DECO_STAT_HOST_ERR) break; + cpu_relax(); } while ((deco_dbg_reg & DESC_DBG_DECO_STAT_VALID) && --timeout); @@ -491,7 +501,7 @@ static int caam_probe(struct platform_device *pdev) struct caam_perfmon *perfmon; #endif u32 scfgr, comp_params; - u32 cha_vid_ls; + u8 rng_vid; int pg_size; int BLOCK_OFFSET = 0; @@ -733,15 +743,19 @@ static int caam_probe(struct platform_device *pdev) goto caam_remove; } - cha_vid_ls = rd_reg32(&ctrl->perfmon.cha_id_ls); + if (ctrlpriv->era < 10) + rng_vid = (rd_reg32(&ctrl->perfmon.cha_id_ls) & + CHA_ID_LS_RNG_MASK) >> CHA_ID_LS_RNG_SHIFT; + else + rng_vid = (rd_reg32(&ctrl->vreg.rng) & CHA_VER_VID_MASK) >> + CHA_VER_VID_SHIFT; /* * If SEC has RNG version >= 4 and RNG state handle has not been * already instantiated, do RNG instantiation * In case of SoCs with Management Complex, RNG is managed by MC f/w. */ - if (!ctrlpriv->mc_en && - (cha_vid_ls & CHA_ID_LS_RNG_MASK) >> CHA_ID_LS_RNG_SHIFT >= 4) { + if (!ctrlpriv->mc_en && rng_vid >= 4) { ctrlpriv->rng4_sh_init = rd_reg32(&ctrl->r4tst[0].rdsta); /* diff --git a/drivers/crypto/caam/desc.h b/drivers/crypto/caam/desc.h index f76ff160a02c..ec1ef06049b4 100644 --- a/drivers/crypto/caam/desc.h +++ b/drivers/crypto/caam/desc.h @@ -4,6 +4,7 @@ * Definitions to support CAAM descriptor instruction generation * * Copyright 2008-2011 Freescale Semiconductor, Inc. + * Copyright 2018 NXP */ #ifndef DESC_H @@ -1133,6 +1134,12 @@ #define OP_ALG_TYPE_CLASS1 (2 << OP_ALG_TYPE_SHIFT) #define OP_ALG_TYPE_CLASS2 (4 << OP_ALG_TYPE_SHIFT) +/* version register fields */ +#define OP_VER_CCHA_NUM 0x000000ff /* Number CCHAs instantiated */ +#define OP_VER_CCHA_MISC 0x0000ff00 /* CCHA Miscellaneous Information */ +#define OP_VER_CCHA_REV 0x00ff0000 /* CCHA Revision Number */ +#define OP_VER_CCHA_VID 0xff000000 /* CCHA Version ID */ + #define OP_ALG_ALGSEL_SHIFT 16 #define OP_ALG_ALGSEL_MASK (0xff << OP_ALG_ALGSEL_SHIFT) #define OP_ALG_ALGSEL_SUBMASK (0x0f << OP_ALG_ALGSEL_SHIFT) diff --git a/drivers/crypto/caam/regs.h b/drivers/crypto/caam/regs.h index 457815f965c0..3cd0822ea819 100644 --- a/drivers/crypto/caam/regs.h +++ b/drivers/crypto/caam/regs.h @@ -3,6 +3,7 @@ * CAAM hardware register-level view * * Copyright 2008-2011 Freescale Semiconductor, Inc. + * Copyright 2018 NXP */ #ifndef REGS_H @@ -211,6 +212,47 @@ struct jr_outentry { u32 jrstatus; /* Status for completed descriptor */ } __packed; +/* Version registers (Era 10+) e80-eff */ +struct version_regs { + u32 crca; /* CRCA_VERSION */ + u32 afha; /* AFHA_VERSION */ + u32 kfha; /* KFHA_VERSION */ + u32 pkha; /* PKHA_VERSION */ + u32 aesa; /* AESA_VERSION */ + u32 mdha; /* MDHA_VERSION */ + u32 desa; /* DESA_VERSION */ + u32 snw8a; /* SNW8A_VERSION */ + u32 snw9a; /* SNW9A_VERSION */ + u32 zuce; /* ZUCE_VERSION */ + u32 zuca; /* ZUCA_VERSION */ + u32 ccha; /* CCHA_VERSION */ + u32 ptha; /* PTHA_VERSION */ + u32 rng; /* RNG_VERSION */ + u32 trng; /* TRNG_VERSION */ + u32 aaha; /* AAHA_VERSION */ + u32 rsvd[10]; + u32 sr; /* SR_VERSION */ + u32 dma; /* DMA_VERSION */ + u32 ai; /* AI_VERSION */ + u32 qi; /* QI_VERSION */ + u32 jr; /* JR_VERSION */ + u32 deco; /* DECO_VERSION */ +}; + +/* Version registers bitfields */ + +/* Number of CHAs instantiated */ +#define CHA_VER_NUM_MASK 0xffull +/* CHA Miscellaneous Information */ +#define CHA_VER_MISC_SHIFT 8 +#define CHA_VER_MISC_MASK (0xffull << CHA_VER_MISC_SHIFT) +/* CHA Revision Number */ +#define CHA_VER_REV_SHIFT 16 +#define CHA_VER_REV_MASK (0xffull << CHA_VER_REV_SHIFT) +/* CHA Version ID */ +#define CHA_VER_VID_SHIFT 24 +#define CHA_VER_VID_MASK (0xffull << CHA_VER_VID_SHIFT) + /* * caam_perfmon - Performance Monitor/Secure Memory Status/ * CAAM Global Status/Component Version IDs @@ -223,15 +265,13 @@ struct jr_outentry { #define CHA_NUM_MS_DECONUM_MASK (0xfull << CHA_NUM_MS_DECONUM_SHIFT) /* - * CHA version IDs / instantiation bitfields + * CHA version IDs / instantiation bitfields (< Era 10) * Defined for use with the cha_id fields in perfmon, but the same shift/mask * selectors can be used to pull out the number of instantiated blocks within * cha_num fields in perfmon because the locations are the same. */ #define CHA_ID_LS_AES_SHIFT 0 #define CHA_ID_LS_AES_MASK (0xfull << CHA_ID_LS_AES_SHIFT) -#define CHA_ID_LS_AES_LP (0x3ull << CHA_ID_LS_AES_SHIFT) -#define CHA_ID_LS_AES_HP (0x4ull << CHA_ID_LS_AES_SHIFT) #define CHA_ID_LS_DES_SHIFT 4 #define CHA_ID_LS_DES_MASK (0xfull << CHA_ID_LS_DES_SHIFT) @@ -241,9 +281,6 @@ struct jr_outentry { #define CHA_ID_LS_MD_SHIFT 12 #define CHA_ID_LS_MD_MASK (0xfull << CHA_ID_LS_MD_SHIFT) -#define CHA_ID_LS_MD_LP256 (0x0ull << CHA_ID_LS_MD_SHIFT) -#define CHA_ID_LS_MD_LP512 (0x1ull << CHA_ID_LS_MD_SHIFT) -#define CHA_ID_LS_MD_HP (0x2ull << CHA_ID_LS_MD_SHIFT) #define CHA_ID_LS_RNG_SHIFT 16 #define CHA_ID_LS_RNG_MASK (0xfull << CHA_ID_LS_RNG_SHIFT) @@ -269,6 +306,13 @@ struct jr_outentry { #define CHA_ID_MS_JR_SHIFT 28 #define CHA_ID_MS_JR_MASK (0xfull << CHA_ID_MS_JR_SHIFT) +/* Specific CHA version IDs */ +#define CHA_VER_VID_AES_LP 0x3ull +#define CHA_VER_VID_AES_HP 0x4ull +#define CHA_VER_VID_MD_LP256 0x0ull +#define CHA_VER_VID_MD_LP512 0x1ull +#define CHA_VER_VID_MD_HP 0x2ull + struct sec_vid { u16 ip_id; u8 maj_rev; @@ -479,8 +523,10 @@ struct caam_ctrl { struct rng4tst r4tst[2]; }; - u32 rsvd9[448]; + u32 rsvd9[416]; + /* Version registers - introduced with era 10 e80-eff */ + struct version_regs vreg; /* Performance Monitor f00-fff */ struct caam_perfmon perfmon; }; @@ -570,8 +616,10 @@ struct caam_job_ring { u32 rsvd11; u32 jrcommand; /* JRCRx - JobR command */ - u32 rsvd12[932]; + u32 rsvd12[900]; + /* Version registers - introduced with era 10 e80-eff */ + struct version_regs vreg; /* Performance Monitor f00-fff */ struct caam_perfmon perfmon; }; @@ -878,13 +926,19 @@ struct caam_deco { u32 rsvd29[48]; u32 descbuf[64]; /* DxDESB - Descriptor buffer */ u32 rscvd30[193]; -#define DESC_DBG_DECO_STAT_HOST_ERR 0x00D00000 #define DESC_DBG_DECO_STAT_VALID 0x80000000 #define DESC_DBG_DECO_STAT_MASK 0x00F00000 +#define DESC_DBG_DECO_STAT_SHIFT 20 u32 desc_dbg; /* DxDDR - DECO Debug Register */ - u32 rsvd31[126]; + u32 rsvd31[13]; +#define DESC_DER_DECO_STAT_MASK 0x000F0000 +#define DESC_DER_DECO_STAT_SHIFT 16 + u32 dbg_exec; /* DxDER - DECO Debug Exec Register */ + u32 rsvd32[112]; }; +#define DECO_STAT_HOST_ERR 0xD + #define DECO_JQCR_WHL 0x20000000 #define DECO_JQCR_FOUR 0x10000000 -- cgit v1.2.3-59-g8ed1b From c99d4a2454009d6fb51e03248fac7629c4d6a5ca Mon Sep 17 00:00:00 2001 From: Horia Geantă Date: Thu, 8 Nov 2018 15:36:28 +0200 Subject: crypto: caam/qi2 - add support for ChaCha20 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for ChaCha20 skcipher algorithm. Signed-off-by: Carmen Iorga Signed-off-by: Horia Geantă Signed-off-by: Herbert Xu --- drivers/crypto/caam/caamalg_desc.c | 6 ++++-- drivers/crypto/caam/caamalg_qi2.c | 27 +++++++++++++++++++++++++-- drivers/crypto/caam/compat.h | 1 + drivers/crypto/caam/desc.h | 6 ++++++ 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/caam/caamalg_desc.c b/drivers/crypto/caam/caamalg_desc.c index 1a6f0da14106..d850590079a2 100644 --- a/drivers/crypto/caam/caamalg_desc.c +++ b/drivers/crypto/caam/caamalg_desc.c @@ -1228,7 +1228,8 @@ static inline void skcipher_append_src_dst(u32 *desc) * @desc: pointer to buffer used for descriptor construction * @cdata: pointer to block cipher transform definitions * Valid algorithm values - one of OP_ALG_ALGSEL_{AES, DES, 3DES} ANDed - * with OP_ALG_AAI_CBC or OP_ALG_AAI_CTR_MOD128. + * with OP_ALG_AAI_CBC or OP_ALG_AAI_CTR_MOD128 + * - OP_ALG_ALGSEL_CHACHA20 * @ivsize: initialization vector size * @is_rfc3686: true when ctr(aes) is wrapped by rfc3686 template * @ctx1_iv_off: IV offset in CONTEXT1 register @@ -1293,7 +1294,8 @@ EXPORT_SYMBOL(cnstr_shdsc_skcipher_encap); * @desc: pointer to buffer used for descriptor construction * @cdata: pointer to block cipher transform definitions * Valid algorithm values - one of OP_ALG_ALGSEL_{AES, DES, 3DES} ANDed - * with OP_ALG_AAI_CBC or OP_ALG_AAI_CTR_MOD128. + * with OP_ALG_AAI_CBC or OP_ALG_AAI_CTR_MOD128 + * - OP_ALG_ALGSEL_CHACHA20 * @ivsize: initialization vector size * @is_rfc3686: true when ctr(aes) is wrapped by rfc3686 template * @ctx1_iv_off: IV offset in CONTEXT1 register diff --git a/drivers/crypto/caam/caamalg_qi2.c b/drivers/crypto/caam/caamalg_qi2.c index 7d8ac0222fa3..a9e264bb9629 100644 --- a/drivers/crypto/caam/caamalg_qi2.c +++ b/drivers/crypto/caam/caamalg_qi2.c @@ -816,7 +816,9 @@ static int skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key, u32 *desc; u32 ctx1_iv_off = 0; const bool ctr_mode = ((ctx->cdata.algtype & OP_ALG_AAI_MASK) == - OP_ALG_AAI_CTR_MOD128); + OP_ALG_AAI_CTR_MOD128) && + ((ctx->cdata.algtype & OP_ALG_ALGSEL_MASK) != + OP_ALG_ALGSEL_CHACHA20); const bool is_rfc3686 = alg->caam.rfc3686; print_hex_dump_debug("key in @" __stringify(__LINE__)": ", @@ -1494,7 +1496,23 @@ static struct caam_skcipher_alg driver_algs[] = { .ivsize = AES_BLOCK_SIZE, }, .caam.class1_alg_type = OP_ALG_ALGSEL_AES | OP_ALG_AAI_XTS, - } + }, + { + .skcipher = { + .base = { + .cra_name = "chacha20", + .cra_driver_name = "chacha20-caam-qi2", + .cra_blocksize = 1, + }, + .setkey = skcipher_setkey, + .encrypt = skcipher_encrypt, + .decrypt = skcipher_decrypt, + .min_keysize = CHACHA20_KEY_SIZE, + .max_keysize = CHACHA20_KEY_SIZE, + .ivsize = CHACHA20_IV_SIZE, + }, + .caam.class1_alg_type = OP_ALG_ALGSEL_CHACHA20, + }, }; static struct caam_aead_alg driver_aeads[] = { @@ -4908,6 +4926,11 @@ static int dpaa2_caam_probe(struct fsl_mc_device *dpseci_dev) alg_sel == OP_ALG_ALGSEL_AES) continue; + /* Skip CHACHA20 algorithms if not supported by device */ + if (alg_sel == OP_ALG_ALGSEL_CHACHA20 && + !priv->sec_attr.ccha_acc_num) + continue; + t_alg->caam.dev = dev; caam_skcipher_alg_init(t_alg); diff --git a/drivers/crypto/caam/compat.h b/drivers/crypto/caam/compat.h index 9604ff7a335e..a5081b4050b6 100644 --- a/drivers/crypto/caam/compat.h +++ b/drivers/crypto/caam/compat.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/crypto/caam/desc.h b/drivers/crypto/caam/desc.h index ec1ef06049b4..9d117e51629f 100644 --- a/drivers/crypto/caam/desc.h +++ b/drivers/crypto/caam/desc.h @@ -1159,6 +1159,7 @@ #define OP_ALG_ALGSEL_KASUMI (0x70 << OP_ALG_ALGSEL_SHIFT) #define OP_ALG_ALGSEL_CRC (0x90 << OP_ALG_ALGSEL_SHIFT) #define OP_ALG_ALGSEL_SNOW_F9 (0xA0 << OP_ALG_ALGSEL_SHIFT) +#define OP_ALG_ALGSEL_CHACHA20 (0xD0 << OP_ALG_ALGSEL_SHIFT) #define OP_ALG_AAI_SHIFT 4 #define OP_ALG_AAI_MASK (0x1ff << OP_ALG_AAI_SHIFT) @@ -1206,6 +1207,11 @@ #define OP_ALG_AAI_RNG4_AI (0x80 << OP_ALG_AAI_SHIFT) #define OP_ALG_AAI_RNG4_SK (0x100 << OP_ALG_AAI_SHIFT) +/* Chacha20 AAI set */ +#define OP_ALG_AAI_AEAD (0x002 << OP_ALG_AAI_SHIFT) +#define OP_ALG_AAI_KEYSTREAM (0x001 << OP_ALG_AAI_SHIFT) +#define OP_ALG_AAI_BC8 (0x008 << OP_ALG_AAI_SHIFT) + /* hmac/smac AAI set */ #define OP_ALG_AAI_HASH (0x00 << OP_ALG_AAI_SHIFT) #define OP_ALG_AAI_HMAC (0x01 << OP_ALG_AAI_SHIFT) -- cgit v1.2.3-59-g8ed1b From 193188e5512db5e84d2d9a7a6a157de651e78f3a Mon Sep 17 00:00:00 2001 From: Cristian Stoica Date: Thu, 8 Nov 2018 15:36:29 +0200 Subject: crypto: chacha20poly1305 - export CHACHAPOLY_IV_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move CHACHAPOLY_IV_SIZE to header file, so it can be reused. Signed-off-by: Cristian Stoica Signed-off-by: Horia Geantă Signed-off-by: Herbert Xu --- crypto/chacha20poly1305.c | 2 -- include/crypto/chacha20.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c index 600afa99941f..f9dd5453046a 100644 --- a/crypto/chacha20poly1305.c +++ b/crypto/chacha20poly1305.c @@ -22,8 +22,6 @@ #include "internal.h" -#define CHACHAPOLY_IV_SIZE 12 - struct chachapoly_instance_ctx { struct crypto_skcipher_spawn chacha; struct crypto_ahash_spawn poly; diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h index f76302d99e2b..2d3129442a52 100644 --- a/include/crypto/chacha20.h +++ b/include/crypto/chacha20.h @@ -13,6 +13,7 @@ #define CHACHA20_IV_SIZE 16 #define CHACHA20_KEY_SIZE 32 #define CHACHA20_BLOCK_SIZE 64 +#define CHACHAPOLY_IV_SIZE 12 struct chacha20_ctx { u32 key[8]; -- cgit v1.2.3-59-g8ed1b From d6bbd4eea243951d2543a0f427c9a6bf2835b6f5 Mon Sep 17 00:00:00 2001 From: Horia Geantă Date: Thu, 8 Nov 2018 15:36:30 +0200 Subject: crypto: caam/jr - add support for Chacha20 + Poly1305 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for Chacha20 + Poly1305 combined AEAD: -generic (rfc7539) -IPsec (rfc7634 - known as rfc7539esp in the kernel) Signed-off-by: Cristian Stoica Signed-off-by: Horia Geantă Signed-off-by: Herbert Xu --- drivers/crypto/caam/caamalg.c | 221 ++++++++++++++++++++++++++++++++++++- drivers/crypto/caam/caamalg_desc.c | 111 +++++++++++++++++++ drivers/crypto/caam/caamalg_desc.h | 4 + drivers/crypto/caam/compat.h | 1 + drivers/crypto/caam/desc.h | 15 +++ drivers/crypto/caam/desc_constr.h | 7 +- 6 files changed, 354 insertions(+), 5 deletions(-) diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c index 9f1414030bc2..cbaeb264a261 100644 --- a/drivers/crypto/caam/caamalg.c +++ b/drivers/crypto/caam/caamalg.c @@ -72,6 +72,8 @@ #define AUTHENC_DESC_JOB_IO_LEN (AEAD_DESC_JOB_IO_LEN + \ CAAM_CMD_SZ * 5) +#define CHACHAPOLY_DESC_JOB_IO_LEN (AEAD_DESC_JOB_IO_LEN + CAAM_CMD_SZ * 6) + #define DESC_MAX_USED_BYTES (CAAM_DESC_BYTES_MAX - DESC_JOB_IO_LEN) #define DESC_MAX_USED_LEN (DESC_MAX_USED_BYTES / CAAM_CMD_SZ) @@ -513,6 +515,61 @@ static int rfc4543_setauthsize(struct crypto_aead *authenc, return 0; } +static int chachapoly_set_sh_desc(struct crypto_aead *aead) +{ + struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct device *jrdev = ctx->jrdev; + unsigned int ivsize = crypto_aead_ivsize(aead); + u32 *desc; + + if (!ctx->cdata.keylen || !ctx->authsize) + return 0; + + desc = ctx->sh_desc_enc; + cnstr_shdsc_chachapoly(desc, &ctx->cdata, &ctx->adata, ivsize, + ctx->authsize, true); + dma_sync_single_for_device(jrdev, ctx->sh_desc_enc_dma, + desc_bytes(desc), ctx->dir); + + desc = ctx->sh_desc_dec; + cnstr_shdsc_chachapoly(desc, &ctx->cdata, &ctx->adata, ivsize, + ctx->authsize, false); + dma_sync_single_for_device(jrdev, ctx->sh_desc_dec_dma, + desc_bytes(desc), ctx->dir); + + return 0; +} + +static int chachapoly_setauthsize(struct crypto_aead *aead, + unsigned int authsize) +{ + struct caam_ctx *ctx = crypto_aead_ctx(aead); + + if (authsize != POLY1305_DIGEST_SIZE) + return -EINVAL; + + ctx->authsize = authsize; + return chachapoly_set_sh_desc(aead); +} + +static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key, + unsigned int keylen) +{ + struct caam_ctx *ctx = crypto_aead_ctx(aead); + unsigned int ivsize = crypto_aead_ivsize(aead); + unsigned int saltlen = CHACHAPOLY_IV_SIZE - ivsize; + + if (keylen != CHACHA20_KEY_SIZE + saltlen) { + crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + ctx->cdata.key_virt = key; + ctx->cdata.keylen = keylen - saltlen; + + return chachapoly_set_sh_desc(aead); +} + static int aead_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { @@ -1031,6 +1088,40 @@ static void init_gcm_job(struct aead_request *req, /* End of blank commands */ } +static void init_chachapoly_job(struct aead_request *req, + struct aead_edesc *edesc, bool all_contig, + bool encrypt) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + unsigned int ivsize = crypto_aead_ivsize(aead); + unsigned int assoclen = req->assoclen; + u32 *desc = edesc->hw_desc; + u32 ctx_iv_off = 4; + + init_aead_job(req, edesc, all_contig, encrypt); + + if (ivsize != CHACHAPOLY_IV_SIZE) { + /* IPsec specific: CONTEXT1[223:128] = {NONCE, IV} */ + ctx_iv_off += 4; + + /* + * The associated data comes already with the IV but we need + * to skip it when we authenticate or encrypt... + */ + assoclen -= ivsize; + } + + append_math_add_imm_u32(desc, REG3, ZERO, IMM, assoclen); + + /* + * For IPsec load the IV further in the same register. + * For RFC7539 simply load the 12 bytes nonce in a single operation + */ + append_load_as_imm(desc, req->iv, ivsize, LDST_CLASS_1_CCB | + LDST_SRCDST_BYTE_CONTEXT | + ctx_iv_off << LDST_OFFSET_SHIFT); +} + static void init_authenc_job(struct aead_request *req, struct aead_edesc *edesc, bool all_contig, bool encrypt) @@ -1289,6 +1380,72 @@ static int gcm_encrypt(struct aead_request *req) return ret; } +static int chachapoly_encrypt(struct aead_request *req) +{ + struct aead_edesc *edesc; + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct device *jrdev = ctx->jrdev; + bool all_contig; + u32 *desc; + int ret; + + edesc = aead_edesc_alloc(req, CHACHAPOLY_DESC_JOB_IO_LEN, &all_contig, + true); + if (IS_ERR(edesc)) + return PTR_ERR(edesc); + + desc = edesc->hw_desc; + + init_chachapoly_job(req, edesc, all_contig, true); + print_hex_dump_debug("chachapoly jobdesc@" __stringify(__LINE__)": ", + DUMP_PREFIX_ADDRESS, 16, 4, desc, desc_bytes(desc), + 1); + + ret = caam_jr_enqueue(jrdev, desc, aead_encrypt_done, req); + if (!ret) { + ret = -EINPROGRESS; + } else { + aead_unmap(jrdev, edesc, req); + kfree(edesc); + } + + return ret; +} + +static int chachapoly_decrypt(struct aead_request *req) +{ + struct aead_edesc *edesc; + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct caam_ctx *ctx = crypto_aead_ctx(aead); + struct device *jrdev = ctx->jrdev; + bool all_contig; + u32 *desc; + int ret; + + edesc = aead_edesc_alloc(req, CHACHAPOLY_DESC_JOB_IO_LEN, &all_contig, + false); + if (IS_ERR(edesc)) + return PTR_ERR(edesc); + + desc = edesc->hw_desc; + + init_chachapoly_job(req, edesc, all_contig, false); + print_hex_dump_debug("chachapoly jobdesc@" __stringify(__LINE__)": ", + DUMP_PREFIX_ADDRESS, 16, 4, desc, desc_bytes(desc), + 1); + + ret = caam_jr_enqueue(jrdev, desc, aead_decrypt_done, req); + if (!ret) { + ret = -EINPROGRESS; + } else { + aead_unmap(jrdev, edesc, req); + kfree(edesc); + } + + return ret; +} + static int ipsec_gcm_encrypt(struct aead_request *req) { if (req->assoclen < 8) @@ -3002,6 +3159,50 @@ static struct caam_aead_alg driver_aeads[] = { .geniv = true, }, }, + { + .aead = { + .base = { + .cra_name = "rfc7539(chacha20,poly1305)", + .cra_driver_name = "rfc7539-chacha20-poly1305-" + "caam", + .cra_blocksize = 1, + }, + .setkey = chachapoly_setkey, + .setauthsize = chachapoly_setauthsize, + .encrypt = chachapoly_encrypt, + .decrypt = chachapoly_decrypt, + .ivsize = CHACHAPOLY_IV_SIZE, + .maxauthsize = POLY1305_DIGEST_SIZE, + }, + .caam = { + .class1_alg_type = OP_ALG_ALGSEL_CHACHA20 | + OP_ALG_AAI_AEAD, + .class2_alg_type = OP_ALG_ALGSEL_POLY1305 | + OP_ALG_AAI_AEAD, + }, + }, + { + .aead = { + .base = { + .cra_name = "rfc7539esp(chacha20,poly1305)", + .cra_driver_name = "rfc7539esp-chacha20-" + "poly1305-caam", + .cra_blocksize = 1, + }, + .setkey = chachapoly_setkey, + .setauthsize = chachapoly_setauthsize, + .encrypt = chachapoly_encrypt, + .decrypt = chachapoly_decrypt, + .ivsize = 8, + .maxauthsize = POLY1305_DIGEST_SIZE, + }, + .caam = { + .class1_alg_type = OP_ALG_ALGSEL_CHACHA20 | + OP_ALG_AAI_AEAD, + .class2_alg_type = OP_ALG_ALGSEL_POLY1305 | + OP_ALG_AAI_AEAD, + }, + }, }; static int caam_init_common(struct caam_ctx *ctx, struct caam_alg_entry *caam, @@ -3135,7 +3336,7 @@ static int __init caam_algapi_init(void) struct device *ctrldev; struct caam_drv_private *priv; int i = 0, err = 0; - u32 aes_vid, aes_inst, des_inst, md_vid, md_inst; + u32 aes_vid, aes_inst, des_inst, md_vid, md_inst, ccha_inst, ptha_inst; unsigned int md_limit = SHA512_DIGEST_SIZE; bool registered = false; @@ -3180,6 +3381,8 @@ static int __init caam_algapi_init(void) CHA_ID_LS_DES_SHIFT; aes_inst = cha_inst & CHA_ID_LS_AES_MASK; md_inst = (cha_inst & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT; + ccha_inst = 0; + ptha_inst = 0; } else { u32 aesa, mdha; @@ -3192,6 +3395,8 @@ static int __init caam_algapi_init(void) des_inst = rd_reg32(&priv->ctrl->vreg.desa) & CHA_VER_NUM_MASK; aes_inst = aesa & CHA_VER_NUM_MASK; md_inst = mdha & CHA_VER_NUM_MASK; + ccha_inst = rd_reg32(&priv->ctrl->vreg.ccha) & CHA_VER_NUM_MASK; + ptha_inst = rd_reg32(&priv->ctrl->vreg.ptha) & CHA_VER_NUM_MASK; } /* If MD is present, limit digest size based on LP256 */ @@ -3252,6 +3457,14 @@ static int __init caam_algapi_init(void) if (!aes_inst && (c1_alg_sel == OP_ALG_ALGSEL_AES)) continue; + /* Skip CHACHA20 algorithms if not supported by device */ + if (c1_alg_sel == OP_ALG_ALGSEL_CHACHA20 && !ccha_inst) + continue; + + /* Skip POLY1305 algorithms if not supported by device */ + if (c2_alg_sel == OP_ALG_ALGSEL_POLY1305 && !ptha_inst) + continue; + /* * Check support for AES algorithms not available * on LP devices. @@ -3263,9 +3476,9 @@ static int __init caam_algapi_init(void) * Skip algorithms requiring message digests * if MD or MD size is not supported by device. */ - if (c2_alg_sel && - (!md_inst || (t_alg->aead.maxauthsize > md_limit))) - continue; + if ((c2_alg_sel & ~OP_ALG_ALGSEL_SUBMASK) == 0x40 && + (!md_inst || t_alg->aead.maxauthsize > md_limit)) + continue; caam_aead_alg_init(t_alg); diff --git a/drivers/crypto/caam/caamalg_desc.c b/drivers/crypto/caam/caamalg_desc.c index d850590079a2..0eb2add7e4e2 100644 --- a/drivers/crypto/caam/caamalg_desc.c +++ b/drivers/crypto/caam/caamalg_desc.c @@ -1213,6 +1213,117 @@ void cnstr_shdsc_rfc4543_decap(u32 * const desc, struct alginfo *cdata, } EXPORT_SYMBOL(cnstr_shdsc_rfc4543_decap); +/** + * cnstr_shdsc_chachapoly - Chacha20 + Poly1305 generic AEAD (rfc7539) and + * IPsec ESP (rfc7634, a.k.a. rfc7539esp) shared + * descriptor (non-protocol). + * @desc: pointer to buffer used for descriptor construction + * @cdata: pointer to block cipher transform definitions + * Valid algorithm values - OP_ALG_ALGSEL_CHACHA20 ANDed with + * OP_ALG_AAI_AEAD. + * @adata: pointer to authentication transform definitions + * Valid algorithm values - OP_ALG_ALGSEL_POLY1305 ANDed with + * OP_ALG_AAI_AEAD. + * @ivsize: initialization vector size + * @icvsize: integrity check value (ICV) size (truncated or full) + * @encap: true if encapsulation, false if decapsulation + */ +void cnstr_shdsc_chachapoly(u32 * const desc, struct alginfo *cdata, + struct alginfo *adata, unsigned int ivsize, + unsigned int icvsize, const bool encap) +{ + u32 *key_jump_cmd, *wait_cmd; + u32 nfifo; + const bool is_ipsec = (ivsize != CHACHAPOLY_IV_SIZE); + + /* Note: Context registers are saved. */ + init_sh_desc(desc, HDR_SHARE_SERIAL | HDR_SAVECTX); + + /* skip key loading if they are loaded due to sharing */ + key_jump_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL | + JUMP_COND_SHRD); + + append_key_as_imm(desc, cdata->key_virt, cdata->keylen, cdata->keylen, + CLASS_1 | KEY_DEST_CLASS_REG); + + /* For IPsec load the salt from keymat in the context register */ + if (is_ipsec) + append_load_as_imm(desc, cdata->key_virt + cdata->keylen, 4, + LDST_CLASS_1_CCB | LDST_SRCDST_BYTE_CONTEXT | + 4 << LDST_OFFSET_SHIFT); + + set_jump_tgt_here(desc, key_jump_cmd); + + /* Class 2 and 1 operations: Poly & ChaCha */ + if (encap) { + append_operation(desc, adata->algtype | OP_ALG_AS_INITFINAL | + OP_ALG_ENCRYPT); + append_operation(desc, cdata->algtype | OP_ALG_AS_INITFINAL | + OP_ALG_ENCRYPT); + } else { + append_operation(desc, adata->algtype | OP_ALG_AS_INITFINAL | + OP_ALG_DECRYPT | OP_ALG_ICV_ON); + append_operation(desc, cdata->algtype | OP_ALG_AS_INITFINAL | + OP_ALG_DECRYPT); + } + + /* + * MAGIC with NFIFO + * Read associated data from the input and send them to class1 and + * class2 alignment blocks. From class1 send data to output fifo and + * then write it to memory since we don't need to encrypt AD. + */ + nfifo = NFIFOENTRY_DEST_BOTH | NFIFOENTRY_FC1 | NFIFOENTRY_FC2 | + NFIFOENTRY_DTYPE_POLY | NFIFOENTRY_BND; + append_load_imm_u32(desc, nfifo, LDST_CLASS_IND_CCB | + LDST_SRCDST_WORD_INFO_FIFO_SM | LDLEN_MATH3); + + append_math_add(desc, VARSEQINLEN, ZERO, REG3, CAAM_CMD_SZ); + append_math_add(desc, VARSEQOUTLEN, ZERO, REG3, CAAM_CMD_SZ); + append_seq_fifo_load(desc, 0, FIFOLD_TYPE_NOINFOFIFO | + FIFOLD_CLASS_CLASS1 | LDST_VLF); + append_move_len(desc, MOVE_AUX_LS | MOVE_SRC_AUX_ABLK | + MOVE_DEST_OUTFIFO | MOVELEN_MRSEL_MATH3); + append_seq_fifo_store(desc, 0, FIFOST_TYPE_MESSAGE_DATA | LDST_VLF); + + /* IPsec - copy IV at the output */ + if (is_ipsec) + append_seq_fifo_store(desc, ivsize, FIFOST_TYPE_METADATA | + 0x2 << 25); + + wait_cmd = append_jump(desc, JUMP_JSL | JUMP_TYPE_LOCAL | + JUMP_COND_NOP | JUMP_TEST_ALL); + set_jump_tgt_here(desc, wait_cmd); + + if (encap) { + /* Read and write cryptlen bytes */ + append_math_add(desc, VARSEQINLEN, SEQINLEN, REG0, CAAM_CMD_SZ); + append_math_add(desc, VARSEQOUTLEN, SEQINLEN, REG0, + CAAM_CMD_SZ); + aead_append_src_dst(desc, FIFOLD_TYPE_MSG1OUT2); + + /* Write ICV */ + append_seq_store(desc, icvsize, LDST_CLASS_2_CCB | + LDST_SRCDST_BYTE_CONTEXT); + } else { + /* Read and write cryptlen bytes */ + append_math_add(desc, VARSEQINLEN, SEQOUTLEN, REG0, + CAAM_CMD_SZ); + append_math_add(desc, VARSEQOUTLEN, SEQOUTLEN, REG0, + CAAM_CMD_SZ); + aead_append_src_dst(desc, FIFOLD_TYPE_MSG); + + /* Load ICV for verification */ + append_seq_fifo_load(desc, icvsize, FIFOLD_CLASS_CLASS2 | + FIFOLD_TYPE_LAST2 | FIFOLD_TYPE_ICV); + } + + print_hex_dump_debug("chachapoly shdesc@" __stringify(__LINE__)": ", + DUMP_PREFIX_ADDRESS, 16, 4, desc, desc_bytes(desc), + 1); +} +EXPORT_SYMBOL(cnstr_shdsc_chachapoly); + /* For skcipher encrypt and decrypt, read from req->src and write to req->dst */ static inline void skcipher_append_src_dst(u32 *desc) { diff --git a/drivers/crypto/caam/caamalg_desc.h b/drivers/crypto/caam/caamalg_desc.h index 1315c8f6f951..a1a7b0e6889d 100644 --- a/drivers/crypto/caam/caamalg_desc.h +++ b/drivers/crypto/caam/caamalg_desc.h @@ -96,6 +96,10 @@ void cnstr_shdsc_rfc4543_decap(u32 * const desc, struct alginfo *cdata, unsigned int ivsize, unsigned int icvsize, const bool is_qi); +void cnstr_shdsc_chachapoly(u32 * const desc, struct alginfo *cdata, + struct alginfo *adata, unsigned int ivsize, + unsigned int icvsize, const bool encap); + void cnstr_shdsc_skcipher_encap(u32 * const desc, struct alginfo *cdata, unsigned int ivsize, const bool is_rfc3686, const u32 ctx1_iv_off); diff --git a/drivers/crypto/caam/compat.h b/drivers/crypto/caam/compat.h index a5081b4050b6..8bde903f9f4a 100644 --- a/drivers/crypto/caam/compat.h +++ b/drivers/crypto/caam/compat.h @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/crypto/caam/desc.h b/drivers/crypto/caam/desc.h index 9d117e51629f..ec10230178c5 100644 --- a/drivers/crypto/caam/desc.h +++ b/drivers/crypto/caam/desc.h @@ -243,6 +243,7 @@ #define LDST_SRCDST_WORD_DESCBUF_SHARED (0x42 << LDST_SRCDST_SHIFT) #define LDST_SRCDST_WORD_DESCBUF_JOB_WE (0x45 << LDST_SRCDST_SHIFT) #define LDST_SRCDST_WORD_DESCBUF_SHARED_WE (0x46 << LDST_SRCDST_SHIFT) +#define LDST_SRCDST_WORD_INFO_FIFO_SM (0x71 << LDST_SRCDST_SHIFT) #define LDST_SRCDST_WORD_INFO_FIFO (0x7a << LDST_SRCDST_SHIFT) /* Offset in source/destination */ @@ -285,6 +286,12 @@ #define LDLEN_SET_OFIFO_OFFSET_SHIFT 0 #define LDLEN_SET_OFIFO_OFFSET_MASK (3 << LDLEN_SET_OFIFO_OFFSET_SHIFT) +/* Special Length definitions when dst=sm, nfifo-{sm,m} */ +#define LDLEN_MATH0 0 +#define LDLEN_MATH1 1 +#define LDLEN_MATH2 2 +#define LDLEN_MATH3 3 + /* * FIFO_LOAD/FIFO_STORE/SEQ_FIFO_LOAD/SEQ_FIFO_STORE * Command Constructs @@ -409,6 +416,7 @@ #define FIFOST_TYPE_MESSAGE_DATA (0x30 << FIFOST_TYPE_SHIFT) #define FIFOST_TYPE_RNGSTORE (0x34 << FIFOST_TYPE_SHIFT) #define FIFOST_TYPE_RNGFIFO (0x35 << FIFOST_TYPE_SHIFT) +#define FIFOST_TYPE_METADATA (0x3e << FIFOST_TYPE_SHIFT) #define FIFOST_TYPE_SKIP (0x3f << FIFOST_TYPE_SHIFT) /* @@ -1160,6 +1168,7 @@ #define OP_ALG_ALGSEL_CRC (0x90 << OP_ALG_ALGSEL_SHIFT) #define OP_ALG_ALGSEL_SNOW_F9 (0xA0 << OP_ALG_ALGSEL_SHIFT) #define OP_ALG_ALGSEL_CHACHA20 (0xD0 << OP_ALG_ALGSEL_SHIFT) +#define OP_ALG_ALGSEL_POLY1305 (0xE0 << OP_ALG_ALGSEL_SHIFT) #define OP_ALG_AAI_SHIFT 4 #define OP_ALG_AAI_MASK (0x1ff << OP_ALG_AAI_SHIFT) @@ -1400,6 +1409,7 @@ #define MOVE_SRC_MATH3 (0x07 << MOVE_SRC_SHIFT) #define MOVE_SRC_INFIFO (0x08 << MOVE_SRC_SHIFT) #define MOVE_SRC_INFIFO_CL (0x09 << MOVE_SRC_SHIFT) +#define MOVE_SRC_AUX_ABLK (0x0a << MOVE_SRC_SHIFT) #define MOVE_DEST_SHIFT 16 #define MOVE_DEST_MASK (0x0f << MOVE_DEST_SHIFT) @@ -1426,6 +1436,10 @@ #define MOVELEN_MRSEL_SHIFT 0 #define MOVELEN_MRSEL_MASK (0x3 << MOVE_LEN_SHIFT) +#define MOVELEN_MRSEL_MATH0 (0 << MOVELEN_MRSEL_SHIFT) +#define MOVELEN_MRSEL_MATH1 (1 << MOVELEN_MRSEL_SHIFT) +#define MOVELEN_MRSEL_MATH2 (2 << MOVELEN_MRSEL_SHIFT) +#define MOVELEN_MRSEL_MATH3 (3 << MOVELEN_MRSEL_SHIFT) /* * MATH Command Constructs @@ -1602,6 +1616,7 @@ #define NFIFOENTRY_DTYPE_IV (0x2 << NFIFOENTRY_DTYPE_SHIFT) #define NFIFOENTRY_DTYPE_SAD (0x3 << NFIFOENTRY_DTYPE_SHIFT) #define NFIFOENTRY_DTYPE_ICV (0xA << NFIFOENTRY_DTYPE_SHIFT) +#define NFIFOENTRY_DTYPE_POLY (0xB << NFIFOENTRY_DTYPE_SHIFT) #define NFIFOENTRY_DTYPE_SKIP (0xE << NFIFOENTRY_DTYPE_SHIFT) #define NFIFOENTRY_DTYPE_MSG (0xF << NFIFOENTRY_DTYPE_SHIFT) diff --git a/drivers/crypto/caam/desc_constr.h b/drivers/crypto/caam/desc_constr.h index d4256fa4a1d6..2980b8ef1fb1 100644 --- a/drivers/crypto/caam/desc_constr.h +++ b/drivers/crypto/caam/desc_constr.h @@ -189,6 +189,7 @@ static inline u32 *append_##cmd(u32 * const desc, u32 options) \ } APPEND_CMD_RET(jump, JUMP) APPEND_CMD_RET(move, MOVE) +APPEND_CMD_RET(move_len, MOVE_LEN) static inline void set_jump_tgt_here(u32 * const desc, u32 *jump_cmd) { @@ -327,7 +328,11 @@ static inline void append_##cmd##_imm_##type(u32 * const desc, type immediate, \ u32 options) \ { \ PRINT_POS; \ - append_cmd(desc, CMD_##op | IMMEDIATE | options | sizeof(type)); \ + if (options & LDST_LEN_MASK) \ + append_cmd(desc, CMD_##op | IMMEDIATE | options); \ + else \ + append_cmd(desc, CMD_##op | IMMEDIATE | options | \ + sizeof(type)); \ append_cmd(desc, immediate); \ } APPEND_CMD_RAW_IMM(load, LOAD, u32); -- cgit v1.2.3-59-g8ed1b From c10a53367901b36eec6208f1dbaf53da9fd358bb Mon Sep 17 00:00:00 2001 From: Horia Geantă Date: Thu, 8 Nov 2018 15:36:31 +0200 Subject: crypto: caam/qi2 - add support for Chacha20 + Poly1305 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for Chacha20 + Poly1305 combined AEAD: -generic (rfc7539) -IPsec (rfc7634 - known as rfc7539esp in the kernel) Signed-off-by: Horia Geantă Signed-off-by: Herbert Xu --- drivers/crypto/caam/caamalg.c | 4 +- drivers/crypto/caam/caamalg_desc.c | 24 ++++++- drivers/crypto/caam/caamalg_desc.h | 3 +- drivers/crypto/caam/caamalg_qi2.c | 129 ++++++++++++++++++++++++++++++++++++- 4 files changed, 154 insertions(+), 6 deletions(-) diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c index cbaeb264a261..523565ce0060 100644 --- a/drivers/crypto/caam/caamalg.c +++ b/drivers/crypto/caam/caamalg.c @@ -527,13 +527,13 @@ static int chachapoly_set_sh_desc(struct crypto_aead *aead) desc = ctx->sh_desc_enc; cnstr_shdsc_chachapoly(desc, &ctx->cdata, &ctx->adata, ivsize, - ctx->authsize, true); + ctx->authsize, true, false); dma_sync_single_for_device(jrdev, ctx->sh_desc_enc_dma, desc_bytes(desc), ctx->dir); desc = ctx->sh_desc_dec; cnstr_shdsc_chachapoly(desc, &ctx->cdata, &ctx->adata, ivsize, - ctx->authsize, false); + ctx->authsize, false, false); dma_sync_single_for_device(jrdev, ctx->sh_desc_dec_dma, desc_bytes(desc), ctx->dir); diff --git a/drivers/crypto/caam/caamalg_desc.c b/drivers/crypto/caam/caamalg_desc.c index 0eb2add7e4e2..7db1640d3577 100644 --- a/drivers/crypto/caam/caamalg_desc.c +++ b/drivers/crypto/caam/caamalg_desc.c @@ -1227,10 +1227,12 @@ EXPORT_SYMBOL(cnstr_shdsc_rfc4543_decap); * @ivsize: initialization vector size * @icvsize: integrity check value (ICV) size (truncated or full) * @encap: true if encapsulation, false if decapsulation + * @is_qi: true when called from caam/qi */ void cnstr_shdsc_chachapoly(u32 * const desc, struct alginfo *cdata, struct alginfo *adata, unsigned int ivsize, - unsigned int icvsize, const bool encap) + unsigned int icvsize, const bool encap, + const bool is_qi) { u32 *key_jump_cmd, *wait_cmd; u32 nfifo; @@ -1267,6 +1269,26 @@ void cnstr_shdsc_chachapoly(u32 * const desc, struct alginfo *cdata, OP_ALG_DECRYPT); } + if (is_qi) { + u32 *wait_load_cmd; + u32 ctx1_iv_off = is_ipsec ? 8 : 4; + + /* REG3 = assoclen */ + append_seq_load(desc, 4, LDST_CLASS_DECO | + LDST_SRCDST_WORD_DECO_MATH3 | + 4 << LDST_OFFSET_SHIFT); + + wait_load_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL | + JUMP_COND_CALM | JUMP_COND_NCP | + JUMP_COND_NOP | JUMP_COND_NIP | + JUMP_COND_NIFP); + set_jump_tgt_here(desc, wait_load_cmd); + + append_seq_load(desc, ivsize, LDST_CLASS_1_CCB | + LDST_SRCDST_BYTE_CONTEXT | + ctx1_iv_off << LDST_OFFSET_SHIFT); + } + /* * MAGIC with NFIFO * Read associated data from the input and send them to class1 and diff --git a/drivers/crypto/caam/caamalg_desc.h b/drivers/crypto/caam/caamalg_desc.h index a1a7b0e6889d..d5ca42ff961a 100644 --- a/drivers/crypto/caam/caamalg_desc.h +++ b/drivers/crypto/caam/caamalg_desc.h @@ -98,7 +98,8 @@ void cnstr_shdsc_rfc4543_decap(u32 * const desc, struct alginfo *cdata, void cnstr_shdsc_chachapoly(u32 * const desc, struct alginfo *cdata, struct alginfo *adata, unsigned int ivsize, - unsigned int icvsize, const bool encap); + unsigned int icvsize, const bool encap, + const bool is_qi); void cnstr_shdsc_skcipher_encap(u32 * const desc, struct alginfo *cdata, unsigned int ivsize, const bool is_rfc3686, diff --git a/drivers/crypto/caam/caamalg_qi2.c b/drivers/crypto/caam/caamalg_qi2.c index a9e264bb9629..2598640aa98b 100644 --- a/drivers/crypto/caam/caamalg_qi2.c +++ b/drivers/crypto/caam/caamalg_qi2.c @@ -462,7 +462,15 @@ static struct aead_edesc *aead_edesc_alloc(struct aead_request *req, edesc->dst_nents = dst_nents; edesc->iv_dma = iv_dma; - edesc->assoclen = cpu_to_caam32(req->assoclen); + if ((alg->caam.class1_alg_type & OP_ALG_ALGSEL_MASK) == + OP_ALG_ALGSEL_CHACHA20 && ivsize != CHACHAPOLY_IV_SIZE) + /* + * The associated data comes already with the IV but we need + * to skip it when we authenticate or encrypt... + */ + edesc->assoclen = cpu_to_caam32(req->assoclen - ivsize); + else + edesc->assoclen = cpu_to_caam32(req->assoclen); edesc->assoclen_dma = dma_map_single(dev, &edesc->assoclen, 4, DMA_TO_DEVICE); if (dma_mapping_error(dev, edesc->assoclen_dma)) { @@ -532,6 +540,68 @@ static struct aead_edesc *aead_edesc_alloc(struct aead_request *req, return edesc; } +static int chachapoly_set_sh_desc(struct crypto_aead *aead) +{ + struct caam_ctx *ctx = crypto_aead_ctx(aead); + unsigned int ivsize = crypto_aead_ivsize(aead); + struct device *dev = ctx->dev; + struct caam_flc *flc; + u32 *desc; + + if (!ctx->cdata.keylen || !ctx->authsize) + return 0; + + flc = &ctx->flc[ENCRYPT]; + desc = flc->sh_desc; + cnstr_shdsc_chachapoly(desc, &ctx->cdata, &ctx->adata, ivsize, + ctx->authsize, true, true); + flc->flc[1] = cpu_to_caam32(desc_len(desc)); /* SDL */ + dma_sync_single_for_device(dev, ctx->flc_dma[ENCRYPT], + sizeof(flc->flc) + desc_bytes(desc), + ctx->dir); + + flc = &ctx->flc[DECRYPT]; + desc = flc->sh_desc; + cnstr_shdsc_chachapoly(desc, &ctx->cdata, &ctx->adata, ivsize, + ctx->authsize, false, true); + flc->flc[1] = cpu_to_caam32(desc_len(desc)); /* SDL */ + dma_sync_single_for_device(dev, ctx->flc_dma[DECRYPT], + sizeof(flc->flc) + desc_bytes(desc), + ctx->dir); + + return 0; +} + +static int chachapoly_setauthsize(struct crypto_aead *aead, + unsigned int authsize) +{ + struct caam_ctx *ctx = crypto_aead_ctx(aead); + + if (authsize != POLY1305_DIGEST_SIZE) + return -EINVAL; + + ctx->authsize = authsize; + return chachapoly_set_sh_desc(aead); +} + +static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key, + unsigned int keylen) +{ + struct caam_ctx *ctx = crypto_aead_ctx(aead); + unsigned int ivsize = crypto_aead_ivsize(aead); + unsigned int saltlen = CHACHAPOLY_IV_SIZE - ivsize; + + if (keylen != CHACHA20_KEY_SIZE + saltlen) { + crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + ctx->cdata.key_virt = key; + ctx->cdata.keylen = keylen - saltlen; + + return chachapoly_set_sh_desc(aead); +} + static int gcm_set_sh_desc(struct crypto_aead *aead) { struct caam_ctx *ctx = crypto_aead_ctx(aead); @@ -2626,6 +2696,50 @@ static struct caam_aead_alg driver_aeads[] = { .geniv = true, }, }, + { + .aead = { + .base = { + .cra_name = "rfc7539(chacha20,poly1305)", + .cra_driver_name = "rfc7539-chacha20-poly1305-" + "caam-qi2", + .cra_blocksize = 1, + }, + .setkey = chachapoly_setkey, + .setauthsize = chachapoly_setauthsize, + .encrypt = aead_encrypt, + .decrypt = aead_decrypt, + .ivsize = CHACHAPOLY_IV_SIZE, + .maxauthsize = POLY1305_DIGEST_SIZE, + }, + .caam = { + .class1_alg_type = OP_ALG_ALGSEL_CHACHA20 | + OP_ALG_AAI_AEAD, + .class2_alg_type = OP_ALG_ALGSEL_POLY1305 | + OP_ALG_AAI_AEAD, + }, + }, + { + .aead = { + .base = { + .cra_name = "rfc7539esp(chacha20,poly1305)", + .cra_driver_name = "rfc7539esp-chacha20-" + "poly1305-caam-qi2", + .cra_blocksize = 1, + }, + .setkey = chachapoly_setkey, + .setauthsize = chachapoly_setauthsize, + .encrypt = aead_encrypt, + .decrypt = aead_decrypt, + .ivsize = 8, + .maxauthsize = POLY1305_DIGEST_SIZE, + }, + .caam = { + .class1_alg_type = OP_ALG_ALGSEL_CHACHA20 | + OP_ALG_AAI_AEAD, + .class2_alg_type = OP_ALG_ALGSEL_POLY1305 | + OP_ALG_AAI_AEAD, + }, + }, { .aead = { .base = { @@ -4963,11 +5077,22 @@ static int dpaa2_caam_probe(struct fsl_mc_device *dpseci_dev) c1_alg_sel == OP_ALG_ALGSEL_AES) continue; + /* Skip CHACHA20 algorithms if not supported by device */ + if (c1_alg_sel == OP_ALG_ALGSEL_CHACHA20 && + !priv->sec_attr.ccha_acc_num) + continue; + + /* Skip POLY1305 algorithms if not supported by device */ + if (c2_alg_sel == OP_ALG_ALGSEL_POLY1305 && + !priv->sec_attr.ptha_acc_num) + continue; + /* * Skip algorithms requiring message digests * if MD not supported by device. */ - if (!priv->sec_attr.md_acc_num && c2_alg_sel) + if ((c2_alg_sel & ~OP_ALG_ALGSEL_SUBMASK) == 0x40 && + !priv->sec_attr.md_acc_num) continue; t_alg->caam.dev = dev; -- cgit v1.2.3-59-g8ed1b From 8ddab428730d66e86ebe90aef15d5f7c5c45fe0d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 9 Nov 2018 13:16:39 +0000 Subject: padata: clean an indentation issue, remove extraneous space Trivial fix to clean up an indentation issue Signed-off-by: Colin Ian King Signed-off-by: Herbert Xu --- kernel/padata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/padata.c b/kernel/padata.c index d568cc56405f..3e2633ae3bca 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -720,7 +720,7 @@ int padata_start(struct padata_instance *pinst) if (pinst->flags & PADATA_INVALID) err = -EINVAL; - __padata_start(pinst); + __padata_start(pinst); mutex_unlock(&pinst->lock); -- cgit v1.2.3-59-g8ed1b From 05ba88468b7d85c50d0945446248e0fcda7536bb Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Sat, 10 Nov 2018 15:51:16 +0100 Subject: hwrng: bcm2835 - Switch to SPDX identifier Adopt the SPDX license identifier headers to ease license compliance management. While we are at this fix the comment style, too. Cc: Lubomir Rintel Signed-off-by: Stefan Wahren Acked-by: Greg Kroah-Hartman Reviewed-by: Eric Anholt Acked-by: Lubomir Rintel Signed-off-by: Herbert Xu --- drivers/char/hw_random/bcm2835-rng.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/char/hw_random/bcm2835-rng.c b/drivers/char/hw_random/bcm2835-rng.c index 6767d965c36c..256b0b1d0f26 100644 --- a/drivers/char/hw_random/bcm2835-rng.c +++ b/drivers/char/hw_random/bcm2835-rng.c @@ -1,10 +1,7 @@ -/** +// SPDX-License-Identifier: GPL-2.0 +/* * Copyright (c) 2010-2012 Broadcom. All rights reserved. * Copyright (c) 2013 Lubomir Rintel - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License ("GPL") - * version 2, as published by the Free Software Foundation. */ #include -- cgit v1.2.3-59-g8ed1b From e4e72063d3c0ee9ba10faeb5645dcdaae2d733e9 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Sun, 11 Nov 2018 10:36:25 +0100 Subject: crypto: x86/chacha20 - Support partial lengths in 1-block SSSE3 variant Add a length argument to the single block function for SSSE3, so the block function may XOR only a partial length of the full block. Given that the setup code is rather cheap, the function does not process more than one block; this allows us to keep the block function selection in the C glue code. The required branching does not negatively affect performance for full block sizes. The partial XORing uses simple "rep movsb" to copy the data before and after doing XOR in SSE. This is rather efficient on modern processors; movsw can be slightly faster, but the additional complexity is probably not worth it. Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20-ssse3-x86_64.S | 74 ++++++++++++++++++++++++++------- arch/x86/crypto/chacha20_glue.c | 11 ++--- 2 files changed, 63 insertions(+), 22 deletions(-) diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S index 512a2b500fd1..98d130b5e4ab 100644 --- a/arch/x86/crypto/chacha20-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -25,12 +25,13 @@ CTRINC: .octa 0x00000003000000020000000100000000 ENTRY(chacha20_block_xor_ssse3) # %rdi: Input state matrix, s - # %rsi: 1 data block output, o - # %rdx: 1 data block input, i + # %rsi: up to 1 data block output, o + # %rdx: up to 1 data block input, i + # %rcx: input/output length in bytes # This function encrypts one ChaCha20 block by loading the state matrix # in four SSE registers. It performs matrix operation on four words in - # parallel, but requireds shuffling to rearrange the words after each + # parallel, but requires shuffling to rearrange the words after each # round. 8/16-bit word rotation is done with the slightly better # performing SSSE3 byte shuffling, 7/12-bit word rotation uses # traditional shift+OR. @@ -48,7 +49,8 @@ ENTRY(chacha20_block_xor_ssse3) movdqa ROT8(%rip),%xmm4 movdqa ROT16(%rip),%xmm5 - mov $10,%ecx + mov %rcx,%rax + mov $10,%ecx .Ldoubleround: @@ -122,27 +124,69 @@ ENTRY(chacha20_block_xor_ssse3) jnz .Ldoubleround # o0 = i0 ^ (x0 + s0) - movdqu 0x00(%rdx),%xmm4 paddd %xmm8,%xmm0 + cmp $0x10,%rax + jl .Lxorpart + movdqu 0x00(%rdx),%xmm4 pxor %xmm4,%xmm0 movdqu %xmm0,0x00(%rsi) # o1 = i1 ^ (x1 + s1) - movdqu 0x10(%rdx),%xmm5 paddd %xmm9,%xmm1 - pxor %xmm5,%xmm1 - movdqu %xmm1,0x10(%rsi) + movdqa %xmm1,%xmm0 + cmp $0x20,%rax + jl .Lxorpart + movdqu 0x10(%rdx),%xmm0 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x10(%rsi) # o2 = i2 ^ (x2 + s2) - movdqu 0x20(%rdx),%xmm6 paddd %xmm10,%xmm2 - pxor %xmm6,%xmm2 - movdqu %xmm2,0x20(%rsi) + movdqa %xmm2,%xmm0 + cmp $0x30,%rax + jl .Lxorpart + movdqu 0x20(%rdx),%xmm0 + pxor %xmm2,%xmm0 + movdqu %xmm0,0x20(%rsi) # o3 = i3 ^ (x3 + s3) - movdqu 0x30(%rdx),%xmm7 paddd %xmm11,%xmm3 - pxor %xmm7,%xmm3 - movdqu %xmm3,0x30(%rsi) - + movdqa %xmm3,%xmm0 + cmp $0x40,%rax + jl .Lxorpart + movdqu 0x30(%rdx),%xmm0 + pxor %xmm3,%xmm0 + movdqu %xmm0,0x30(%rsi) + +.Ldone: ret + +.Lxorpart: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + pxor 0x00(%rsp),%xmm0 + movdqa %xmm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone + ENDPROC(chacha20_block_xor_ssse3) ENTRY(chacha20_4block_xor_ssse3) diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index dce7c5d39c2f..cc4571736ce8 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -19,7 +19,8 @@ #define CHACHA20_STATE_ALIGN 16 -asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); +asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, + unsigned int len); asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); #ifdef CONFIG_AS_AVX2 asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src); @@ -29,8 +30,6 @@ static bool chacha20_use_avx2; static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, unsigned int bytes) { - u8 buf[CHACHA20_BLOCK_SIZE]; - #ifdef CONFIG_AS_AVX2 if (chacha20_use_avx2) { while (bytes >= CHACHA20_BLOCK_SIZE * 8) { @@ -50,16 +49,14 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, state[12] += 4; } while (bytes >= CHACHA20_BLOCK_SIZE) { - chacha20_block_xor_ssse3(state, dst, src); + chacha20_block_xor_ssse3(state, dst, src, bytes); bytes -= CHACHA20_BLOCK_SIZE; src += CHACHA20_BLOCK_SIZE; dst += CHACHA20_BLOCK_SIZE; state[12]++; } if (bytes) { - memcpy(buf, src, bytes); - chacha20_block_xor_ssse3(state, buf, buf); - memcpy(dst, buf, bytes); + chacha20_block_xor_ssse3(state, dst, src, bytes); } } -- cgit v1.2.3-59-g8ed1b From db8e15a24957904d10f784a9adc4ea4824ee996c Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Sun, 11 Nov 2018 10:36:26 +0100 Subject: crypto: x86/chacha20 - Support partial lengths in 4-block SSSE3 variant Add a length argument to the quad block function for SSSE3, so the block function may XOR only a partial length of four blocks. As we already have the stack set up, the partial XORing does not need to. This gives a slightly different function trailer, so we keep that separate from the 1-block function. Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20-ssse3-x86_64.S | 163 ++++++++++++++++++++++++-------- arch/x86/crypto/chacha20_glue.c | 5 +- 2 files changed, 128 insertions(+), 40 deletions(-) diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S index 98d130b5e4ab..d8ac75bb448f 100644 --- a/arch/x86/crypto/chacha20-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -191,8 +191,9 @@ ENDPROC(chacha20_block_xor_ssse3) ENTRY(chacha20_4block_xor_ssse3) # %rdi: Input state matrix, s - # %rsi: 4 data blocks output, o - # %rdx: 4 data blocks input, i + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes # This function encrypts four consecutive ChaCha20 blocks by loading the # the state matrix in SSE registers four times. As we need some scratch @@ -207,6 +208,7 @@ ENTRY(chacha20_4block_xor_ssse3) lea 8(%rsp),%r10 sub $0x80,%rsp and $~63,%rsp + mov %rcx,%rax # x0..15[0-3] = s0..3[0..3] movq 0x00(%rdi),%xmm1 @@ -617,58 +619,143 @@ ENTRY(chacha20_4block_xor_ssse3) # xor with corresponding input, write to output movdqa 0x00(%rsp),%xmm0 + cmp $0x10,%rax + jl .Lxorpart4 movdqu 0x00(%rdx),%xmm1 pxor %xmm1,%xmm0 movdqu %xmm0,0x00(%rsi) - movdqa 0x10(%rsp),%xmm0 - movdqu 0x80(%rdx),%xmm1 + + movdqu %xmm4,%xmm0 + cmp $0x20,%rax + jl .Lxorpart4 + movdqu 0x10(%rdx),%xmm1 pxor %xmm1,%xmm0 - movdqu %xmm0,0x80(%rsi) + movdqu %xmm0,0x10(%rsi) + + movdqu %xmm8,%xmm0 + cmp $0x30,%rax + jl .Lxorpart4 + movdqu 0x20(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x20(%rsi) + + movdqu %xmm12,%xmm0 + cmp $0x40,%rax + jl .Lxorpart4 + movdqu 0x30(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x30(%rsi) + movdqa 0x20(%rsp),%xmm0 + cmp $0x50,%rax + jl .Lxorpart4 movdqu 0x40(%rdx),%xmm1 pxor %xmm1,%xmm0 movdqu %xmm0,0x40(%rsi) + + movdqu %xmm6,%xmm0 + cmp $0x60,%rax + jl .Lxorpart4 + movdqu 0x50(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x50(%rsi) + + movdqu %xmm10,%xmm0 + cmp $0x70,%rax + jl .Lxorpart4 + movdqu 0x60(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x60(%rsi) + + movdqu %xmm14,%xmm0 + cmp $0x80,%rax + jl .Lxorpart4 + movdqu 0x70(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x70(%rsi) + + movdqa 0x10(%rsp),%xmm0 + cmp $0x90,%rax + jl .Lxorpart4 + movdqu 0x80(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x80(%rsi) + + movdqu %xmm5,%xmm0 + cmp $0xa0,%rax + jl .Lxorpart4 + movdqu 0x90(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x90(%rsi) + + movdqu %xmm9,%xmm0 + cmp $0xb0,%rax + jl .Lxorpart4 + movdqu 0xa0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xa0(%rsi) + + movdqu %xmm13,%xmm0 + cmp $0xc0,%rax + jl .Lxorpart4 + movdqu 0xb0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xb0(%rsi) + movdqa 0x30(%rsp),%xmm0 + cmp $0xd0,%rax + jl .Lxorpart4 movdqu 0xc0(%rdx),%xmm1 pxor %xmm1,%xmm0 movdqu %xmm0,0xc0(%rsi) - movdqu 0x10(%rdx),%xmm1 - pxor %xmm1,%xmm4 - movdqu %xmm4,0x10(%rsi) - movdqu 0x90(%rdx),%xmm1 - pxor %xmm1,%xmm5 - movdqu %xmm5,0x90(%rsi) - movdqu 0x50(%rdx),%xmm1 - pxor %xmm1,%xmm6 - movdqu %xmm6,0x50(%rsi) + + movdqu %xmm7,%xmm0 + cmp $0xe0,%rax + jl .Lxorpart4 movdqu 0xd0(%rdx),%xmm1 - pxor %xmm1,%xmm7 - movdqu %xmm7,0xd0(%rsi) - movdqu 0x20(%rdx),%xmm1 - pxor %xmm1,%xmm8 - movdqu %xmm8,0x20(%rsi) - movdqu 0xa0(%rdx),%xmm1 - pxor %xmm1,%xmm9 - movdqu %xmm9,0xa0(%rsi) - movdqu 0x60(%rdx),%xmm1 - pxor %xmm1,%xmm10 - movdqu %xmm10,0x60(%rsi) + pxor %xmm1,%xmm0 + movdqu %xmm0,0xd0(%rsi) + + movdqu %xmm11,%xmm0 + cmp $0xf0,%rax + jl .Lxorpart4 movdqu 0xe0(%rdx),%xmm1 - pxor %xmm1,%xmm11 - movdqu %xmm11,0xe0(%rsi) - movdqu 0x30(%rdx),%xmm1 - pxor %xmm1,%xmm12 - movdqu %xmm12,0x30(%rsi) - movdqu 0xb0(%rdx),%xmm1 - pxor %xmm1,%xmm13 - movdqu %xmm13,0xb0(%rsi) - movdqu 0x70(%rdx),%xmm1 - pxor %xmm1,%xmm14 - movdqu %xmm14,0x70(%rsi) + pxor %xmm1,%xmm0 + movdqu %xmm0,0xe0(%rsi) + + movdqu %xmm15,%xmm0 + cmp $0x100,%rax + jl .Lxorpart4 movdqu 0xf0(%rdx),%xmm1 - pxor %xmm1,%xmm15 - movdqu %xmm15,0xf0(%rsi) + pxor %xmm1,%xmm0 + movdqu %xmm0,0xf0(%rsi) +.Ldone4: lea -8(%r10),%rsp ret + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone4 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + pxor 0x00(%rsp),%xmm0 + movdqa %xmm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + jmp .Ldone4 + ENDPROC(chacha20_4block_xor_ssse3) diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index cc4571736ce8..8f1ef1a9ce5c 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -21,7 +21,8 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len); -asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); +asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, + unsigned int len); #ifdef CONFIG_AS_AVX2 asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src); static bool chacha20_use_avx2; @@ -42,7 +43,7 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, } #endif while (bytes >= CHACHA20_BLOCK_SIZE * 4) { - chacha20_4block_xor_ssse3(state, dst, src); + chacha20_4block_xor_ssse3(state, dst, src, bytes); bytes -= CHACHA20_BLOCK_SIZE * 4; src += CHACHA20_BLOCK_SIZE * 4; dst += CHACHA20_BLOCK_SIZE * 4; -- cgit v1.2.3-59-g8ed1b From c3b734dd325dadc73c2f5b4d187208730bf21df5 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Sun, 11 Nov 2018 10:36:27 +0100 Subject: crypto: x86/chacha20 - Support partial lengths in 8-block AVX2 variant Add a length argument to the eight block function for AVX2, so the block function may XOR only a partial length of eight blocks. To avoid unnecessary operations, we integrate XORing of the first four blocks in the final lane interleaving; this also avoids some work in the partial lengths path. Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20-avx2-x86_64.S | 189 +++++++++++++++++++++++---------- arch/x86/crypto/chacha20_glue.c | 5 +- 2 files changed, 133 insertions(+), 61 deletions(-) diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S index f3cd26f48332..7b62d55bee3d 100644 --- a/arch/x86/crypto/chacha20-avx2-x86_64.S +++ b/arch/x86/crypto/chacha20-avx2-x86_64.S @@ -30,8 +30,9 @@ CTRINC: .octa 0x00000003000000020000000100000000 ENTRY(chacha20_8block_xor_avx2) # %rdi: Input state matrix, s - # %rsi: 8 data blocks output, o - # %rdx: 8 data blocks input, i + # %rsi: up to 8 data blocks output, o + # %rdx: up to 8 data blocks input, i + # %rcx: input/output length in bytes # This function encrypts eight consecutive ChaCha20 blocks by loading # the state matrix in AVX registers eight times. As we need some @@ -48,6 +49,7 @@ ENTRY(chacha20_8block_xor_avx2) lea 8(%rsp),%r10 and $~31, %rsp sub $0x80, %rsp + mov %rcx,%rax # x0..15[0-7] = s[0..15] vpbroadcastd 0x00(%rdi),%ymm0 @@ -375,74 +377,143 @@ ENTRY(chacha20_8block_xor_avx2) vpunpckhqdq %ymm15,%ymm0,%ymm15 # interleave 128-bit words in state n, n+4 - vmovdqa 0x00(%rsp),%ymm0 - vperm2i128 $0x20,%ymm4,%ymm0,%ymm1 - vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 - vmovdqa %ymm1,0x00(%rsp) - vmovdqa 0x20(%rsp),%ymm0 - vperm2i128 $0x20,%ymm5,%ymm0,%ymm1 - vperm2i128 $0x31,%ymm5,%ymm0,%ymm5 - vmovdqa %ymm1,0x20(%rsp) - vmovdqa 0x40(%rsp),%ymm0 - vperm2i128 $0x20,%ymm6,%ymm0,%ymm1 - vperm2i128 $0x31,%ymm6,%ymm0,%ymm6 - vmovdqa %ymm1,0x40(%rsp) - vmovdqa 0x60(%rsp),%ymm0 - vperm2i128 $0x20,%ymm7,%ymm0,%ymm1 - vperm2i128 $0x31,%ymm7,%ymm0,%ymm7 - vmovdqa %ymm1,0x60(%rsp) + # xor/write first four blocks + vmovdqa 0x00(%rsp),%ymm1 + vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 + cmp $0x0020,%rax + jl .Lxorpart8 + vpxor 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0000(%rsi) + vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + cmp $0x0040,%rax + jl .Lxorpart8 + vpxor 0x0020(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0020(%rsi) vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - vmovdqa %ymm0,%ymm8 - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - vmovdqa %ymm0,%ymm9 + + vmovdqa 0x40(%rsp),%ymm1 + vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 + cmp $0x0060,%rax + jl .Lxorpart8 + vpxor 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0040(%rsi) + vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + cmp $0x0080,%rax + jl .Lxorpart8 + vpxor 0x0060(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0060(%rsi) vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 - vmovdqa %ymm0,%ymm10 - vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - vmovdqa %ymm0,%ymm11 - # xor with corresponding input, write to output - vmovdqa 0x00(%rsp),%ymm0 - vpxor 0x0000(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0000(%rsi) - vmovdqa 0x20(%rsp),%ymm0 + vmovdqa 0x20(%rsp),%ymm1 + vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 + cmp $0x00a0,%rax + jl .Lxorpart8 vpxor 0x0080(%rdx),%ymm0,%ymm0 vmovdqu %ymm0,0x0080(%rsi) - vmovdqa 0x40(%rsp),%ymm0 - vpxor 0x0040(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0040(%rsi) - vmovdqa 0x60(%rsp),%ymm0 + vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 + + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + cmp $0x00c0,%rax + jl .Lxorpart8 + vpxor 0x00a0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00a0(%rsi) + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + + vmovdqa 0x60(%rsp),%ymm1 + vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 + cmp $0x00e0,%rax + jl .Lxorpart8 vpxor 0x00c0(%rdx),%ymm0,%ymm0 vmovdqu %ymm0,0x00c0(%rsi) - vpxor 0x0100(%rdx),%ymm4,%ymm4 - vmovdqu %ymm4,0x0100(%rsi) - vpxor 0x0180(%rdx),%ymm5,%ymm5 - vmovdqu %ymm5,0x00180(%rsi) - vpxor 0x0140(%rdx),%ymm6,%ymm6 - vmovdqu %ymm6,0x0140(%rsi) - vpxor 0x01c0(%rdx),%ymm7,%ymm7 - vmovdqu %ymm7,0x01c0(%rsi) - vpxor 0x0020(%rdx),%ymm8,%ymm8 - vmovdqu %ymm8,0x0020(%rsi) - vpxor 0x00a0(%rdx),%ymm9,%ymm9 - vmovdqu %ymm9,0x00a0(%rsi) - vpxor 0x0060(%rdx),%ymm10,%ymm10 - vmovdqu %ymm10,0x0060(%rsi) - vpxor 0x00e0(%rdx),%ymm11,%ymm11 - vmovdqu %ymm11,0x00e0(%rsi) - vpxor 0x0120(%rdx),%ymm12,%ymm12 - vmovdqu %ymm12,0x0120(%rsi) - vpxor 0x01a0(%rdx),%ymm13,%ymm13 - vmovdqu %ymm13,0x01a0(%rsi) - vpxor 0x0160(%rdx),%ymm14,%ymm14 - vmovdqu %ymm14,0x0160(%rsi) - vpxor 0x01e0(%rdx),%ymm15,%ymm15 - vmovdqu %ymm15,0x01e0(%rsi) + vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 + + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + cmp $0x0100,%rax + jl .Lxorpart8 + vpxor 0x00e0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00e0(%rsi) + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + + # xor remaining blocks, write to output + vmovdqa %ymm4,%ymm0 + cmp $0x0120,%rax + jl .Lxorpart8 + vpxor 0x0100(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0100(%rsi) + vmovdqa %ymm12,%ymm0 + cmp $0x0140,%rax + jl .Lxorpart8 + vpxor 0x0120(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0120(%rsi) + + vmovdqa %ymm6,%ymm0 + cmp $0x0160,%rax + jl .Lxorpart8 + vpxor 0x0140(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0140(%rsi) + + vmovdqa %ymm14,%ymm0 + cmp $0x0180,%rax + jl .Lxorpart8 + vpxor 0x0160(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0160(%rsi) + + vmovdqa %ymm5,%ymm0 + cmp $0x01a0,%rax + jl .Lxorpart8 + vpxor 0x0180(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0180(%rsi) + + vmovdqa %ymm13,%ymm0 + cmp $0x01c0,%rax + jl .Lxorpart8 + vpxor 0x01a0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01a0(%rsi) + + vmovdqa %ymm7,%ymm0 + cmp $0x01e0,%rax + jl .Lxorpart8 + vpxor 0x01c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01c0(%rsi) + + vmovdqa %ymm15,%ymm0 + cmp $0x0200,%rax + jl .Lxorpart8 + vpxor 0x01e0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01e0(%rsi) + +.Ldone8: vzeroupper lea -8(%r10),%rsp ret + +.Lxorpart8: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x1f,%r9 + jz .Ldone8 + and $~0x1f,%rax + + mov %rsi,%r11 + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + jmp .Ldone8 + ENDPROC(chacha20_8block_xor_avx2) diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 8f1ef1a9ce5c..882e8bf5965a 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -24,7 +24,8 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len); #ifdef CONFIG_AS_AVX2 -asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src); +asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, + unsigned int len); static bool chacha20_use_avx2; #endif @@ -34,7 +35,7 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, #ifdef CONFIG_AS_AVX2 if (chacha20_use_avx2) { while (bytes >= CHACHA20_BLOCK_SIZE * 8) { - chacha20_8block_xor_avx2(state, dst, src); + chacha20_8block_xor_avx2(state, dst, src, bytes); bytes -= CHACHA20_BLOCK_SIZE * 8; src += CHACHA20_BLOCK_SIZE * 8; dst += CHACHA20_BLOCK_SIZE * 8; -- cgit v1.2.3-59-g8ed1b From 9b17608f15b940babe2e32522ea29787abd10af2 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Sun, 11 Nov 2018 10:36:28 +0100 Subject: crypto: x86/chacha20 - Use larger block functions more aggressively Now that all block functions support partial lengths, engage the wider block sizes more aggressively. This prevents using smaller block functions multiple times, where the next larger block function would have been faster. Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20_glue.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 882e8bf5965a..b541da71f11e 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -29,6 +29,12 @@ asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, static bool chacha20_use_avx2; #endif +static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks) +{ + len = min(len, maxblocks * CHACHA20_BLOCK_SIZE); + return round_up(len, CHACHA20_BLOCK_SIZE) / CHACHA20_BLOCK_SIZE; +} + static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, unsigned int bytes) { @@ -41,6 +47,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, dst += CHACHA20_BLOCK_SIZE * 8; state[12] += 8; } + if (bytes > CHACHA20_BLOCK_SIZE * 4) { + chacha20_8block_xor_avx2(state, dst, src, bytes); + state[12] += chacha20_advance(bytes, 8); + return; + } } #endif while (bytes >= CHACHA20_BLOCK_SIZE * 4) { @@ -50,15 +61,14 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, dst += CHACHA20_BLOCK_SIZE * 4; state[12] += 4; } - while (bytes >= CHACHA20_BLOCK_SIZE) { - chacha20_block_xor_ssse3(state, dst, src, bytes); - bytes -= CHACHA20_BLOCK_SIZE; - src += CHACHA20_BLOCK_SIZE; - dst += CHACHA20_BLOCK_SIZE; - state[12]++; + if (bytes > CHACHA20_BLOCK_SIZE) { + chacha20_4block_xor_ssse3(state, dst, src, bytes); + state[12] += chacha20_advance(bytes, 4); + return; } if (bytes) { chacha20_block_xor_ssse3(state, dst, src, bytes); + state[12]++; } } @@ -82,17 +92,16 @@ static int chacha20_simd(struct skcipher_request *req) kernel_fpu_begin(); - while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { - chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, - rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); - err = skcipher_walk_done(&walk, - walk.nbytes % CHACHA20_BLOCK_SIZE); - } + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); - if (walk.nbytes) { chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes); - err = skcipher_walk_done(&walk, 0); + nbytes); + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } kernel_fpu_end(); -- cgit v1.2.3-59-g8ed1b From a5dd97f86211e91219807db607d740f9896b8e0b Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Sun, 11 Nov 2018 10:36:29 +0100 Subject: crypto: x86/chacha20 - Add a 2-block AVX2 variant This variant uses the same principle as the single block SSSE3 variant by shuffling the state matrix after each round. With the wider AVX registers, we can do two blocks in parallel, though. This function can increase performance and efficiency significantly for lengths that would otherwise require a 4-block function. Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20-avx2-x86_64.S | 197 +++++++++++++++++++++++++++++++++ arch/x86/crypto/chacha20_glue.c | 7 ++ 2 files changed, 204 insertions(+) diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S index 7b62d55bee3d..8247076b0ba7 100644 --- a/arch/x86/crypto/chacha20-avx2-x86_64.S +++ b/arch/x86/crypto/chacha20-avx2-x86_64.S @@ -26,8 +26,205 @@ ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 CTRINC: .octa 0x00000003000000020000000100000000 .octa 0x00000007000000060000000500000004 +.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 +.align 32 +CTR2BL: .octa 0x00000000000000000000000000000000 + .octa 0x00000000000000000000000000000001 + .text +ENTRY(chacha20_2block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 2 data blocks output, o + # %rdx: up to 2 data blocks input, i + # %rcx: input/output length in bytes + + # This function encrypts two ChaCha20 blocks by loading the state + # matrix twice across four AVX registers. It performs matrix operations + # on four words in each matrix in parallel, but requires shuffling to + # rearrange the words after each round. + + vzeroupper + + # x0..3[0-2] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + + vmovdqa %ymm0,%ymm8 + vmovdqa %ymm1,%ymm9 + vmovdqa %ymm2,%ymm10 + vmovdqa %ymm3,%ymm11 + + vmovdqa ROT8(%rip),%ymm4 + vmovdqa ROT16(%rip),%ymm5 + + mov %rcx,%rax + mov $10,%ecx + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm5,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm6 + vpslld $12,%ymm6,%ymm6 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm6,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm4,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm7 + vpslld $7,%ymm7,%ymm7 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm5,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm6 + vpslld $12,%ymm6,%ymm6 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm6,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm4,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm7 + vpslld $7,%ymm7,%ymm7 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + + dec %ecx + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + vpaddd %ymm8,%ymm0,%ymm7 + cmp $0x10,%rax + jl .Lxorpart2 + vpxor 0x00(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x00(%rsi) + vextracti128 $1,%ymm7,%xmm0 + # o1 = i1 ^ (x1 + s1) + vpaddd %ymm9,%ymm1,%ymm7 + cmp $0x20,%rax + jl .Lxorpart2 + vpxor 0x10(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x10(%rsi) + vextracti128 $1,%ymm7,%xmm1 + # o2 = i2 ^ (x2 + s2) + vpaddd %ymm10,%ymm2,%ymm7 + cmp $0x30,%rax + jl .Lxorpart2 + vpxor 0x20(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x20(%rsi) + vextracti128 $1,%ymm7,%xmm2 + # o3 = i3 ^ (x3 + s3) + vpaddd %ymm11,%ymm3,%ymm7 + cmp $0x40,%rax + jl .Lxorpart2 + vpxor 0x30(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x30(%rsi) + vextracti128 $1,%ymm7,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm7 + cmp $0x50,%rax + jl .Lxorpart2 + vpxor 0x40(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x40(%rsi) + + vmovdqa %xmm1,%xmm7 + cmp $0x60,%rax + jl .Lxorpart2 + vpxor 0x50(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x50(%rsi) + + vmovdqa %xmm2,%xmm7 + cmp $0x70,%rax + jl .Lxorpart2 + vpxor 0x60(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x60(%rsi) + + vmovdqa %xmm3,%xmm7 + cmp $0x80,%rax + jl .Lxorpart2 + vpxor 0x70(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x70(%rsi) + +.Ldone2: + vzeroupper + ret + +.Lxorpart2: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone2 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%xmm7,%xmm7 + vmovdqa %xmm7,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone2 + +ENDPROC(chacha20_2block_xor_avx2) + ENTRY(chacha20_8block_xor_avx2) # %rdi: Input state matrix, s # %rsi: up to 8 data blocks output, o diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index b541da71f11e..82e46589a189 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -24,6 +24,8 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len); #ifdef CONFIG_AS_AVX2 +asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, + unsigned int len); asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len); static bool chacha20_use_avx2; @@ -52,6 +54,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, state[12] += chacha20_advance(bytes, 8); return; } + if (bytes > CHACHA20_BLOCK_SIZE) { + chacha20_2block_xor_avx2(state, dst, src, bytes); + state[12] += chacha20_advance(bytes, 2); + return; + } } #endif while (bytes >= CHACHA20_BLOCK_SIZE * 4) { -- cgit v1.2.3-59-g8ed1b From 8a5a79d5556b822143b4403fc46068d4eef2e4e2 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Sun, 11 Nov 2018 10:36:30 +0100 Subject: crypto: x86/chacha20 - Add a 4-block AVX2 variant This variant builds upon the idea of the 2-block AVX2 variant that shuffles words after each round. The shuffling has a rather high latency, so the arithmetic units are not optimally used. Given that we have plenty of registers in AVX, this version parallelizes the 2-block variant to do four blocks. While the first two blocks are shuffling, the CPU can do the XORing on the second two blocks and vice-versa, which makes this version much faster than the SSSE3 variant for four blocks. The latter is now mostly for systems that do not have AVX2, but there it is the work-horse, so we keep it in place. The partial XORing function trailer is very similar to the AVX2 2-block variant. While it could be shared, that code segment is rather short; profiling is also easier with the trailer integrated, so we keep it per function. Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20-avx2-x86_64.S | 310 +++++++++++++++++++++++++++++++++ arch/x86/crypto/chacha20_glue.c | 7 + 2 files changed, 317 insertions(+) diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S index 8247076b0ba7..b6ab082be657 100644 --- a/arch/x86/crypto/chacha20-avx2-x86_64.S +++ b/arch/x86/crypto/chacha20-avx2-x86_64.S @@ -31,6 +31,11 @@ CTRINC: .octa 0x00000003000000020000000100000000 CTR2BL: .octa 0x00000000000000000000000000000000 .octa 0x00000000000000000000000000000001 +.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 +.align 32 +CTR4BL: .octa 0x00000000000000000000000000000002 + .octa 0x00000000000000000000000000000003 + .text ENTRY(chacha20_2block_xor_avx2) @@ -225,6 +230,311 @@ ENTRY(chacha20_2block_xor_avx2) ENDPROC(chacha20_2block_xor_avx2) +ENTRY(chacha20_4block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + + # This function encrypts four ChaCha20 block by loading the state + # matrix four times across eight AVX registers. It performs matrix + # operations on four words in two matrices in parallel, sequentially + # to the operations on the four words of the other two matrices. The + # required word shuffling has a rather high latency, we can do the + # arithmetic on two matrix-pairs without much slowdown. + + vzeroupper + + # x0..3[0-4] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vmovdqa %ymm0,%ymm4 + vmovdqa %ymm1,%ymm5 + vmovdqa %ymm2,%ymm6 + vmovdqa %ymm3,%ymm7 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + vpaddd CTR4BL(%rip),%ymm7,%ymm7 + + vmovdqa %ymm0,%ymm11 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm3,%ymm14 + vmovdqa %ymm7,%ymm15 + + vmovdqa ROT8(%rip),%ymm8 + vmovdqa ROT16(%rip),%ymm9 + + mov %rcx,%rax + mov $10,%ecx + +.Ldoubleround4: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm9,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm9,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + vpshufd $0x39,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + vpshufd $0x93,%ymm7,%ymm7 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm9,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm9,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + vpshufd $0x93,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + vpshufd $0x39,%ymm7,%ymm7 + + dec %ecx + jnz .Ldoubleround4 + + # o0 = i0 ^ (x0 + s0), first block + vpaddd %ymm11,%ymm0,%ymm10 + cmp $0x10,%rax + jl .Lxorpart4 + vpxor 0x00(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x00(%rsi) + vextracti128 $1,%ymm10,%xmm0 + # o1 = i1 ^ (x1 + s1), first block + vpaddd %ymm12,%ymm1,%ymm10 + cmp $0x20,%rax + jl .Lxorpart4 + vpxor 0x10(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x10(%rsi) + vextracti128 $1,%ymm10,%xmm1 + # o2 = i2 ^ (x2 + s2), first block + vpaddd %ymm13,%ymm2,%ymm10 + cmp $0x30,%rax + jl .Lxorpart4 + vpxor 0x20(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x20(%rsi) + vextracti128 $1,%ymm10,%xmm2 + # o3 = i3 ^ (x3 + s3), first block + vpaddd %ymm14,%ymm3,%ymm10 + cmp $0x40,%rax + jl .Lxorpart4 + vpxor 0x30(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x30(%rsi) + vextracti128 $1,%ymm10,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm10 + cmp $0x50,%rax + jl .Lxorpart4 + vpxor 0x40(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x40(%rsi) + + vmovdqa %xmm1,%xmm10 + cmp $0x60,%rax + jl .Lxorpart4 + vpxor 0x50(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x50(%rsi) + + vmovdqa %xmm2,%xmm10 + cmp $0x70,%rax + jl .Lxorpart4 + vpxor 0x60(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x60(%rsi) + + vmovdqa %xmm3,%xmm10 + cmp $0x80,%rax + jl .Lxorpart4 + vpxor 0x70(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x70(%rsi) + + # o0 = i0 ^ (x0 + s0), third block + vpaddd %ymm11,%ymm4,%ymm10 + cmp $0x90,%rax + jl .Lxorpart4 + vpxor 0x80(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x80(%rsi) + vextracti128 $1,%ymm10,%xmm4 + # o1 = i1 ^ (x1 + s1), third block + vpaddd %ymm12,%ymm5,%ymm10 + cmp $0xa0,%rax + jl .Lxorpart4 + vpxor 0x90(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x90(%rsi) + vextracti128 $1,%ymm10,%xmm5 + # o2 = i2 ^ (x2 + s2), third block + vpaddd %ymm13,%ymm6,%ymm10 + cmp $0xb0,%rax + jl .Lxorpart4 + vpxor 0xa0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xa0(%rsi) + vextracti128 $1,%ymm10,%xmm6 + # o3 = i3 ^ (x3 + s3), third block + vpaddd %ymm15,%ymm7,%ymm10 + cmp $0xc0,%rax + jl .Lxorpart4 + vpxor 0xb0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xb0(%rsi) + vextracti128 $1,%ymm10,%xmm7 + + # xor and write fourth block + vmovdqa %xmm4,%xmm10 + cmp $0xd0,%rax + jl .Lxorpart4 + vpxor 0xc0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xc0(%rsi) + + vmovdqa %xmm5,%xmm10 + cmp $0xe0,%rax + jl .Lxorpart4 + vpxor 0xd0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xd0(%rsi) + + vmovdqa %xmm6,%xmm10 + cmp $0xf0,%rax + jl .Lxorpart4 + vpxor 0xe0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xe0(%rsi) + + vmovdqa %xmm7,%xmm10 + cmp $0x100,%rax + jl .Lxorpart4 + vpxor 0xf0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xf0(%rsi) + +.Ldone4: + vzeroupper + ret + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone4 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%xmm10,%xmm10 + vmovdqa %xmm10,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone4 + +ENDPROC(chacha20_4block_xor_avx2) + ENTRY(chacha20_8block_xor_avx2) # %rdi: Input state matrix, s # %rsi: up to 8 data blocks output, o diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 82e46589a189..9fd84fe6ec09 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -26,6 +26,8 @@ asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, #ifdef CONFIG_AS_AVX2 asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len); +asmlinkage void chacha20_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, + unsigned int len); asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len); static bool chacha20_use_avx2; @@ -54,6 +56,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, state[12] += chacha20_advance(bytes, 8); return; } + if (bytes > CHACHA20_BLOCK_SIZE * 2) { + chacha20_4block_xor_avx2(state, dst, src, bytes); + state[12] += chacha20_advance(bytes, 4); + return; + } if (bytes > CHACHA20_BLOCK_SIZE) { chacha20_2block_xor_avx2(state, dst, src, bytes); state[12] += chacha20_advance(bytes, 2); -- cgit v1.2.3-59-g8ed1b From 3da2c1dfdb802b184eea0653d1e589515b52d74b Mon Sep 17 00:00:00 2001 From: Vitaly Chikunov Date: Sun, 11 Nov 2018 20:40:02 +0300 Subject: crypto: ecc - regularize scalar for scalar multiplication ecc_point_mult is supposed to be used with a regularized scalar, otherwise, it's possible to deduce the position of the top bit of the scalar with timing attack. This is important when the scalar is a private key. ecc_point_mult is already using a regular algorithm (i.e. having an operation flow independent of the input scalar) but regularization step is not implemented. Arrange scalar to always have fixed top bit by adding a multiple of the curve order (n). References: The constant time regularization step is based on micro-ecc by Kenneth MacKay and also referenced in the literature (Bernstein, D. J., & Lange, T. (2017). Montgomery curves and the Montgomery ladder. (Cryptology ePrint Archive; Vol. 2017/293). s.l.: IACR. Chapter 4.6.2.) Signed-off-by: Vitaly Chikunov Cc: kernel-hardening@lists.openwall.com Signed-off-by: Herbert Xu --- crypto/ecc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/crypto/ecc.c b/crypto/ecc.c index 9d24e6522730..ed1237115066 100644 --- a/crypto/ecc.c +++ b/crypto/ecc.c @@ -842,15 +842,23 @@ static void xycz_add_c(u64 *x1, u64 *y1, u64 *x2, u64 *y2, u64 *curve_prime, static void ecc_point_mult(struct ecc_point *result, const struct ecc_point *point, const u64 *scalar, - u64 *initial_z, u64 *curve_prime, + u64 *initial_z, const struct ecc_curve *curve, unsigned int ndigits) { /* R0 and R1 */ u64 rx[2][ECC_MAX_DIGITS]; u64 ry[2][ECC_MAX_DIGITS]; u64 z[ECC_MAX_DIGITS]; + u64 sk[2][ECC_MAX_DIGITS]; + u64 *curve_prime = curve->p; int i, nb; - int num_bits = vli_num_bits(scalar, ndigits); + int num_bits; + int carry; + + carry = vli_add(sk[0], scalar, curve->n, ndigits); + vli_add(sk[1], sk[0], curve->n, ndigits); + scalar = sk[!carry]; + num_bits = sizeof(u64) * ndigits * 8 + 1; vli_set(rx[1], point->x, ndigits); vli_set(ry[1], point->y, ndigits); @@ -1014,7 +1022,7 @@ int ecc_make_pub_key(unsigned int curve_id, unsigned int ndigits, goto out; } - ecc_point_mult(pk, &curve->g, priv, NULL, curve->p, ndigits); + ecc_point_mult(pk, &curve->g, priv, NULL, curve, ndigits); if (ecc_point_is_zero(pk)) { ret = -EAGAIN; goto err_free_point; @@ -1100,7 +1108,7 @@ int crypto_ecdh_shared_secret(unsigned int curve_id, unsigned int ndigits, goto err_alloc_product; } - ecc_point_mult(product, pk, priv, rand_z, curve->p, ndigits); + ecc_point_mult(product, pk, priv, rand_z, curve, ndigits); ecc_swap_digits(product->x, secret, ndigits); -- cgit v1.2.3-59-g8ed1b From 2b78aeb366365bd9cbf56c710e8b2ac494620306 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 14 Nov 2018 11:10:53 -0800 Subject: crypto: inside-secure - remove useless setting of type flags Remove the unnecessary setting of CRYPTO_ALG_TYPE_SKCIPHER. Commit 2c95e6d97892 ("crypto: skcipher - remove useless setting of type flags") took care of this everywhere else, but a few more instances made it into the tree at about the same time. Squash them before they get copy+pasted around again. This patch shouldn't change any actual behavior. Signed-off-by: Eric Biggers Acked-by: Antoine Tenart Signed-off-by: Herbert Xu --- drivers/crypto/inside-secure/safexcel_cipher.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/inside-secure/safexcel_cipher.c b/drivers/crypto/inside-secure/safexcel_cipher.c index 3aef1d43e435..d531c14020dc 100644 --- a/drivers/crypto/inside-secure/safexcel_cipher.c +++ b/drivers/crypto/inside-secure/safexcel_cipher.c @@ -970,7 +970,7 @@ struct safexcel_alg_template safexcel_alg_cbc_des = { .cra_name = "cbc(des)", .cra_driver_name = "safexcel-cbc-des", .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_SKCIPHER | CRYPTO_ALG_ASYNC | + .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .cra_blocksize = DES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct safexcel_cipher_ctx), @@ -1010,7 +1010,7 @@ struct safexcel_alg_template safexcel_alg_ecb_des = { .cra_name = "ecb(des)", .cra_driver_name = "safexcel-ecb-des", .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_SKCIPHER | CRYPTO_ALG_ASYNC | + .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .cra_blocksize = DES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct safexcel_cipher_ctx), @@ -1074,7 +1074,7 @@ struct safexcel_alg_template safexcel_alg_cbc_des3_ede = { .cra_name = "cbc(des3_ede)", .cra_driver_name = "safexcel-cbc-des3_ede", .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_SKCIPHER | CRYPTO_ALG_ASYNC | + .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .cra_blocksize = DES3_EDE_BLOCK_SIZE, .cra_ctxsize = sizeof(struct safexcel_cipher_ctx), @@ -1114,7 +1114,7 @@ struct safexcel_alg_template safexcel_alg_ecb_des3_ede = { .cra_name = "ecb(des3_ede)", .cra_driver_name = "safexcel-ecb-des3_ede", .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_SKCIPHER | CRYPTO_ALG_ASYNC | + .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .cra_blocksize = DES3_EDE_BLOCK_SIZE, .cra_ctxsize = sizeof(struct safexcel_cipher_ctx), -- cgit v1.2.3-59-g8ed1b From d41655909e3236bfb00aa69f435a9634cd74b60b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 14 Nov 2018 11:35:48 -0800 Subject: crypto: remove useless initializations of cra_list Some algorithms initialize their .cra_list prior to registration. But this is unnecessary since crypto_register_alg() will overwrite .cra_list when adding the algorithm to the 'crypto_alg_list'. Apparently the useless assignment has just been copy+pasted around. So, remove the useless assignments. Exception: paes_s390.c uses cra_list to check whether the algorithm is registered or not, so I left that as-is for now. This patch shouldn't change any actual behavior. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/sparc/crypto/aes_glue.c | 5 ----- arch/sparc/crypto/camellia_glue.c | 5 ----- arch/sparc/crypto/des_glue.c | 5 ----- crypto/lz4.c | 1 - crypto/lz4hc.c | 1 - drivers/crypto/bcm/cipher.c | 2 -- drivers/crypto/omap-aes.c | 2 -- drivers/crypto/omap-des.c | 1 - drivers/crypto/qce/ablkcipher.c | 1 - drivers/crypto/qce/sha.c | 1 - drivers/crypto/sahara.c | 1 - 11 files changed, 25 deletions(-) diff --git a/arch/sparc/crypto/aes_glue.c b/arch/sparc/crypto/aes_glue.c index 3cd4f6b198b6..a9b8b0b94a8d 100644 --- a/arch/sparc/crypto/aes_glue.c +++ b/arch/sparc/crypto/aes_glue.c @@ -476,11 +476,6 @@ static bool __init sparc64_has_aes_opcode(void) static int __init aes_sparc64_mod_init(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(algs); i++) - INIT_LIST_HEAD(&algs[i].cra_list); - if (sparc64_has_aes_opcode()) { pr_info("Using sparc64 aes opcodes optimized AES implementation\n"); return crypto_register_algs(algs, ARRAY_SIZE(algs)); diff --git a/arch/sparc/crypto/camellia_glue.c b/arch/sparc/crypto/camellia_glue.c index 561a84d93cf6..900d5c617e83 100644 --- a/arch/sparc/crypto/camellia_glue.c +++ b/arch/sparc/crypto/camellia_glue.c @@ -299,11 +299,6 @@ static bool __init sparc64_has_camellia_opcode(void) static int __init camellia_sparc64_mod_init(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(algs); i++) - INIT_LIST_HEAD(&algs[i].cra_list); - if (sparc64_has_camellia_opcode()) { pr_info("Using sparc64 camellia opcodes optimized CAMELLIA implementation\n"); return crypto_register_algs(algs, ARRAY_SIZE(algs)); diff --git a/arch/sparc/crypto/des_glue.c b/arch/sparc/crypto/des_glue.c index 61af794aa2d3..56499ea39fd3 100644 --- a/arch/sparc/crypto/des_glue.c +++ b/arch/sparc/crypto/des_glue.c @@ -510,11 +510,6 @@ static bool __init sparc64_has_des_opcode(void) static int __init des_sparc64_mod_init(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(algs); i++) - INIT_LIST_HEAD(&algs[i].cra_list); - if (sparc64_has_des_opcode()) { pr_info("Using sparc64 des opcodes optimized DES implementation\n"); return crypto_register_algs(algs, ARRAY_SIZE(algs)); diff --git a/crypto/lz4.c b/crypto/lz4.c index 2ce2660d3519..c160dfdbf2e0 100644 --- a/crypto/lz4.c +++ b/crypto/lz4.c @@ -122,7 +122,6 @@ static struct crypto_alg alg_lz4 = { .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, .cra_ctxsize = sizeof(struct lz4_ctx), .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(alg_lz4.cra_list), .cra_init = lz4_init, .cra_exit = lz4_exit, .cra_u = { .compress = { diff --git a/crypto/lz4hc.c b/crypto/lz4hc.c index 2be14f054daf..583b5e013d7a 100644 --- a/crypto/lz4hc.c +++ b/crypto/lz4hc.c @@ -123,7 +123,6 @@ static struct crypto_alg alg_lz4hc = { .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, .cra_ctxsize = sizeof(struct lz4hc_ctx), .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(alg_lz4hc.cra_list), .cra_init = lz4hc_init, .cra_exit = lz4hc_exit, .cra_u = { .compress = { diff --git a/drivers/crypto/bcm/cipher.c b/drivers/crypto/bcm/cipher.c index 4d67e22a4664..2ce3a16d3d10 100644 --- a/drivers/crypto/bcm/cipher.c +++ b/drivers/crypto/bcm/cipher.c @@ -4605,7 +4605,6 @@ static int spu_register_ablkcipher(struct iproc_alg_s *driver_alg) crypto->cra_priority = cipher_pri; crypto->cra_alignmask = 0; crypto->cra_ctxsize = sizeof(struct iproc_ctx_s); - INIT_LIST_HEAD(&crypto->cra_list); crypto->cra_init = ablkcipher_cra_init; crypto->cra_exit = generic_cra_exit; @@ -4691,7 +4690,6 @@ static int spu_register_aead(struct iproc_alg_s *driver_alg) aead->base.cra_priority = aead_pri; aead->base.cra_alignmask = 0; aead->base.cra_ctxsize = sizeof(struct iproc_ctx_s); - INIT_LIST_HEAD(&aead->base.cra_list); aead->base.cra_flags |= CRYPTO_ALG_ASYNC; /* setkey set in alg initialization */ diff --git a/drivers/crypto/omap-aes.c b/drivers/crypto/omap-aes.c index a553ffddb11b..4c0ea8142923 100644 --- a/drivers/crypto/omap-aes.c +++ b/drivers/crypto/omap-aes.c @@ -1222,7 +1222,6 @@ static int omap_aes_probe(struct platform_device *pdev) algp = &dd->pdata->algs_info[i].algs_list[j]; pr_debug("reg alg: %s\n", algp->cra_name); - INIT_LIST_HEAD(&algp->cra_list); err = crypto_register_alg(algp); if (err) @@ -1240,7 +1239,6 @@ static int omap_aes_probe(struct platform_device *pdev) algp = &aalg->base; pr_debug("reg alg: %s\n", algp->cra_name); - INIT_LIST_HEAD(&algp->cra_list); err = crypto_register_aead(aalg); if (err) diff --git a/drivers/crypto/omap-des.c b/drivers/crypto/omap-des.c index eb95b0d7f184..6369019219d4 100644 --- a/drivers/crypto/omap-des.c +++ b/drivers/crypto/omap-des.c @@ -1069,7 +1069,6 @@ static int omap_des_probe(struct platform_device *pdev) algp = &dd->pdata->algs_info[i].algs_list[j]; pr_debug("reg alg: %s\n", algp->cra_name); - INIT_LIST_HEAD(&algp->cra_list); err = crypto_register_alg(algp); if (err) diff --git a/drivers/crypto/qce/ablkcipher.c b/drivers/crypto/qce/ablkcipher.c index 585e1cab9ae3..25c13e26d012 100644 --- a/drivers/crypto/qce/ablkcipher.c +++ b/drivers/crypto/qce/ablkcipher.c @@ -376,7 +376,6 @@ static int qce_ablkcipher_register_one(const struct qce_ablkcipher_def *def, alg->cra_module = THIS_MODULE; alg->cra_init = qce_ablkcipher_init; alg->cra_exit = qce_ablkcipher_exit; - INIT_LIST_HEAD(&alg->cra_list); INIT_LIST_HEAD(&tmpl->entry); tmpl->crypto_alg_type = CRYPTO_ALG_TYPE_ABLKCIPHER; diff --git a/drivers/crypto/qce/sha.c b/drivers/crypto/qce/sha.c index d8a5db11b7ea..fc45f5ea6fdd 100644 --- a/drivers/crypto/qce/sha.c +++ b/drivers/crypto/qce/sha.c @@ -508,7 +508,6 @@ static int qce_ahash_register_one(const struct qce_ahash_def *def, base->cra_alignmask = 0; base->cra_module = THIS_MODULE; base->cra_init = qce_ahash_cra_init; - INIT_LIST_HEAD(&base->cra_list); snprintf(base->cra_name, CRYPTO_MAX_ALG_NAME, "%s", def->name); snprintf(base->cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s", diff --git a/drivers/crypto/sahara.c b/drivers/crypto/sahara.c index bbf166a97ad3..8c32a3059b4a 100644 --- a/drivers/crypto/sahara.c +++ b/drivers/crypto/sahara.c @@ -1321,7 +1321,6 @@ static int sahara_register_algs(struct sahara_dev *dev) unsigned int i, j, k, l; for (i = 0; i < ARRAY_SIZE(aes_algs); i++) { - INIT_LIST_HEAD(&aes_algs[i].cra_list); err = crypto_register_alg(&aes_algs[i]); if (err) goto err_aes_algs; -- cgit v1.2.3-59-g8ed1b From 1ad0f1603a6b2afb62a1c065409aaa4e43ca7627 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 14 Nov 2018 12:19:39 -0800 Subject: crypto: drop mask=CRYPTO_ALG_ASYNC from 'cipher' tfm allocations 'cipher' algorithms (single block ciphers) are always synchronous, so passing CRYPTO_ALG_ASYNC in the mask to crypto_alloc_cipher() has no effect. Many users therefore already don't pass it, but some still do. This inconsistency can cause confusion, especially since the way the 'mask' argument works is somewhat counterintuitive. Thus, just remove the unneeded CRYPTO_ALG_ASYNC flags. This patch shouldn't change any actual behavior. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/s390/crypto/aes_s390.c | 2 +- drivers/crypto/amcc/crypto4xx_alg.c | 3 +-- drivers/crypto/ccp/ccp-crypto-aes-cmac.c | 4 +--- drivers/crypto/geode-aes.c | 2 +- drivers/md/dm-crypt.c | 2 +- drivers/net/wireless/cisco/airo.c | 2 +- drivers/staging/rtl8192e/rtllib_crypt_ccmp.c | 2 +- drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c | 2 +- drivers/usb/wusbcore/crypto.c | 2 +- net/bluetooth/smp.c | 6 +++--- net/mac80211/wep.c | 4 ++-- net/wireless/lib80211_crypt_ccmp.c | 2 +- net/wireless/lib80211_crypt_tkip.c | 4 ++-- net/wireless/lib80211_crypt_wep.c | 4 ++-- 14 files changed, 19 insertions(+), 22 deletions(-) diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 812d9498d97b..dd456725189f 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -137,7 +137,7 @@ static int fallback_init_cip(struct crypto_tfm *tfm) struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); sctx->fallback.cip = crypto_alloc_cipher(name, 0, - CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK); + CRYPTO_ALG_NEED_FALLBACK); if (IS_ERR(sctx->fallback.cip)) { pr_err("Allocating AES fallback algorithm %s failed\n", diff --git a/drivers/crypto/amcc/crypto4xx_alg.c b/drivers/crypto/amcc/crypto4xx_alg.c index f5c07498ea4f..4092c2aad8e2 100644 --- a/drivers/crypto/amcc/crypto4xx_alg.c +++ b/drivers/crypto/amcc/crypto4xx_alg.c @@ -520,8 +520,7 @@ static int crypto4xx_compute_gcm_hash_key_sw(__le32 *hash_start, const u8 *key, uint8_t src[16] = { 0 }; int rc = 0; - aes_tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC | - CRYPTO_ALG_NEED_FALLBACK); + aes_tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_NEED_FALLBACK); if (IS_ERR(aes_tfm)) { rc = PTR_ERR(aes_tfm); pr_warn("could not load aes cipher driver: %d\n", rc); diff --git a/drivers/crypto/ccp/ccp-crypto-aes-cmac.c b/drivers/crypto/ccp/ccp-crypto-aes-cmac.c index 3c6fe57f91f8..9108015e56cc 100644 --- a/drivers/crypto/ccp/ccp-crypto-aes-cmac.c +++ b/drivers/crypto/ccp/ccp-crypto-aes-cmac.c @@ -346,9 +346,7 @@ static int ccp_aes_cmac_cra_init(struct crypto_tfm *tfm) crypto_ahash_set_reqsize(ahash, sizeof(struct ccp_aes_cmac_req_ctx)); - cipher_tfm = crypto_alloc_cipher("aes", 0, - CRYPTO_ALG_ASYNC | - CRYPTO_ALG_NEED_FALLBACK); + cipher_tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_NEED_FALLBACK); if (IS_ERR(cipher_tfm)) { pr_warn("could not load aes cipher driver\n"); return PTR_ERR(cipher_tfm); diff --git a/drivers/crypto/geode-aes.c b/drivers/crypto/geode-aes.c index eb2a0a73cbed..b4c24a35b3d0 100644 --- a/drivers/crypto/geode-aes.c +++ b/drivers/crypto/geode-aes.c @@ -261,7 +261,7 @@ static int fallback_init_cip(struct crypto_tfm *tfm) struct geode_aes_op *op = crypto_tfm_ctx(tfm); op->fallback.cip = crypto_alloc_cipher(name, 0, - CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK); + CRYPTO_ALG_NEED_FALLBACK); if (IS_ERR(op->fallback.cip)) { printk(KERN_ERR "Error allocating fallback algo %s\n", name); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index b8eec515a003..a7195eb5b8d8 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -377,7 +377,7 @@ static struct crypto_cipher *alloc_essiv_cipher(struct crypt_config *cc, int err; /* Setup the essiv_tfm with the given salt */ - essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); + essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, 0); if (IS_ERR(essiv_tfm)) { ti->error = "Error allocating crypto tfm for ESSIV"; return essiv_tfm; diff --git a/drivers/net/wireless/cisco/airo.c b/drivers/net/wireless/cisco/airo.c index 04dd7a936593..6fab69fe6c92 100644 --- a/drivers/net/wireless/cisco/airo.c +++ b/drivers/net/wireless/cisco/airo.c @@ -1359,7 +1359,7 @@ static int micsetup(struct airo_info *ai) { int i; if (ai->tfm == NULL) - ai->tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + ai->tfm = crypto_alloc_cipher("aes", 0, 0); if (IS_ERR(ai->tfm)) { airo_print_err(ai->dev->name, "failed to load transform for AES"); diff --git a/drivers/staging/rtl8192e/rtllib_crypt_ccmp.c b/drivers/staging/rtl8192e/rtllib_crypt_ccmp.c index bc45cf098b04..91871503364d 100644 --- a/drivers/staging/rtl8192e/rtllib_crypt_ccmp.c +++ b/drivers/staging/rtl8192e/rtllib_crypt_ccmp.c @@ -67,7 +67,7 @@ static void *rtllib_ccmp_init(int key_idx) goto fail; priv->key_idx = key_idx; - priv->tfm = (void *)crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + priv->tfm = (void *)crypto_alloc_cipher("aes", 0, 0); if (IS_ERR(priv->tfm)) { pr_debug("Could not allocate crypto API aes\n"); priv->tfm = NULL; diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c index 041f1b123888..3534ddb900d1 100644 --- a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c +++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c @@ -71,7 +71,7 @@ static void *ieee80211_ccmp_init(int key_idx) goto fail; priv->key_idx = key_idx; - priv->tfm = (void *)crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + priv->tfm = (void *)crypto_alloc_cipher("aes", 0, 0); if (IS_ERR(priv->tfm)) { pr_debug("ieee80211_crypt_ccmp: could not allocate crypto API aes\n"); priv->tfm = NULL; diff --git a/drivers/usb/wusbcore/crypto.c b/drivers/usb/wusbcore/crypto.c index 68ddee86a886..edb7263bff40 100644 --- a/drivers/usb/wusbcore/crypto.c +++ b/drivers/usb/wusbcore/crypto.c @@ -316,7 +316,7 @@ ssize_t wusb_prf(void *out, size_t out_size, goto error_setkey_cbc; } - tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + tfm_aes = crypto_alloc_cipher("aes", 0, 0); if (IS_ERR(tfm_aes)) { result = PTR_ERR(tfm_aes); printk(KERN_ERR "E: can't load AES: %d\n", (int)result); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index c822e626761b..1f94a25beef6 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1390,7 +1390,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) if (!smp) return NULL; - smp->tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + smp->tfm_aes = crypto_alloc_cipher("aes", 0, 0); if (IS_ERR(smp->tfm_aes)) { BT_ERR("Unable to create AES crypto context"); goto zfree_smp; @@ -3233,7 +3233,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) if (!smp) return ERR_PTR(-ENOMEM); - tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + tfm_aes = crypto_alloc_cipher("aes", 0, 0); if (IS_ERR(tfm_aes)) { BT_ERR("Unable to create AES crypto context"); kzfree(smp); @@ -3906,7 +3906,7 @@ int __init bt_selftest_smp(void) struct crypto_kpp *tfm_ecdh; int err; - tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + tfm_aes = crypto_alloc_cipher("aes", 0, 0); if (IS_ERR(tfm_aes)) { BT_ERR("Unable to create AES crypto context"); return PTR_ERR(tfm_aes); diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c index 73e8f347802e..bfe9ed9f4c48 100644 --- a/net/mac80211/wep.c +++ b/net/mac80211/wep.c @@ -30,13 +30,13 @@ int ieee80211_wep_init(struct ieee80211_local *local) /* start WEP IV from a random value */ get_random_bytes(&local->wep_iv, IEEE80211_WEP_IV_LEN); - local->wep_tx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); + local->wep_tx_tfm = crypto_alloc_cipher("arc4", 0, 0); if (IS_ERR(local->wep_tx_tfm)) { local->wep_rx_tfm = ERR_PTR(-EINVAL); return PTR_ERR(local->wep_tx_tfm); } - local->wep_rx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); + local->wep_rx_tfm = crypto_alloc_cipher("arc4", 0, 0); if (IS_ERR(local->wep_rx_tfm)) { crypto_free_cipher(local->wep_tx_tfm); local->wep_tx_tfm = ERR_PTR(-EINVAL); diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c index 6beab0cfcb99..55214fe925b2 100644 --- a/net/wireless/lib80211_crypt_ccmp.c +++ b/net/wireless/lib80211_crypt_ccmp.c @@ -75,7 +75,7 @@ static void *lib80211_ccmp_init(int key_idx) goto fail; priv->key_idx = key_idx; - priv->tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + priv->tfm = crypto_alloc_cipher("aes", 0, 0); if (IS_ERR(priv->tfm)) { priv->tfm = NULL; goto fail; diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c index b5e235573c8a..35f06563207d 100644 --- a/net/wireless/lib80211_crypt_tkip.c +++ b/net/wireless/lib80211_crypt_tkip.c @@ -99,7 +99,7 @@ static void *lib80211_tkip_init(int key_idx) priv->key_idx = key_idx; - priv->tx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); + priv->tx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, 0); if (IS_ERR(priv->tx_tfm_arc4)) { priv->tx_tfm_arc4 = NULL; goto fail; @@ -111,7 +111,7 @@ static void *lib80211_tkip_init(int key_idx) goto fail; } - priv->rx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); + priv->rx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, 0); if (IS_ERR(priv->rx_tfm_arc4)) { priv->rx_tfm_arc4 = NULL; goto fail; diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c index 6015f6b542a6..20c1ad63ad44 100644 --- a/net/wireless/lib80211_crypt_wep.c +++ b/net/wireless/lib80211_crypt_wep.c @@ -48,13 +48,13 @@ static void *lib80211_wep_init(int keyidx) goto fail; priv->key_idx = keyidx; - priv->tx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); + priv->tx_tfm = crypto_alloc_cipher("arc4", 0, 0); if (IS_ERR(priv->tx_tfm)) { priv->tx_tfm = NULL; goto fail; } - priv->rx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); + priv->rx_tfm = crypto_alloc_cipher("arc4", 0, 0); if (IS_ERR(priv->rx_tfm)) { priv->rx_tfm = NULL; goto fail; -- cgit v1.2.3-59-g8ed1b From 3d234b3313cd12157946522fe35f5a4574f31169 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 14 Nov 2018 12:21:11 -0800 Subject: crypto: drop mask=CRYPTO_ALG_ASYNC from 'shash' tfm allocations 'shash' algorithms are always synchronous, so passing CRYPTO_ALG_ASYNC in the mask to crypto_alloc_shash() has no effect. Many users therefore already don't pass it, but some still do. This inconsistency can cause confusion, especially since the way the 'mask' argument works is somewhat counterintuitive. Thus, just remove the unneeded CRYPTO_ALG_ASYNC flags. This patch shouldn't change any actual behavior. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/md/dm-integrity.c | 2 +- drivers/net/wireless/intersil/orinoco/mic.c | 6 ++---- fs/ubifs/auth.c | 5 ++--- net/bluetooth/smp.c | 2 +- security/apparmor/crypto.c | 2 +- security/integrity/evm/evm_crypto.c | 3 +-- security/keys/encrypted-keys/encrypted.c | 4 ++-- security/keys/trusted.c | 4 ++-- 9 files changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 61c392752fe4..ccfcf00f2798 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3623,7 +3623,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in * change. */ - peer_integrity_tfm = crypto_alloc_shash(integrity_alg, 0, CRYPTO_ALG_ASYNC); + peer_integrity_tfm = crypto_alloc_shash(integrity_alg, 0, 0); if (IS_ERR(peer_integrity_tfm)) { peer_integrity_tfm = NULL; drbd_err(connection, "peer data-integrity-alg %s not supported\n", diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index bb3096bf2cc6..d4ad0bfee251 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2804,7 +2804,7 @@ static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error, int r; if (a->alg_string) { - *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ASYNC); + *hash = crypto_alloc_shash(a->alg_string, 0, 0); if (IS_ERR(*hash)) { *error = error_alg; r = PTR_ERR(*hash); diff --git a/drivers/net/wireless/intersil/orinoco/mic.c b/drivers/net/wireless/intersil/orinoco/mic.c index 08bc7822f820..709d9ab3e7bc 100644 --- a/drivers/net/wireless/intersil/orinoco/mic.c +++ b/drivers/net/wireless/intersil/orinoco/mic.c @@ -16,8 +16,7 @@ /********************************************************************/ int orinoco_mic_init(struct orinoco_private *priv) { - priv->tx_tfm_mic = crypto_alloc_shash("michael_mic", 0, - CRYPTO_ALG_ASYNC); + priv->tx_tfm_mic = crypto_alloc_shash("michael_mic", 0, 0); if (IS_ERR(priv->tx_tfm_mic)) { printk(KERN_DEBUG "orinoco_mic_init: could not allocate " "crypto API michael_mic\n"); @@ -25,8 +24,7 @@ int orinoco_mic_init(struct orinoco_private *priv) return -ENOMEM; } - priv->rx_tfm_mic = crypto_alloc_shash("michael_mic", 0, - CRYPTO_ALG_ASYNC); + priv->rx_tfm_mic = crypto_alloc_shash("michael_mic", 0, 0); if (IS_ERR(priv->rx_tfm_mic)) { printk(KERN_DEBUG "orinoco_mic_init: could not allocate " "crypto API michael_mic\n"); diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index 124e965a28b3..5bf5fd08879e 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -269,8 +269,7 @@ int ubifs_init_authentication(struct ubifs_info *c) goto out; } - c->hash_tfm = crypto_alloc_shash(c->auth_hash_name, 0, - CRYPTO_ALG_ASYNC); + c->hash_tfm = crypto_alloc_shash(c->auth_hash_name, 0, 0); if (IS_ERR(c->hash_tfm)) { err = PTR_ERR(c->hash_tfm); ubifs_err(c, "Can not allocate %s: %d", @@ -286,7 +285,7 @@ int ubifs_init_authentication(struct ubifs_info *c) goto out_free_hash; } - c->hmac_tfm = crypto_alloc_shash(hmac_name, 0, CRYPTO_ALG_ASYNC); + c->hmac_tfm = crypto_alloc_shash(hmac_name, 0, 0); if (IS_ERR(c->hmac_tfm)) { err = PTR_ERR(c->hmac_tfm); ubifs_err(c, "Can not allocate %s: %d", hmac_name, err); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 1f94a25beef6..621146d04c03 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -3912,7 +3912,7 @@ int __init bt_selftest_smp(void) return PTR_ERR(tfm_aes); } - tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, CRYPTO_ALG_ASYNC); + tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); crypto_free_cipher(tfm_aes); diff --git a/security/apparmor/crypto.c b/security/apparmor/crypto.c index 136f2a047836..af03d98c7552 100644 --- a/security/apparmor/crypto.c +++ b/security/apparmor/crypto.c @@ -112,7 +112,7 @@ static int __init init_profile_hash(void) if (!apparmor_initialized) return 0; - tfm = crypto_alloc_shash("sha1", 0, CRYPTO_ALG_ASYNC); + tfm = crypto_alloc_shash("sha1", 0, 0); if (IS_ERR(tfm)) { int error = PTR_ERR(tfm); AA_ERROR("failed to setup profile sha1 hashing: %d\n", error); diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c index 8c25f949ebdb..1820099dc74b 100644 --- a/security/integrity/evm/evm_crypto.c +++ b/security/integrity/evm/evm_crypto.c @@ -97,8 +97,7 @@ static struct shash_desc *init_desc(char type, uint8_t hash_algo) mutex_lock(&mutex); if (*tfm) goto out; - *tfm = crypto_alloc_shash(algo, 0, - CRYPTO_ALG_ASYNC | CRYPTO_NOLOAD); + *tfm = crypto_alloc_shash(algo, 0, CRYPTO_NOLOAD); if (IS_ERR(*tfm)) { rc = PTR_ERR(*tfm); pr_err("Can not allocate %s (reason: %ld)\n", algo, rc); diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index d92cbf9687c3..a3891ae9fa0f 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -342,7 +342,7 @@ static int calc_hmac(u8 *digest, const u8 *key, unsigned int keylen, struct crypto_shash *tfm; int err; - tfm = crypto_alloc_shash(hmac_alg, 0, CRYPTO_ALG_ASYNC); + tfm = crypto_alloc_shash(hmac_alg, 0, 0); if (IS_ERR(tfm)) { pr_err("encrypted_key: can't alloc %s transform: %ld\n", hmac_alg, PTR_ERR(tfm)); @@ -984,7 +984,7 @@ static int __init init_encrypted(void) { int ret; - hash_tfm = crypto_alloc_shash(hash_alg, 0, CRYPTO_ALG_ASYNC); + hash_tfm = crypto_alloc_shash(hash_alg, 0, 0); if (IS_ERR(hash_tfm)) { pr_err("encrypted_key: can't allocate %s transform: %ld\n", hash_alg, PTR_ERR(hash_tfm)); diff --git a/security/keys/trusted.c b/security/keys/trusted.c index ff6789365a12..2d4f16b2a7c5 100644 --- a/security/keys/trusted.c +++ b/security/keys/trusted.c @@ -1199,14 +1199,14 @@ static int __init trusted_shash_alloc(void) { int ret; - hmacalg = crypto_alloc_shash(hmac_alg, 0, CRYPTO_ALG_ASYNC); + hmacalg = crypto_alloc_shash(hmac_alg, 0, 0); if (IS_ERR(hmacalg)) { pr_info("trusted_key: could not allocate crypto %s\n", hmac_alg); return PTR_ERR(hmacalg); } - hashalg = crypto_alloc_shash(hash_alg, 0, CRYPTO_ALG_ASYNC); + hashalg = crypto_alloc_shash(hash_alg, 0, 0); if (IS_ERR(hashalg)) { pr_info("trusted_key: could not allocate crypto %s\n", hash_alg); -- cgit v1.2.3-59-g8ed1b From dd333449d0fb667c5250c42488a7e90470e16c77 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Nov 2018 17:26:18 -0800 Subject: crypto: chacha20-generic - add HChaCha20 library function Refactor the unkeyed permutation part of chacha20_block() into its own function, then add hchacha20_block() which is the ChaCha equivalent of HSalsa20 and is an intermediate step towards XChaCha20 (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha20 skips the final addition of the initial state, and outputs only certain words of the state. It should not be used for streaming directly. Reviewed-by: Ard Biesheuvel Acked-by: Martin Willi Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- include/crypto/chacha20.h | 2 ++ lib/chacha20.c | 50 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h index 2d3129442a52..56073814eef0 100644 --- a/include/crypto/chacha20.h +++ b/include/crypto/chacha20.h @@ -20,6 +20,8 @@ struct chacha20_ctx { }; void chacha20_block(u32 *state, u8 *stream); +void hchacha20_block(const u32 *in, u32 *out); + void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv); int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize); diff --git a/lib/chacha20.c b/lib/chacha20.c index d907fec6a9ed..6a484e16171d 100644 --- a/lib/chacha20.c +++ b/lib/chacha20.c @@ -1,5 +1,5 @@ /* - * ChaCha20 256-bit cipher algorithm, RFC7539 + * The "hash function" used as the core of the ChaCha20 stream cipher (RFC7539) * * Copyright (C) 2015 Martin Willi * @@ -16,14 +16,10 @@ #include #include -void chacha20_block(u32 *state, u8 *stream) +static void chacha20_permute(u32 *x) { - u32 x[16]; int i; - for (i = 0; i < ARRAY_SIZE(x); i++) - x[i] = state[i]; - for (i = 0; i < 20; i += 2) { x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16); x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16); @@ -65,6 +61,25 @@ void chacha20_block(u32 *state, u8 *stream) x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7); x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7); } +} + +/** + * chacha20_block - generate one keystream block and increment block counter + * @state: input state matrix (16 32-bit words) + * @stream: output keystream block (64 bytes) + * + * This is the ChaCha20 core, a function from 64-byte strings to 64-byte + * strings. The caller has already converted the endianness of the input. This + * function also handles incrementing the block counter in the input matrix. + */ +void chacha20_block(u32 *state, u8 *stream) +{ + u32 x[16]; + int i; + + memcpy(x, state, 64); + + chacha20_permute(x); for (i = 0; i < ARRAY_SIZE(x); i++) put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]); @@ -72,3 +87,26 @@ void chacha20_block(u32 *state, u8 *stream) state[12]++; } EXPORT_SYMBOL(chacha20_block); + +/** + * hchacha20_block - abbreviated ChaCha20 core, for XChaCha20 + * @in: input state matrix (16 32-bit words) + * @out: output (8 32-bit words) + * + * HChaCha20 is the ChaCha equivalent of HSalsa20 and is an intermediate step + * towards XChaCha20 (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). + * HChaCha20 skips the final addition of the initial state, and outputs only + * certain words of the state. It should not be used for streaming directly. + */ +void hchacha20_block(const u32 *in, u32 *out) +{ + u32 x[16]; + + memcpy(x, in, 64); + + chacha20_permute(x); + + memcpy(&out[0], &x[0], 16); + memcpy(&out[4], &x[12], 16); +} +EXPORT_SYMBOL(hchacha20_block); -- cgit v1.2.3-59-g8ed1b From 5e04542a0e0763294e9fced73a149c38c4e0cee5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Nov 2018 17:26:19 -0800 Subject: crypto: chacha20-generic - don't unnecessarily use atomic walk chacha20-generic doesn't use SIMD instructions or otherwise disable preemption, so passing atomic=true to skcipher_walk_virt() is unnecessary. Suggested-by: Ard Biesheuvel Acked-by: Martin Willi Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/chacha20_generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c index 3ae96587caf9..3529521d72a4 100644 --- a/crypto/chacha20_generic.c +++ b/crypto/chacha20_generic.c @@ -81,7 +81,7 @@ int crypto_chacha20_crypt(struct skcipher_request *req) u32 state[16]; int err; - err = skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); crypto_chacha20_init(state, ctx, walk.iv); -- cgit v1.2.3-59-g8ed1b From de61d7ae5d3789dcba3749a418f76613fbee8414 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Nov 2018 17:26:20 -0800 Subject: crypto: chacha20-generic - add XChaCha20 support Add support for the XChaCha20 stream cipher. XChaCha20 is the application of the XSalsa20 construction (https://cr.yp.to/snuffle/xsalsa-20081128.pdf) to ChaCha20 rather than to Salsa20. XChaCha20 extends ChaCha20's nonce length from 64 bits (or 96 bits, depending on convention) to 192 bits, while provably retaining ChaCha20's security. XChaCha20 uses the ChaCha20 permutation to map the key and first 128 nonce bits to a 256-bit subkey. Then, it does the ChaCha20 stream cipher with the subkey and remaining 64 bits of nonce. We need XChaCha support in order to add support for the Adiantum encryption mode. Note that to meet our performance requirements, we actually plan to primarily use the variant XChaCha12. But we believe it's wise to first add XChaCha20 as a baseline with a higher security margin, in case there are any situations where it can be used. Supporting both variants is straightforward. Since XChaCha20's subkey differs for each request, XChaCha20 can't be a template that wraps ChaCha20; that would require re-keying the underlying ChaCha20 for every request, which wouldn't be thread-safe. Instead, we make XChaCha20 its own top-level algorithm which calls the ChaCha20 streaming implementation internally. Similar to the existing ChaCha20 implementation, we define the IV to be the nonce and stream position concatenated together. This allows users to seek to any position in the stream. I considered splitting the code into separate chacha20-common, chacha20, and xchacha20 modules, so that chacha20 and xchacha20 could be enabled/disabled independently. However, since nearly all the code is shared anyway, I ultimately decided there would have been little benefit to the added complexity of separate modules. Reviewed-by: Ard Biesheuvel Acked-by: Martin Willi Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/Kconfig | 14 +- crypto/chacha20_generic.c | 120 +++++++--- crypto/testmgr.c | 6 + crypto/testmgr.h | 577 ++++++++++++++++++++++++++++++++++++++++++++++ include/crypto/chacha20.h | 14 +- 5 files changed, 689 insertions(+), 42 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index 62dbd1a99fa3..75ebd1a2746c 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1403,18 +1403,22 @@ config CRYPTO_SALSA20 Bernstein . See config CRYPTO_CHACHA20 - tristate "ChaCha20 cipher algorithm" + tristate "ChaCha20 stream cipher algorithms" select CRYPTO_BLKCIPHER help - ChaCha20 cipher algorithm, RFC7539. + The ChaCha20 and XChaCha20 stream cipher algorithms. ChaCha20 is a 256-bit high-speed stream cipher designed by Daniel J. Bernstein and further specified in RFC7539 for use in IETF protocols. - This is the portable C implementation of ChaCha20. - - See also: + This is the portable C implementation of ChaCha20. See also: + XChaCha20 is the application of the XSalsa20 construction to ChaCha20 + rather than to Salsa20. XChaCha20 extends ChaCha20's nonce length + from 64 bits (or 96 bits using the RFC7539 convention) to 192 bits, + while provably retaining ChaCha20's security. See also: + + config CRYPTO_CHACHA20_X86_64 tristate "ChaCha20 cipher algorithm (x86_64/SSSE3/AVX2)" depends on X86 && 64BIT diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c index 3529521d72a4..4305b1b62b16 100644 --- a/crypto/chacha20_generic.c +++ b/crypto/chacha20_generic.c @@ -1,7 +1,8 @@ /* - * ChaCha20 256-bit cipher algorithm, RFC7539 + * ChaCha20 (RFC7539) and XChaCha20 stream cipher algorithms * * Copyright (C) 2015 Martin Willi + * Copyright (C) 2018 Google LLC * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -36,6 +37,31 @@ static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src, } } +static int chacha20_stream_xor(struct skcipher_request *req, + struct chacha20_ctx *ctx, u8 *iv) +{ + struct skcipher_walk walk; + u32 state[16]; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + crypto_chacha20_init(state, ctx, iv); + + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); + + chacha20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr, + nbytes); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + return err; +} + void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv) { state[0] = 0x61707865; /* "expa" */ @@ -77,54 +103,74 @@ int crypto_chacha20_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - u32 state[16]; - int err; - - err = skcipher_walk_virt(&walk, req, false); - crypto_chacha20_init(state, ctx, walk.iv); + return chacha20_stream_xor(req, ctx, req->iv); +} +EXPORT_SYMBOL_GPL(crypto_chacha20_crypt); - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; +int crypto_xchacha20_crypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha20_ctx subctx; + u32 state[16]; + u8 real_iv[16]; - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); + /* Compute the subkey given the original key and first 128 nonce bits */ + crypto_chacha20_init(state, ctx, req->iv); + hchacha20_block(state, subctx.key); - chacha20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr, - nbytes); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } + /* Build the real IV */ + memcpy(&real_iv[0], req->iv + 24, 8); /* stream position */ + memcpy(&real_iv[8], req->iv + 16, 8); /* remaining 64 nonce bits */ - return err; + /* Generate the stream and XOR it with the data */ + return chacha20_stream_xor(req, &subctx, real_iv); } -EXPORT_SYMBOL_GPL(crypto_chacha20_crypt); - -static struct skcipher_alg alg = { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-generic", - .base.cra_priority = 100, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha20_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA20_KEY_SIZE, - .max_keysize = CHACHA20_KEY_SIZE, - .ivsize = CHACHA20_IV_SIZE, - .chunksize = CHACHA20_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = crypto_chacha20_crypt, - .decrypt = crypto_chacha20_crypt, +EXPORT_SYMBOL_GPL(crypto_xchacha20_crypt); + +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-generic", + .base.cra_priority = 100, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha20_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA20_KEY_SIZE, + .max_keysize = CHACHA20_KEY_SIZE, + .ivsize = CHACHA20_IV_SIZE, + .chunksize = CHACHA20_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = crypto_chacha20_crypt, + .decrypt = crypto_chacha20_crypt, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-generic", + .base.cra_priority = 100, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha20_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA20_KEY_SIZE, + .max_keysize = CHACHA20_KEY_SIZE, + .ivsize = XCHACHA20_IV_SIZE, + .chunksize = CHACHA20_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = crypto_xchacha20_crypt, + .decrypt = crypto_xchacha20_crypt, + } }; static int __init chacha20_generic_mod_init(void) { - return crypto_register_skcipher(&alg); + return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); } static void __exit chacha20_generic_mod_fini(void) { - crypto_unregister_skcipher(&alg); + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } module_init(chacha20_generic_mod_init); @@ -132,6 +178,8 @@ module_exit(chacha20_generic_mod_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Willi "); -MODULE_DESCRIPTION("chacha20 cipher algorithm"); +MODULE_DESCRIPTION("ChaCha20 and XChaCha20 stream ciphers (generic)"); MODULE_ALIAS_CRYPTO("chacha20"); MODULE_ALIAS_CRYPTO("chacha20-generic"); +MODULE_ALIAS_CRYPTO("xchacha20"); +MODULE_ALIAS_CRYPTO("xchacha20-generic"); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 379794a259a7..11f5c8b0f4dc 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -3576,6 +3576,12 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .hash = __VECS(aes_xcbc128_tv_template) } + }, { + .alg = "xchacha20", + .test = alg_test_skcipher, + .suite = { + .cipher = __VECS(xchacha20_tv_template) + }, }, { .alg = "xts(aes)", .test = alg_test_skcipher, diff --git a/crypto/testmgr.h b/crypto/testmgr.h index fc1b6c0e9ed2..df0dc44a9b7b 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -30994,6 +30994,583 @@ static const struct cipher_testvec chacha20_tv_template[] = { }, }; +static const struct cipher_testvec xchacha20_tv_template[] = { + { /* from libsodium test/default/xchacha20.c */ + .key = "\x79\xc9\x97\x98\xac\x67\x30\x0b" + "\xbb\x27\x04\xc9\x5c\x34\x1e\x32" + "\x45\xf3\xdc\xb2\x17\x61\xb9\x8e" + "\x52\xff\x45\xb2\x4f\x30\x4f\xc4", + .klen = 32, + .iv = "\xb3\x3f\xfd\x30\x96\x47\x9b\xcf" + "\xbc\x9a\xee\x49\x41\x76\x88\xa0" + "\xa2\x55\x4f\x8d\x95\x38\x94\x19" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00", + .ctext = "\xc6\xe9\x75\x81\x60\x08\x3a\xc6" + "\x04\xef\x90\xe7\x12\xce\x6e\x75" + "\xd7\x79\x75\x90\x74\x4e\x0c\xf0" + "\x60\xf0\x13\x73\x9c", + .len = 29, + }, { /* from libsodium test/default/xchacha20.c */ + .key = "\x9d\x23\xbd\x41\x49\xcb\x97\x9c" + "\xcf\x3c\x5c\x94\xdd\x21\x7e\x98" + "\x08\xcb\x0e\x50\xcd\x0f\x67\x81" + "\x22\x35\xea\xaf\x60\x1d\x62\x32", + .klen = 32, + .iv = "\xc0\x47\x54\x82\x66\xb7\xc3\x70" + "\xd3\x35\x66\xa2\x42\x5c\xbf\x30" + "\xd8\x2d\x1e\xaf\x52\x94\x10\x9e" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00", + .ctext = "\xa2\x12\x09\x09\x65\x94\xde\x8c" + "\x56\x67\xb1\xd1\x3a\xd9\x3f\x74" + "\x41\x06\xd0\x54\xdf\x21\x0e\x47" + "\x82\xcd\x39\x6f\xec\x69\x2d\x35" + "\x15\xa2\x0b\xf3\x51\xee\xc0\x11" + "\xa9\x2c\x36\x78\x88\xbc\x46\x4c" + "\x32\xf0\x80\x7a\xcd\x6c\x20\x3a" + "\x24\x7e\x0d\xb8\x54\x14\x84\x68" + "\xe9\xf9\x6b\xee\x4c\xf7\x18\xd6" + "\x8d\x5f\x63\x7c\xbd\x5a\x37\x64" + "\x57\x78\x8e\x6f\xae\x90\xfc\x31" + "\x09\x7c\xfc", + .len = 91, + }, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes + to nonce, and recomputed the ciphertext with libsodium */ + .key = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .klen = 32, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x67\xc6\x69\x73" + "\x51\xff\x4a\xec\x29\xcd\xba\xab" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .ctext = "\x9c\x49\x2a\xe7\x8a\x2f\x93\xc7" + "\xb3\x33\x6f\x82\x17\xd8\xc4\x1e" + "\xad\x80\x11\x11\x1d\x4c\x16\x18" + "\x07\x73\x9b\x4f\xdb\x7c\xcb\x47" + "\xfd\xef\x59\x74\xfa\x3f\xe5\x4c" + "\x9b\xd0\xea\xbc\xba\x56\xad\x32" + "\x03\xdc\xf8\x2b\xc1\xe1\x75\x67" + "\x23\x7b\xe6\xfc\xd4\x03\x86\x54", + .len = 64, + }, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes + to nonce, and recomputed the ciphertext with libsodium */ + .key = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x01", + .klen = 32, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x02\xf2\xfb\xe3\x46" + "\x7c\xc2\x54\xf8\x1b\xe8\xe7\x8d" + "\x01\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x41\x6e\x79\x20\x73\x75\x62\x6d" + "\x69\x73\x73\x69\x6f\x6e\x20\x74" + "\x6f\x20\x74\x68\x65\x20\x49\x45" + "\x54\x46\x20\x69\x6e\x74\x65\x6e" + "\x64\x65\x64\x20\x62\x79\x20\x74" + "\x68\x65\x20\x43\x6f\x6e\x74\x72" + "\x69\x62\x75\x74\x6f\x72\x20\x66" + "\x6f\x72\x20\x70\x75\x62\x6c\x69" + "\x63\x61\x74\x69\x6f\x6e\x20\x61" + "\x73\x20\x61\x6c\x6c\x20\x6f\x72" + "\x20\x70\x61\x72\x74\x20\x6f\x66" + "\x20\x61\x6e\x20\x49\x45\x54\x46" + "\x20\x49\x6e\x74\x65\x72\x6e\x65" + "\x74\x2d\x44\x72\x61\x66\x74\x20" + "\x6f\x72\x20\x52\x46\x43\x20\x61" + "\x6e\x64\x20\x61\x6e\x79\x20\x73" + "\x74\x61\x74\x65\x6d\x65\x6e\x74" + "\x20\x6d\x61\x64\x65\x20\x77\x69" + "\x74\x68\x69\x6e\x20\x74\x68\x65" + "\x20\x63\x6f\x6e\x74\x65\x78\x74" + "\x20\x6f\x66\x20\x61\x6e\x20\x49" + "\x45\x54\x46\x20\x61\x63\x74\x69" + "\x76\x69\x74\x79\x20\x69\x73\x20" + "\x63\x6f\x6e\x73\x69\x64\x65\x72" + "\x65\x64\x20\x61\x6e\x20\x22\x49" + "\x45\x54\x46\x20\x43\x6f\x6e\x74" + "\x72\x69\x62\x75\x74\x69\x6f\x6e" + "\x22\x2e\x20\x53\x75\x63\x68\x20" + "\x73\x74\x61\x74\x65\x6d\x65\x6e" + "\x74\x73\x20\x69\x6e\x63\x6c\x75" + "\x64\x65\x20\x6f\x72\x61\x6c\x20" + "\x73\x74\x61\x74\x65\x6d\x65\x6e" + "\x74\x73\x20\x69\x6e\x20\x49\x45" + "\x54\x46\x20\x73\x65\x73\x73\x69" + "\x6f\x6e\x73\x2c\x20\x61\x73\x20" + "\x77\x65\x6c\x6c\x20\x61\x73\x20" + "\x77\x72\x69\x74\x74\x65\x6e\x20" + "\x61\x6e\x64\x20\x65\x6c\x65\x63" + "\x74\x72\x6f\x6e\x69\x63\x20\x63" + "\x6f\x6d\x6d\x75\x6e\x69\x63\x61" + "\x74\x69\x6f\x6e\x73\x20\x6d\x61" + "\x64\x65\x20\x61\x74\x20\x61\x6e" + "\x79\x20\x74\x69\x6d\x65\x20\x6f" + "\x72\x20\x70\x6c\x61\x63\x65\x2c" + "\x20\x77\x68\x69\x63\x68\x20\x61" + "\x72\x65\x20\x61\x64\x64\x72\x65" + "\x73\x73\x65\x64\x20\x74\x6f", + .ctext = "\xf9\xab\x7a\x4a\x60\xb8\x5f\xa0" + "\x50\xbb\x57\xce\xef\x8c\xc1\xd9" + "\x24\x15\xb3\x67\x5e\x7f\x01\xf6" + "\x1c\x22\xf6\xe5\x71\xb1\x43\x64" + "\x63\x05\xd5\xfc\x5c\x3d\xc0\x0e" + "\x23\xef\xd3\x3b\xd9\xdc\x7f\xa8" + "\x58\x26\xb3\xd0\xc2\xd5\x04\x3f" + "\x0a\x0e\x8f\x17\xe4\xcd\xf7\x2a" + "\xb4\x2c\x09\xe4\x47\xec\x8b\xfb" + "\x59\x37\x7a\xa1\xd0\x04\x7e\xaa" + "\xf1\x98\x5f\x24\x3d\x72\x9a\x43" + "\xa4\x36\x51\x92\x22\x87\xff\x26" + "\xce\x9d\xeb\x59\x78\x84\x5e\x74" + "\x97\x2e\x63\xc0\xef\x29\xf7\x8a" + "\xb9\xee\x35\x08\x77\x6a\x35\x9a" + "\x3e\xe6\x4f\x06\x03\x74\x1b\xc1" + "\x5b\xb3\x0b\x89\x11\x07\xd3\xb7" + "\x53\xd6\x25\x04\xd9\x35\xb4\x5d" + "\x4c\x33\x5a\xc2\x42\x4c\xe6\xa4" + "\x97\x6e\x0e\xd2\xb2\x8b\x2f\x7f" + "\x28\xe5\x9f\xac\x4b\x2e\x02\xab" + "\x85\xfa\xa9\x0d\x7c\x2d\x10\xe6" + "\x91\xab\x55\x63\xf0\xde\x3a\x94" + "\x25\x08\x10\x03\xc2\x68\xd1\xf4" + "\xaf\x7d\x9c\x99\xf7\x86\x96\x30" + "\x60\xfc\x0b\xe6\xa8\x80\x15\xb0" + "\x81\xb1\x0c\xbe\xb9\x12\x18\x25" + "\xe9\x0e\xb1\xe7\x23\xb2\xef\x4a" + "\x22\x8f\xc5\x61\x89\xd4\xe7\x0c" + "\x64\x36\x35\x61\xb6\x34\x60\xf7" + "\x7b\x61\x37\x37\x12\x10\xa2\xf6" + "\x7e\xdb\x7f\x39\x3f\xb6\x8e\x89" + "\x9e\xf3\xfe\x13\x98\xbb\x66\x5a" + "\xec\xea\xab\x3f\x9c\x87\xc4\x8c" + "\x8a\x04\x18\x49\xfc\x77\x11\x50" + "\x16\xe6\x71\x2b\xee\xc0\x9c\xb6" + "\x87\xfd\x80\xff\x0b\x1d\x73\x38" + "\xa4\x1d\x6f\xae\xe4\x12\xd7\x93" + "\x9d\xcd\x38\x26\x09\x40\x52\xcd" + "\x67\x01\x67\x26\xe0\x3e\x98\xa8" + "\xe8\x1a\x13\x41\xbb\x90\x4d\x87" + "\xbb\x42\x82\x39\xce\x3a\xd0\x18" + "\x6d\x7b\x71\x8f\xbb\x2c\x6a\xd1" + "\xbd\xf5\xc7\x8a\x7e\xe1\x1e\x0f" + "\x0d\x0d\x13\x7c\xd9\xd8\x3c\x91" + "\xab\xff\x1f\x12\xc3\xee\xe5\x65" + "\x12\x8d\x7b\x61\xe5\x1f\x98", + .len = 375, + .also_non_np = 1, + .np = 3, + .tap = { 375 - 20, 4, 16 }, + + }, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes + to nonce, and recomputed the ciphertext with libsodium */ + .key = "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a" + "\xf3\x33\x88\x86\x04\xf6\xb5\xf0" + "\x47\x39\x17\xc1\x40\x2b\x80\x09" + "\x9d\xca\x5c\xbc\x20\x70\x75\xc0", + .klen = 32, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x02\x76\x5a\x2e\x63" + "\x33\x9f\xc9\x9a\x66\x32\x0d\xb7" + "\x2a\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x27\x54\x77\x61\x73\x20\x62\x72" + "\x69\x6c\x6c\x69\x67\x2c\x20\x61" + "\x6e\x64\x20\x74\x68\x65\x20\x73" + "\x6c\x69\x74\x68\x79\x20\x74\x6f" + "\x76\x65\x73\x0a\x44\x69\x64\x20" + "\x67\x79\x72\x65\x20\x61\x6e\x64" + "\x20\x67\x69\x6d\x62\x6c\x65\x20" + "\x69\x6e\x20\x74\x68\x65\x20\x77" + "\x61\x62\x65\x3a\x0a\x41\x6c\x6c" + "\x20\x6d\x69\x6d\x73\x79\x20\x77" + "\x65\x72\x65\x20\x74\x68\x65\x20" + "\x62\x6f\x72\x6f\x67\x6f\x76\x65" + "\x73\x2c\x0a\x41\x6e\x64\x20\x74" + "\x68\x65\x20\x6d\x6f\x6d\x65\x20" + "\x72\x61\x74\x68\x73\x20\x6f\x75" + "\x74\x67\x72\x61\x62\x65\x2e", + .ctext = "\x95\xb9\x51\xe7\x8f\xb4\xa4\x03" + "\xca\x37\xcc\xde\x60\x1d\x8c\xe2" + "\xf1\xbb\x8a\x13\x7f\x61\x85\xcc" + "\xad\xf4\xf0\xdc\x86\xa6\x1e\x10" + "\xbc\x8e\xcb\x38\x2b\xa5\xc8\x8f" + "\xaa\x03\x3d\x53\x4a\x42\xb1\x33" + "\xfc\xd3\xef\xf0\x8e\x7e\x10\x9c" + "\x6f\x12\x5e\xd4\x96\xfe\x5b\x08" + "\xb6\x48\xf0\x14\x74\x51\x18\x7c" + "\x07\x92\xfc\xac\x9d\xf1\x94\xc0" + "\xc1\x9d\xc5\x19\x43\x1f\x1d\xbb" + "\x07\xf0\x1b\x14\x25\x45\xbb\xcb" + "\x5c\xe2\x8b\x28\xf3\xcf\x47\x29" + "\x27\x79\x67\x24\xa6\x87\xc2\x11" + "\x65\x03\xfa\x45\xf7\x9e\x53\x7a" + "\x99\xf1\x82\x25\x4f\x8d\x07", + .len = 127, + }, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes + to nonce, and recomputed the ciphertext with libsodium */ + .key = "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a" + "\xf3\x33\x88\x86\x04\xf6\xb5\xf0" + "\x47\x39\x17\xc1\x40\x2b\x80\x09" + "\x9d\xca\x5c\xbc\x20\x70\x75\xc0", + .klen = 32, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x01\x31\x58\xa3\x5a" + "\x25\x5d\x05\x17\x58\xe9\x5e\xd4" + "\x1c\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x49\xee\xe0\xdc\x24\x90\x40\xcd" + "\xc5\x40\x8f\x47\x05\xbc\xdd\x81" + "\x47\xc6\x8d\xe6\xb1\x8f\xd7\xcb" + "\x09\x0e\x6e\x22\x48\x1f\xbf\xb8" + "\x5c\xf7\x1e\x8a\xc1\x23\xf2\xd4" + "\x19\x4b\x01\x0f\x4e\xa4\x43\xce" + "\x01\xc6\x67\xda\x03\x91\x18\x90" + "\xa5\xa4\x8e\x45\x03\xb3\x2d\xac" + "\x74\x92\xd3\x53\x47\xc8\xdd\x25" + "\x53\x6c\x02\x03\x87\x0d\x11\x0c" + "\x58\xe3\x12\x18\xfd\x2a\x5b\x40" + "\x0c\x30\xf0\xb8\x3f\x43\xce\xae" + "\x65\x3a\x7d\x7c\xf4\x54\xaa\xcc" + "\x33\x97\xc3\x77\xba\xc5\x70\xde" + "\xd7\xd5\x13\xa5\x65\xc4\x5f\x0f" + "\x46\x1a\x0d\x97\xb5\xf3\xbb\x3c" + "\x84\x0f\x2b\xc5\xaa\xea\xf2\x6c" + "\xc9\xb5\x0c\xee\x15\xf3\x7d\xbe" + "\x9f\x7b\x5a\xa6\xae\x4f\x83\xb6" + "\x79\x49\x41\xf4\x58\x18\xcb\x86" + "\x7f\x30\x0e\xf8\x7d\x44\x36\xea" + "\x75\xeb\x88\x84\x40\x3c\xad\x4f" + "\x6f\x31\x6b\xaa\x5d\xe5\xa5\xc5" + "\x21\x66\xe9\xa7\xe3\xb2\x15\x88" + "\x78\xf6\x79\xa1\x59\x47\x12\x4e" + "\x9f\x9f\x64\x1a\xa0\x22\x5b\x08" + "\xbe\x7c\x36\xc2\x2b\x66\x33\x1b" + "\xdd\x60\x71\xf7\x47\x8c\x61\xc3" + "\xda\x8a\x78\x1e\x16\xfa\x1e\x86" + "\x81\xa6\x17\x2a\xa7\xb5\xc2\xe7" + "\xa4\xc7\x42\xf1\xcf\x6a\xca\xb4" + "\x45\xcf\xf3\x93\xf0\xe7\xea\xf6" + "\xf4\xe6\x33\x43\x84\x93\xa5\x67" + "\x9b\x16\x58\x58\x80\x0f\x2b\x5c" + "\x24\x74\x75\x7f\x95\x81\xb7\x30" + "\x7a\x33\xa7\xf7\x94\x87\x32\x27" + "\x10\x5d\x14\x4c\x43\x29\xdd\x26" + "\xbd\x3e\x3c\x0e\xfe\x0e\xa5\x10" + "\xea\x6b\x64\xfd\x73\xc6\xed\xec" + "\xa8\xc9\xbf\xb3\xba\x0b\x4d\x07" + "\x70\xfc\x16\xfd\x79\x1e\xd7\xc5" + "\x49\x4e\x1c\x8b\x8d\x79\x1b\xb1" + "\xec\xca\x60\x09\x4c\x6a\xd5\x09" + "\x49\x46\x00\x88\x22\x8d\xce\xea" + "\xb1\x17\x11\xde\x42\xd2\x23\xc1" + "\x72\x11\xf5\x50\x73\x04\x40\x47" + "\xf9\x5d\xe7\xa7\x26\xb1\x7e\xb0" + "\x3f\x58\xc1\x52\xab\x12\x67\x9d" + "\x3f\x43\x4b\x68\xd4\x9c\x68\x38" + "\x07\x8a\x2d\x3e\xf3\xaf\x6a\x4b" + "\xf9\xe5\x31\x69\x22\xf9\xa6\x69" + "\xc6\x9c\x96\x9a\x12\x35\x95\x1d" + "\x95\xd5\xdd\xbe\xbf\x93\x53\x24" + "\xfd\xeb\xc2\x0a\x64\xb0\x77\x00" + "\x6f\x88\xc4\x37\x18\x69\x7c\xd7" + "\x41\x92\x55\x4c\x03\xa1\x9a\x4b" + "\x15\xe5\xdf\x7f\x37\x33\x72\xc1" + "\x8b\x10\x67\xa3\x01\x57\x94\x25" + "\x7b\x38\x71\x7e\xdd\x1e\xcc\x73" + "\x55\xd2\x8e\xeb\x07\xdd\xf1\xda" + "\x58\xb1\x47\x90\xfe\x42\x21\x72" + "\xa3\x54\x7a\xa0\x40\xec\x9f\xdd" + "\xc6\x84\x6e\xca\xae\xe3\x68\xb4" + "\x9d\xe4\x78\xff\x57\xf2\xf8\x1b" + "\x03\xa1\x31\xd9\xde\x8d\xf5\x22" + "\x9c\xdd\x20\xa4\x1e\x27\xb1\x76" + "\x4f\x44\x55\xe2\x9b\xa1\x9c\xfe" + "\x54\xf7\x27\x1b\xf4\xde\x02\xf5" + "\x1b\x55\x48\x5c\xdc\x21\x4b\x9e" + "\x4b\x6e\xed\x46\x23\xdc\x65\xb2" + "\xcf\x79\x5f\x28\xe0\x9e\x8b\xe7" + "\x4c\x9d\x8a\xff\xc1\xa6\x28\xb8" + "\x65\x69\x8a\x45\x29\xef\x74\x85" + "\xde\x79\xc7\x08\xae\x30\xb0\xf4" + "\xa3\x1d\x51\x41\xab\xce\xcb\xf6" + "\xb5\xd8\x6d\xe0\x85\xe1\x98\xb3" + "\x43\xbb\x86\x83\x0a\xa0\xf5\xb7" + "\x04\x0b\xfa\x71\x1f\xb0\xf6\xd9" + "\x13\x00\x15\xf0\xc7\xeb\x0d\x5a" + "\x9f\xd7\xb9\x6c\x65\x14\x22\x45" + "\x6e\x45\x32\x3e\x7e\x60\x1a\x12" + "\x97\x82\x14\xfb\xaa\x04\x22\xfa" + "\xa0\xe5\x7e\x8c\x78\x02\x48\x5d" + "\x78\x33\x5a\x7c\xad\xdb\x29\xce" + "\xbb\x8b\x61\xa4\xb7\x42\xe2\xac" + "\x8b\x1a\xd9\x2f\x0b\x8b\x62\x21" + "\x83\x35\x7e\xad\x73\xc2\xb5\x6c" + "\x10\x26\x38\x07\xe5\xc7\x36\x80" + "\xe2\x23\x12\x61\xf5\x48\x4b\x2b" + "\xc5\xdf\x15\xd9\x87\x01\xaa\xac" + "\x1e\x7c\xad\x73\x78\x18\x63\xe0" + "\x8b\x9f\x81\xd8\x12\x6a\x28\x10" + "\xbe\x04\x68\x8a\x09\x7c\x1b\x1c" + "\x83\x66\x80\x47\x80\xe8\xfd\x35" + "\x1c\x97\x6f\xae\x49\x10\x66\xcc" + "\xc6\xd8\xcc\x3a\x84\x91\x20\x77" + "\x72\xe4\x24\xd2\x37\x9f\xc5\xc9" + "\x25\x94\x10\x5f\x40\x00\x64\x99" + "\xdc\xae\xd7\x21\x09\x78\x50\x15" + "\xac\x5f\xc6\x2c\xa2\x0b\xa9\x39" + "\x87\x6e\x6d\xab\xde\x08\x51\x16" + "\xc7\x13\xe9\xea\xed\x06\x8e\x2c" + "\xf8\x37\x8c\xf0\xa6\x96\x8d\x43" + "\xb6\x98\x37\xb2\x43\xed\xde\xdf" + "\x89\x1a\xe7\xeb\x9d\xa1\x7b\x0b" + "\x77\xb0\xe2\x75\xc0\xf1\x98\xd9" + "\x80\x55\xc9\x34\x91\xd1\x59\xe8" + "\x4b\x0f\xc1\xa9\x4b\x7a\x84\x06" + "\x20\xa8\x5d\xfa\xd1\xde\x70\x56" + "\x2f\x9e\x91\x9c\x20\xb3\x24\xd8" + "\x84\x3d\xe1\x8c\x7e\x62\x52\xe5" + "\x44\x4b\x9f\xc2\x93\x03\xea\x2b" + "\x59\xc5\xfa\x3f\x91\x2b\xbb\x23" + "\xf5\xb2\x7b\xf5\x38\xaf\xb3\xee" + "\x63\xdc\x7b\xd1\xff\xaa\x8b\xab" + "\x82\x6b\x37\x04\xeb\x74\xbe\x79" + "\xb9\x83\x90\xef\x20\x59\x46\xff" + "\xe9\x97\x3e\x2f\xee\xb6\x64\x18" + "\x38\x4c\x7a\x4a\xf9\x61\xe8\x9a" + "\xa1\xb5\x01\xa6\x47\xd3\x11\xd4" + "\xce\xd3\x91\x49\x88\xc7\xb8\x4d" + "\xb1\xb9\x07\x6d\x16\x72\xae\x46" + "\x5e\x03\xa1\x4b\xb6\x02\x30\xa8" + "\x3d\xa9\x07\x2a\x7c\x19\xe7\x62" + "\x87\xe3\x82\x2f\x6f\xe1\x09\xd9" + "\x94\x97\xea\xdd\x58\x9e\xae\x76" + "\x7e\x35\xe5\xb4\xda\x7e\xf4\xde" + "\xf7\x32\x87\xcd\x93\xbf\x11\x56" + "\x11\xbe\x08\x74\xe1\x69\xad\xe2" + "\xd7\xf8\x86\x75\x8a\x3c\xa4\xbe" + "\x70\xa7\x1b\xfc\x0b\x44\x2a\x76" + "\x35\xea\x5d\x85\x81\xaf\x85\xeb" + "\xa0\x1c\x61\xc2\xf7\x4f\xa5\xdc" + "\x02\x7f\xf6\x95\x40\x6e\x8a\x9a" + "\xf3\x5d\x25\x6e\x14\x3a\x22\xc9" + "\x37\x1c\xeb\x46\x54\x3f\xa5\x91" + "\xc2\xb5\x8c\xfe\x53\x08\x97\x32" + "\x1b\xb2\x30\x27\xfe\x25\x5d\xdc" + "\x08\x87\xd0\xe5\x94\x1a\xd4\xf1" + "\xfe\xd6\xb4\xa3\xe6\x74\x81\x3c" + "\x1b\xb7\x31\xa7\x22\xfd\xd4\xdd" + "\x20\x4e\x7c\x51\xb0\x60\x73\xb8" + "\x9c\xac\x91\x90\x7e\x01\xb0\xe1" + "\x8a\x2f\x75\x1c\x53\x2a\x98\x2a" + "\x06\x52\x95\x52\xb2\xe9\x25\x2e" + "\x4c\xe2\x5a\x00\xb2\x13\x81\x03" + "\x77\x66\x0d\xa5\x99\xda\x4e\x8c" + "\xac\xf3\x13\x53\x27\x45\xaf\x64" + "\x46\xdc\xea\x23\xda\x97\xd1\xab" + "\x7d\x6c\x30\x96\x1f\xbc\x06\x34" + "\x18\x0b\x5e\x21\x35\x11\x8d\x4c" + "\xe0\x2d\xe9\x50\x16\x74\x81\xa8" + "\xb4\x34\xb9\x72\x42\xa6\xcc\xbc" + "\xca\x34\x83\x27\x10\x5b\x68\x45" + "\x8f\x52\x22\x0c\x55\x3d\x29\x7c" + "\xe3\xc0\x66\x05\x42\x91\x5f\x58" + "\xfe\x4a\x62\xd9\x8c\xa9\x04\x19" + "\x04\xa9\x08\x4b\x57\xfc\x67\x53" + "\x08\x7c\xbc\x66\x8a\xb0\xb6\x9f" + "\x92\xd6\x41\x7c\x5b\x2a\x00\x79" + "\x72", + .ctext = "\x3a\x92\xee\x53\x31\xaf\x2b\x60" + "\x5f\x55\x8d\x00\x5d\xfc\x74\x97" + "\x28\x54\xf4\xa5\x75\xf1\x9b\x25" + "\x62\x1c\xc0\xe0\x13\xc8\x87\x53" + "\xd0\xf3\xa7\x97\x1f\x3b\x1e\xea" + "\xe0\xe5\x2a\xd1\xdd\xa4\x3b\x50" + "\x45\xa3\x0d\x7e\x1b\xc9\xa0\xad" + "\xb9\x2c\x54\xa6\xc7\x55\x16\xd0" + "\xc5\x2e\x02\x44\x35\xd0\x7e\x67" + "\xf2\xc4\x9b\xcd\x95\x10\xcc\x29" + "\x4b\xfa\x86\x87\xbe\x40\x36\xbe" + "\xe1\xa3\x52\x89\x55\x20\x9b\xc2" + "\xab\xf2\x31\x34\x16\xad\xc8\x17" + "\x65\x24\xc0\xff\x12\x37\xfe\x5a" + "\x62\x3b\x59\x47\x6c\x5f\x3a\x8e" + "\x3b\xd9\x30\xc8\x7f\x2f\x88\xda" + "\x80\xfd\x02\xda\x7f\x9a\x7a\x73" + "\x59\xc5\x34\x09\x9a\x11\xcb\xa7" + "\xfc\xf6\xa1\xa0\x60\xfb\x43\xbb" + "\xf1\xe9\xd7\xc6\x79\x27\x4e\xff" + "\x22\xb4\x24\xbf\x76\xee\x47\xb9" + "\x6d\x3f\x8b\xb0\x9c\x3c\x43\xdd" + "\xff\x25\x2e\x6d\xa4\x2b\xfb\x5d" + "\x1b\x97\x6c\x55\x0a\x82\x7a\x7b" + "\x94\x34\xc2\xdb\x2f\x1f\xc1\xea" + "\xd4\x4d\x17\x46\x3b\x51\x69\x09" + "\xe4\x99\x32\x25\xfd\x94\xaf\xfb" + "\x10\xf7\x4f\xdd\x0b\x3c\x8b\x41" + "\xb3\x6a\xb7\xd1\x33\xa8\x0c\x2f" + "\x62\x4c\x72\x11\xd7\x74\xe1\x3b" + "\x38\x43\x66\x7b\x6c\x36\x48\xe7" + "\xe3\xe7\x9d\xb9\x42\x73\x7a\x2a" + "\x89\x20\x1a\x41\x80\x03\xf7\x8f" + "\x61\x78\x13\xbf\xfe\x50\xf5\x04" + "\x52\xf9\xac\x47\xf8\x62\x4b\xb2" + "\x24\xa9\xbf\x64\xb0\x18\x69\xd2" + "\xf5\xe4\xce\xc8\xb1\x87\x75\xd6" + "\x2c\x24\x79\x00\x7d\x26\xfb\x44" + "\xe7\x45\x7a\xee\x58\xa5\x83\xc1" + "\xb4\x24\xab\x23\x2f\x4d\xd7\x4f" + "\x1c\xc7\xaa\xa9\x50\xf4\xa3\x07" + "\x12\x13\x89\x74\xdc\x31\x6a\xb2" + "\xf5\x0f\x13\x8b\xb9\xdb\x85\x1f" + "\xf5\xbc\x88\xd9\x95\xea\x31\x6c" + "\x36\x60\xb6\x49\xdc\xc4\xf7\x55" + "\x3f\x21\xc1\xb5\x92\x18\x5e\xbc" + "\x9f\x87\x7f\xe7\x79\x25\x40\x33" + "\xd6\xb9\x33\xd5\x50\xb3\xc7\x89" + "\x1b\x12\xa0\x46\xdd\xa7\xd8\x3e" + "\x71\xeb\x6f\x66\xa1\x26\x0c\x67" + "\xab\xb2\x38\x58\x17\xd8\x44\x3b" + "\x16\xf0\x8e\x62\x8d\x16\x10\x00" + "\x32\x8b\xef\xb9\x28\xd3\xc5\xad" + "\x0a\x19\xa2\xe4\x03\x27\x7d\x94" + "\x06\x18\xcd\xd6\x27\x00\xf9\x1f" + "\xb6\xb3\xfe\x96\x35\x5f\xc4\x1c" + "\x07\x62\x10\x79\x68\x50\xf1\x7e" + "\x29\xe7\xc4\xc4\xe7\xee\x54\xd6" + "\x58\x76\x84\x6d\x8d\xe4\x59\x31" + "\xe9\xf4\xdc\xa1\x1f\xe5\x1a\xd6" + "\xe6\x64\x46\xf5\x77\x9c\x60\x7a" + "\x5e\x62\xe3\x0a\xd4\x9f\x7a\x2d" + "\x7a\xa5\x0a\x7b\x29\x86\x7a\x74" + "\x74\x71\x6b\xca\x7d\x1d\xaa\xba" + "\x39\x84\x43\x76\x35\xfe\x4f\x9b" + "\xbb\xbb\xb5\x6a\x32\xb5\x5d\x41" + "\x51\xf0\x5b\x68\x03\x47\x4b\x8a" + "\xca\x88\xf6\x37\xbd\x73\x51\x70" + "\x66\xfe\x9e\x5f\x21\x9c\xf3\xdd" + "\xc3\xea\x27\xf9\x64\x94\xe1\x19" + "\xa0\xa9\xab\x60\xe0\x0e\xf7\x78" + "\x70\x86\xeb\xe0\xd1\x5c\x05\xd3" + "\xd7\xca\xe0\xc0\x47\x47\x34\xee" + "\x11\xa3\xa3\x54\x98\xb7\x49\x8e" + "\x84\x28\x70\x2c\x9e\xfb\x55\x54" + "\x4d\xf8\x86\xf7\x85\x7c\xbd\xf3" + "\x17\xd8\x47\xcb\xac\xf4\x20\x85" + "\x34\x66\xad\x37\x2d\x5e\x52\xda" + "\x8a\xfe\x98\x55\x30\xe7\x2d\x2b" + "\x19\x10\x8e\x7b\x66\x5e\xdc\xe0" + "\x45\x1f\x7b\xb4\x08\xfb\x8f\xf6" + "\x8c\x89\x21\x34\x55\x27\xb2\x76" + "\xb2\x07\xd9\xd6\x68\x9b\xea\x6b" + "\x2d\xb4\xc4\x35\xdd\xd2\x79\xae" + "\xc7\xd6\x26\x7f\x12\x01\x8c\xa7" + "\xe3\xdb\xa8\xf4\xf7\x2b\xec\x99" + "\x11\x00\xf1\x35\x8c\xcf\xd5\xc9" + "\xbd\x91\x36\x39\x70\xcf\x7d\x70" + "\x47\x1a\xfc\x6b\x56\xe0\x3f\x9c" + "\x60\x49\x01\x72\xa9\xaf\x2c\x9c" + "\xe8\xab\xda\x8c\x14\x19\xf3\x75" + "\x07\x17\x9d\x44\x67\x7a\x2e\xef" + "\xb7\x83\x35\x4a\xd1\x3d\x1c\x84" + "\x32\xdd\xaa\xea\xca\x1d\xdc\x72" + "\x2c\xcc\x43\xcd\x5d\xe3\x21\xa4" + "\xd0\x8a\x4b\x20\x12\xa3\xd5\x86" + "\x76\x96\xff\x5f\x04\x57\x0f\xe6" + "\xba\xe8\x76\x50\x0c\x64\x1d\x83" + "\x9c\x9b\x9a\x9a\x58\x97\x9c\x5c" + "\xb4\xa4\xa6\x3e\x19\xeb\x8f\x5a" + "\x61\xb2\x03\x7b\x35\x19\xbe\xa7" + "\x63\x0c\xfd\xdd\xf9\x90\x6c\x08" + "\x19\x11\xd3\x65\x4a\xf5\x96\x92" + "\x59\xaa\x9c\x61\x0c\x29\xa7\xf8" + "\x14\x39\x37\xbf\x3c\xf2\x16\x72" + "\x02\xfa\xa2\xf3\x18\x67\x5d\xcb" + "\xdc\x4d\xbb\x96\xff\x70\x08\x2d" + "\xc2\xa8\x52\xe1\x34\x5f\x72\xfe" + "\x64\xbf\xca\xa7\x74\x38\xfb\x74" + "\x55\x9c\xfa\x8a\xed\xfb\x98\xeb" + "\x58\x2e\x6c\xe1\x52\x76\x86\xd7" + "\xcf\xa1\xa4\xfc\xb2\x47\x41\x28" + "\xa3\xc1\xe5\xfd\x53\x19\x28\x2b" + "\x37\x04\x65\x96\x99\x7a\x28\x0f" + "\x07\x68\x4b\xc7\x52\x0a\x55\x35" + "\x40\x19\x95\x61\xe8\x59\x40\x1f" + "\x9d\xbf\x78\x7d\x8f\x84\xff\x6f" + "\xd0\xd5\x63\xd2\x22\xbd\xc8\x4e" + "\xfb\xe7\x9f\x06\xe6\xe7\x39\x6d" + "\x6a\x96\x9f\xf0\x74\x7e\xc9\x35" + "\xb7\x26\xb8\x1c\x0a\xa6\x27\x2c" + "\xa2\x2b\xfe\xbe\x0f\x07\x73\xae" + "\x7f\x7f\x54\xf5\x7c\x6a\x0a\x56" + "\x49\xd4\x81\xe5\x85\x53\x99\x1f" + "\x95\x05\x13\x58\x8d\x0e\x1b\x90" + "\xc3\x75\x48\x64\x58\x98\x67\x84" + "\xae\xe2\x21\xa2\x8a\x04\x0a\x0b" + "\x61\xaa\xb0\xd4\x28\x60\x7a\xf8" + "\xbc\x52\xfb\x24\x7f\xed\x0d\x2a" + "\x0a\xb2\xf9\xc6\x95\xb5\x11\xc9" + "\xf4\x0f\x26\x11\xcf\x2a\x57\x87" + "\x7a\xf3\xe7\x94\x65\xc2\xb5\xb3" + "\xab\x98\xe3\xc1\x2b\x59\x19\x7c" + "\xd6\xf3\xf9\xbf\xff\x6d\xc6\x82" + "\x13\x2f\x4a\x2e\xcd\x26\xfe\x2d" + "\x01\x70\xf4\xc2\x7f\x1f\x4c\xcb" + "\x47\x77\x0c\xa0\xa3\x03\xec\xda" + "\xa9\xbf\x0d\x2d\xae\xe4\xb8\x7b" + "\xa9\xbc\x08\xb4\x68\x2e\xc5\x60" + "\x8d\x87\x41\x2b\x0f\x69\xf0\xaf" + "\x5f\xba\x72\x20\x0f\x33\xcd\x6d" + "\x36\x7d\x7b\xd5\x05\xf1\x4b\x05" + "\xc4\xfc\x7f\x80\xb9\x4d\xbd\xf7" + "\x7c\x84\x07\x01\xc2\x40\x66\x5b" + "\x98\xc7\x2c\xe3\x97\xfa\xdf\x87" + "\xa0\x1f\xe9\x21\x42\x0f\x3b\xeb" + "\x89\x1c\x3b\xca\x83\x61\x77\x68" + "\x84\xbb\x60\x87\x38\x2e\x25\xd5" + "\x9e\x04\x41\x70\xac\xda\xc0\x9c" + "\x9c\x69\xea\x8d\x4e\x55\x2a\x29" + "\xed\x05\x4b\x7b\x73\x71\x90\x59" + "\x4d\xc8\xd8\x44\xf0\x4c\xe1\x5e" + "\x84\x47\x55\xcc\x32\x3f\xe7\x97" + "\x42\xc6\x32\xac\x40\xe5\xa5\xc7" + "\x8b\xed\xdb\xf7\x83\xd6\xb1\xc2" + "\x52\x5e\x34\xb7\xeb\x6e\xd9\xfc" + "\xe5\x93\x9a\x97\x3e\xb0\xdc\xd9" + "\xd7\x06\x10\xb6\x1d\x80\x59\xdd" + "\x0d\xfe\x64\x35\xcd\x5d\xec\xf0" + "\xba\xd0\x34\xc9\x2d\x91\xc5\x17" + "\x11", + .len = 1281, + .also_non_np = 1, + .np = 3, + .tap = { 1200, 1, 80 }, + }, +}; + /* * CTS (Cipher Text Stealing) mode tests */ diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h index 56073814eef0..c24b4ac03b85 100644 --- a/include/crypto/chacha20.h +++ b/include/crypto/chacha20.h @@ -1,6 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Common values for the ChaCha20 algorithm + * Common values and helper functions for the ChaCha20 and XChaCha20 algorithms. + * + * XChaCha20 extends ChaCha20's nonce to 192 bits, while provably retaining + * ChaCha20's security. Here they share the same key size, tfm context, and + * setkey function; only their IV size and encrypt/decrypt function differ. */ #ifndef _CRYPTO_CHACHA20_H @@ -10,11 +14,16 @@ #include #include +/* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */ #define CHACHA20_IV_SIZE 16 + #define CHACHA20_KEY_SIZE 32 #define CHACHA20_BLOCK_SIZE 64 #define CHACHAPOLY_IV_SIZE 12 +/* 192-bit nonce, then 64-bit stream position */ +#define XCHACHA20_IV_SIZE 32 + struct chacha20_ctx { u32 key[8]; }; @@ -23,8 +32,11 @@ void chacha20_block(u32 *state, u8 *stream); void hchacha20_block(const u32 *in, u32 *out); void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv); + int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize); + int crypto_chacha20_crypt(struct skcipher_request *req); +int crypto_xchacha20_crypt(struct skcipher_request *req); #endif -- cgit v1.2.3-59-g8ed1b From 1ca1b917940c24ca3d1f490118c5474168622953 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Nov 2018 17:26:21 -0800 Subject: crypto: chacha20-generic - refactor to allow varying number of rounds In preparation for adding XChaCha12 support, rename/refactor chacha20-generic to support different numbers of rounds. The justification for needing XChaCha12 support is explained in more detail in the patch "crypto: chacha - add XChaCha12 support". The only difference between ChaCha{8,12,20} are the number of rounds itself; all other parts of the algorithm are the same. Therefore, remove the "20" from all definitions, structures, functions, files, etc. that will be shared by all ChaCha versions. Also make ->setkey() store the round count in the chacha_ctx (previously chacha20_ctx). The generic code then passes the round count through to chacha_block(). There will be a ->setkey() function for each explicitly allowed round count; the encrypt/decrypt functions will be the same. I decided not to do it the opposite way (same ->setkey() function for all round counts, with different encrypt/decrypt functions) because that would have required more boilerplate code in architecture-specific implementations of ChaCha and XChaCha. Reviewed-by: Ard Biesheuvel Acked-by: Martin Willi Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm/crypto/chacha20-neon-glue.c | 40 +++---- arch/arm64/crypto/chacha20-neon-glue.c | 40 +++---- arch/x86/crypto/chacha20_glue.c | 48 ++++---- crypto/Makefile | 2 +- crypto/chacha20_generic.c | 185 ------------------------------- crypto/chacha20poly1305.c | 10 +- crypto/chacha_generic.c | 193 +++++++++++++++++++++++++++++++++ drivers/char/random.c | 51 +++++---- drivers/crypto/caam/caamalg.c | 2 +- drivers/crypto/caam/caamalg_qi2.c | 8 +- drivers/crypto/caam/compat.h | 2 +- include/crypto/chacha.h | 47 ++++++++ include/crypto/chacha20.h | 42 ------- lib/Makefile | 2 +- lib/chacha.c | 117 ++++++++++++++++++++ lib/chacha20.c | 112 ------------------- 16 files changed, 459 insertions(+), 442 deletions(-) delete mode 100644 crypto/chacha20_generic.c create mode 100644 crypto/chacha_generic.c create mode 100644 include/crypto/chacha.h delete mode 100644 include/crypto/chacha20.h create mode 100644 lib/chacha.c delete mode 100644 lib/chacha20.c diff --git a/arch/arm/crypto/chacha20-neon-glue.c b/arch/arm/crypto/chacha20-neon-glue.c index 59a7be08e80c..7386eb1c1889 100644 --- a/arch/arm/crypto/chacha20-neon-glue.c +++ b/arch/arm/crypto/chacha20-neon-glue.c @@ -19,7 +19,7 @@ */ #include -#include +#include #include #include #include @@ -34,20 +34,20 @@ asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src, unsigned int bytes) { - u8 buf[CHACHA20_BLOCK_SIZE]; + u8 buf[CHACHA_BLOCK_SIZE]; - while (bytes >= CHACHA20_BLOCK_SIZE * 4) { + while (bytes >= CHACHA_BLOCK_SIZE * 4) { chacha20_4block_xor_neon(state, dst, src); - bytes -= CHACHA20_BLOCK_SIZE * 4; - src += CHACHA20_BLOCK_SIZE * 4; - dst += CHACHA20_BLOCK_SIZE * 4; + bytes -= CHACHA_BLOCK_SIZE * 4; + src += CHACHA_BLOCK_SIZE * 4; + dst += CHACHA_BLOCK_SIZE * 4; state[12] += 4; } - while (bytes >= CHACHA20_BLOCK_SIZE) { + while (bytes >= CHACHA_BLOCK_SIZE) { chacha20_block_xor_neon(state, dst, src); - bytes -= CHACHA20_BLOCK_SIZE; - src += CHACHA20_BLOCK_SIZE; - dst += CHACHA20_BLOCK_SIZE; + bytes -= CHACHA_BLOCK_SIZE; + src += CHACHA_BLOCK_SIZE; + dst += CHACHA_BLOCK_SIZE; state[12]++; } if (bytes) { @@ -60,17 +60,17 @@ static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src, static int chacha20_neon(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; u32 state[16]; int err; - if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd()) - return crypto_chacha20_crypt(req); + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + return crypto_chacha_crypt(req); err = skcipher_walk_virt(&walk, req, true); - crypto_chacha20_init(state, ctx, walk.iv); + crypto_chacha_init(state, ctx, walk.iv); kernel_neon_begin(); while (walk.nbytes > 0) { @@ -93,14 +93,14 @@ static struct skcipher_alg alg = { .base.cra_driver_name = "chacha20-neon", .base.cra_priority = 300, .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha20_ctx), + .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, - .min_keysize = CHACHA20_KEY_SIZE, - .max_keysize = CHACHA20_KEY_SIZE, - .ivsize = CHACHA20_IV_SIZE, - .chunksize = CHACHA20_BLOCK_SIZE, - .walksize = 4 * CHACHA20_BLOCK_SIZE, + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, .setkey = crypto_chacha20_setkey, .encrypt = chacha20_neon, .decrypt = chacha20_neon, diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c index 727579c93ded..96e0cfb8c3f5 100644 --- a/arch/arm64/crypto/chacha20-neon-glue.c +++ b/arch/arm64/crypto/chacha20-neon-glue.c @@ -19,7 +19,7 @@ */ #include -#include +#include #include #include #include @@ -34,15 +34,15 @@ asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src, unsigned int bytes) { - u8 buf[CHACHA20_BLOCK_SIZE]; + u8 buf[CHACHA_BLOCK_SIZE]; - while (bytes >= CHACHA20_BLOCK_SIZE * 4) { + while (bytes >= CHACHA_BLOCK_SIZE * 4) { kernel_neon_begin(); chacha20_4block_xor_neon(state, dst, src); kernel_neon_end(); - bytes -= CHACHA20_BLOCK_SIZE * 4; - src += CHACHA20_BLOCK_SIZE * 4; - dst += CHACHA20_BLOCK_SIZE * 4; + bytes -= CHACHA_BLOCK_SIZE * 4; + src += CHACHA_BLOCK_SIZE * 4; + dst += CHACHA_BLOCK_SIZE * 4; state[12] += 4; } @@ -50,11 +50,11 @@ static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src, return; kernel_neon_begin(); - while (bytes >= CHACHA20_BLOCK_SIZE) { + while (bytes >= CHACHA_BLOCK_SIZE) { chacha20_block_xor_neon(state, dst, src); - bytes -= CHACHA20_BLOCK_SIZE; - src += CHACHA20_BLOCK_SIZE; - dst += CHACHA20_BLOCK_SIZE; + bytes -= CHACHA_BLOCK_SIZE; + src += CHACHA_BLOCK_SIZE; + dst += CHACHA_BLOCK_SIZE; state[12]++; } if (bytes) { @@ -68,17 +68,17 @@ static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src, static int chacha20_neon(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; u32 state[16]; int err; - if (!may_use_simd() || req->cryptlen <= CHACHA20_BLOCK_SIZE) - return crypto_chacha20_crypt(req); + if (!may_use_simd() || req->cryptlen <= CHACHA_BLOCK_SIZE) + return crypto_chacha_crypt(req); err = skcipher_walk_virt(&walk, req, false); - crypto_chacha20_init(state, ctx, walk.iv); + crypto_chacha_init(state, ctx, walk.iv); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; @@ -99,14 +99,14 @@ static struct skcipher_alg alg = { .base.cra_driver_name = "chacha20-neon", .base.cra_priority = 300, .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha20_ctx), + .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, - .min_keysize = CHACHA20_KEY_SIZE, - .max_keysize = CHACHA20_KEY_SIZE, - .ivsize = CHACHA20_IV_SIZE, - .chunksize = CHACHA20_BLOCK_SIZE, - .walksize = 4 * CHACHA20_BLOCK_SIZE, + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, .setkey = crypto_chacha20_setkey, .encrypt = chacha20_neon, .decrypt = chacha20_neon, diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 9fd84fe6ec09..1e9e66509226 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -10,7 +10,7 @@ */ #include -#include +#include #include #include #include @@ -35,8 +35,8 @@ static bool chacha20_use_avx2; static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks) { - len = min(len, maxblocks * CHACHA20_BLOCK_SIZE); - return round_up(len, CHACHA20_BLOCK_SIZE) / CHACHA20_BLOCK_SIZE; + len = min(len, maxblocks * CHACHA_BLOCK_SIZE); + return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; } static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, @@ -44,38 +44,38 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, { #ifdef CONFIG_AS_AVX2 if (chacha20_use_avx2) { - while (bytes >= CHACHA20_BLOCK_SIZE * 8) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { chacha20_8block_xor_avx2(state, dst, src, bytes); - bytes -= CHACHA20_BLOCK_SIZE * 8; - src += CHACHA20_BLOCK_SIZE * 8; - dst += CHACHA20_BLOCK_SIZE * 8; + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; state[12] += 8; } - if (bytes > CHACHA20_BLOCK_SIZE * 4) { + if (bytes > CHACHA_BLOCK_SIZE * 4) { chacha20_8block_xor_avx2(state, dst, src, bytes); state[12] += chacha20_advance(bytes, 8); return; } - if (bytes > CHACHA20_BLOCK_SIZE * 2) { + if (bytes > CHACHA_BLOCK_SIZE * 2) { chacha20_4block_xor_avx2(state, dst, src, bytes); state[12] += chacha20_advance(bytes, 4); return; } - if (bytes > CHACHA20_BLOCK_SIZE) { + if (bytes > CHACHA_BLOCK_SIZE) { chacha20_2block_xor_avx2(state, dst, src, bytes); state[12] += chacha20_advance(bytes, 2); return; } } #endif - while (bytes >= CHACHA20_BLOCK_SIZE * 4) { + while (bytes >= CHACHA_BLOCK_SIZE * 4) { chacha20_4block_xor_ssse3(state, dst, src, bytes); - bytes -= CHACHA20_BLOCK_SIZE * 4; - src += CHACHA20_BLOCK_SIZE * 4; - dst += CHACHA20_BLOCK_SIZE * 4; + bytes -= CHACHA_BLOCK_SIZE * 4; + src += CHACHA_BLOCK_SIZE * 4; + dst += CHACHA_BLOCK_SIZE * 4; state[12] += 4; } - if (bytes > CHACHA20_BLOCK_SIZE) { + if (bytes > CHACHA_BLOCK_SIZE) { chacha20_4block_xor_ssse3(state, dst, src, bytes); state[12] += chacha20_advance(bytes, 4); return; @@ -89,7 +89,7 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, static int chacha20_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); u32 *state, state_buf[16 + 2] __aligned(8); struct skcipher_walk walk; int err; @@ -97,12 +97,12 @@ static int chacha20_simd(struct skcipher_request *req) BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16); state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN); - if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd()) - return crypto_chacha20_crypt(req); + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + return crypto_chacha_crypt(req); err = skcipher_walk_virt(&walk, req, true); - crypto_chacha20_init(state, ctx, walk.iv); + crypto_chacha_init(state, ctx, walk.iv); kernel_fpu_begin(); @@ -128,13 +128,13 @@ static struct skcipher_alg alg = { .base.cra_driver_name = "chacha20-simd", .base.cra_priority = 300, .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha20_ctx), + .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, - .min_keysize = CHACHA20_KEY_SIZE, - .max_keysize = CHACHA20_KEY_SIZE, - .ivsize = CHACHA20_IV_SIZE, - .chunksize = CHACHA20_BLOCK_SIZE, + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, .setkey = crypto_chacha20_setkey, .encrypt = chacha20_simd, .decrypt = chacha20_simd, diff --git a/crypto/Makefile b/crypto/Makefile index abbd86fdbad2..102e8525814f 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -117,7 +117,7 @@ obj-$(CONFIG_CRYPTO_KHAZAD) += khazad.o obj-$(CONFIG_CRYPTO_ANUBIS) += anubis.o obj-$(CONFIG_CRYPTO_SEED) += seed.o obj-$(CONFIG_CRYPTO_SALSA20) += salsa20_generic.o -obj-$(CONFIG_CRYPTO_CHACHA20) += chacha20_generic.o +obj-$(CONFIG_CRYPTO_CHACHA20) += chacha_generic.o obj-$(CONFIG_CRYPTO_POLY1305) += poly1305_generic.o obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c deleted file mode 100644 index 4305b1b62b16..000000000000 --- a/crypto/chacha20_generic.c +++ /dev/null @@ -1,185 +0,0 @@ -/* - * ChaCha20 (RFC7539) and XChaCha20 stream cipher algorithms - * - * Copyright (C) 2015 Martin Willi - * Copyright (C) 2018 Google LLC - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include -#include -#include -#include -#include - -static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes) -{ - /* aligned to potentially speed up crypto_xor() */ - u8 stream[CHACHA20_BLOCK_SIZE] __aligned(sizeof(long)); - - if (dst != src) - memcpy(dst, src, bytes); - - while (bytes >= CHACHA20_BLOCK_SIZE) { - chacha20_block(state, stream); - crypto_xor(dst, stream, CHACHA20_BLOCK_SIZE); - bytes -= CHACHA20_BLOCK_SIZE; - dst += CHACHA20_BLOCK_SIZE; - } - if (bytes) { - chacha20_block(state, stream); - crypto_xor(dst, stream, bytes); - } -} - -static int chacha20_stream_xor(struct skcipher_request *req, - struct chacha20_ctx *ctx, u8 *iv) -{ - struct skcipher_walk walk; - u32 state[16]; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - crypto_chacha20_init(state, ctx, iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - chacha20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr, - nbytes); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - - return err; -} - -void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv) -{ - state[0] = 0x61707865; /* "expa" */ - state[1] = 0x3320646e; /* "nd 3" */ - state[2] = 0x79622d32; /* "2-by" */ - state[3] = 0x6b206574; /* "te k" */ - state[4] = ctx->key[0]; - state[5] = ctx->key[1]; - state[6] = ctx->key[2]; - state[7] = ctx->key[3]; - state[8] = ctx->key[4]; - state[9] = ctx->key[5]; - state[10] = ctx->key[6]; - state[11] = ctx->key[7]; - state[12] = get_unaligned_le32(iv + 0); - state[13] = get_unaligned_le32(iv + 4); - state[14] = get_unaligned_le32(iv + 8); - state[15] = get_unaligned_le32(iv + 12); -} -EXPORT_SYMBOL_GPL(crypto_chacha20_init); - -int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keysize) -{ - struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm); - int i; - - if (keysize != CHACHA20_KEY_SIZE) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(ctx->key); i++) - ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32)); - - return 0; -} -EXPORT_SYMBOL_GPL(crypto_chacha20_setkey); - -int crypto_chacha20_crypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm); - - return chacha20_stream_xor(req, ctx, req->iv); -} -EXPORT_SYMBOL_GPL(crypto_chacha20_crypt); - -int crypto_xchacha20_crypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm); - struct chacha20_ctx subctx; - u32 state[16]; - u8 real_iv[16]; - - /* Compute the subkey given the original key and first 128 nonce bits */ - crypto_chacha20_init(state, ctx, req->iv); - hchacha20_block(state, subctx.key); - - /* Build the real IV */ - memcpy(&real_iv[0], req->iv + 24, 8); /* stream position */ - memcpy(&real_iv[8], req->iv + 16, 8); /* remaining 64 nonce bits */ - - /* Generate the stream and XOR it with the data */ - return chacha20_stream_xor(req, &subctx, real_iv); -} -EXPORT_SYMBOL_GPL(crypto_xchacha20_crypt); - -static struct skcipher_alg algs[] = { - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-generic", - .base.cra_priority = 100, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha20_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA20_KEY_SIZE, - .max_keysize = CHACHA20_KEY_SIZE, - .ivsize = CHACHA20_IV_SIZE, - .chunksize = CHACHA20_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = crypto_chacha20_crypt, - .decrypt = crypto_chacha20_crypt, - }, { - .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-generic", - .base.cra_priority = 100, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha20_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA20_KEY_SIZE, - .max_keysize = CHACHA20_KEY_SIZE, - .ivsize = XCHACHA20_IV_SIZE, - .chunksize = CHACHA20_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = crypto_xchacha20_crypt, - .decrypt = crypto_xchacha20_crypt, - } -}; - -static int __init chacha20_generic_mod_init(void) -{ - return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); -} - -static void __exit chacha20_generic_mod_fini(void) -{ - crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); -} - -module_init(chacha20_generic_mod_init); -module_exit(chacha20_generic_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi "); -MODULE_DESCRIPTION("ChaCha20 and XChaCha20 stream ciphers (generic)"); -MODULE_ALIAS_CRYPTO("chacha20"); -MODULE_ALIAS_CRYPTO("chacha20-generic"); -MODULE_ALIAS_CRYPTO("xchacha20"); -MODULE_ALIAS_CRYPTO("xchacha20-generic"); diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c index f9dd5453046a..fef11446ab1b 100644 --- a/crypto/chacha20poly1305.c +++ b/crypto/chacha20poly1305.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -49,7 +49,7 @@ struct poly_req { }; struct chacha_req { - u8 iv[CHACHA20_IV_SIZE]; + u8 iv[CHACHA_IV_SIZE]; struct scatterlist src[1]; struct skcipher_request req; /* must be last member */ }; @@ -89,7 +89,7 @@ static void chacha_iv(u8 *iv, struct aead_request *req, u32 icb) memcpy(iv, &leicb, sizeof(leicb)); memcpy(iv + sizeof(leicb), ctx->salt, ctx->saltlen); memcpy(iv + sizeof(leicb) + ctx->saltlen, req->iv, - CHACHA20_IV_SIZE - sizeof(leicb) - ctx->saltlen); + CHACHA_IV_SIZE - sizeof(leicb) - ctx->saltlen); } static int poly_verify_tag(struct aead_request *req) @@ -492,7 +492,7 @@ static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key, struct chachapoly_ctx *ctx = crypto_aead_ctx(aead); int err; - if (keylen != ctx->saltlen + CHACHA20_KEY_SIZE) + if (keylen != ctx->saltlen + CHACHA_KEY_SIZE) return -EINVAL; keylen -= ctx->saltlen; @@ -637,7 +637,7 @@ static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb, err = -EINVAL; /* Need 16-byte IV size, including Initial Block Counter value */ - if (crypto_skcipher_alg_ivsize(chacha) != CHACHA20_IV_SIZE) + if (crypto_skcipher_alg_ivsize(chacha) != CHACHA_IV_SIZE) goto out_drop_chacha; /* Not a stream cipher? */ if (chacha->base.cra_blocksize != 1) diff --git a/crypto/chacha_generic.c b/crypto/chacha_generic.c new file mode 100644 index 000000000000..438f15a14054 --- /dev/null +++ b/crypto/chacha_generic.c @@ -0,0 +1,193 @@ +/* + * ChaCha20 (RFC7539) and XChaCha20 stream cipher algorithms + * + * Copyright (C) 2015 Martin Willi + * Copyright (C) 2018 Google LLC + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +static void chacha_docrypt(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + /* aligned to potentially speed up crypto_xor() */ + u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long)); + + if (dst != src) + memcpy(dst, src, bytes); + + while (bytes >= CHACHA_BLOCK_SIZE) { + chacha_block(state, stream, nrounds); + crypto_xor(dst, stream, CHACHA_BLOCK_SIZE); + bytes -= CHACHA_BLOCK_SIZE; + dst += CHACHA_BLOCK_SIZE; + } + if (bytes) { + chacha_block(state, stream, nrounds); + crypto_xor(dst, stream, bytes); + } +} + +static int chacha_stream_xor(struct skcipher_request *req, + struct chacha_ctx *ctx, u8 *iv) +{ + struct skcipher_walk walk; + u32 state[16]; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + crypto_chacha_init(state, ctx, iv); + + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); + + chacha_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr, + nbytes, ctx->nrounds); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + return err; +} + +void crypto_chacha_init(u32 *state, struct chacha_ctx *ctx, u8 *iv) +{ + state[0] = 0x61707865; /* "expa" */ + state[1] = 0x3320646e; /* "nd 3" */ + state[2] = 0x79622d32; /* "2-by" */ + state[3] = 0x6b206574; /* "te k" */ + state[4] = ctx->key[0]; + state[5] = ctx->key[1]; + state[6] = ctx->key[2]; + state[7] = ctx->key[3]; + state[8] = ctx->key[4]; + state[9] = ctx->key[5]; + state[10] = ctx->key[6]; + state[11] = ctx->key[7]; + state[12] = get_unaligned_le32(iv + 0); + state[13] = get_unaligned_le32(iv + 4); + state[14] = get_unaligned_le32(iv + 8); + state[15] = get_unaligned_le32(iv + 12); +} +EXPORT_SYMBOL_GPL(crypto_chacha_init); + +static int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keysize, int nrounds) +{ + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + int i; + + if (keysize != CHACHA_KEY_SIZE) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(ctx->key); i++) + ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32)); + + ctx->nrounds = nrounds; + return 0; +} + +int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keysize) +{ + return chacha_setkey(tfm, key, keysize, 20); +} +EXPORT_SYMBOL_GPL(crypto_chacha20_setkey); + +int crypto_chacha_crypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + + return chacha_stream_xor(req, ctx, req->iv); +} +EXPORT_SYMBOL_GPL(crypto_chacha_crypt); + +int crypto_xchacha_crypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 state[16]; + u8 real_iv[16]; + + /* Compute the subkey given the original key and first 128 nonce bits */ + crypto_chacha_init(state, ctx, req->iv); + hchacha_block(state, subctx.key, ctx->nrounds); + subctx.nrounds = ctx->nrounds; + + /* Build the real IV */ + memcpy(&real_iv[0], req->iv + 24, 8); /* stream position */ + memcpy(&real_iv[8], req->iv + 16, 8); /* remaining 64 nonce bits */ + + /* Generate the stream and XOR it with the data */ + return chacha_stream_xor(req, &subctx, real_iv); +} +EXPORT_SYMBOL_GPL(crypto_xchacha_crypt); + +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-generic", + .base.cra_priority = 100, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = crypto_chacha_crypt, + .decrypt = crypto_chacha_crypt, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-generic", + .base.cra_priority = 100, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = crypto_xchacha_crypt, + .decrypt = crypto_xchacha_crypt, + } +}; + +static int __init chacha_generic_mod_init(void) +{ + return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); +} + +static void __exit chacha_generic_mod_fini(void) +{ + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); +} + +module_init(chacha_generic_mod_init); +module_exit(chacha_generic_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Willi "); +MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (generic)"); +MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-generic"); +MODULE_ALIAS_CRYPTO("xchacha20"); +MODULE_ALIAS_CRYPTO("xchacha20-generic"); diff --git a/drivers/char/random.c b/drivers/char/random.c index 2eb70e76ed35..38c6d1af6d1c 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -265,7 +265,7 @@ #include #include #include -#include +#include #include #include @@ -431,11 +431,10 @@ static int crng_init = 0; #define crng_ready() (likely(crng_init > 1)) static int crng_init_cnt = 0; static unsigned long crng_global_init_time = 0; -#define CRNG_INIT_CNT_THRESH (2*CHACHA20_KEY_SIZE) -static void _extract_crng(struct crng_state *crng, - __u8 out[CHACHA20_BLOCK_SIZE]); +#define CRNG_INIT_CNT_THRESH (2*CHACHA_KEY_SIZE) +static void _extract_crng(struct crng_state *crng, __u8 out[CHACHA_BLOCK_SIZE]); static void _crng_backtrack_protect(struct crng_state *crng, - __u8 tmp[CHACHA20_BLOCK_SIZE], int used); + __u8 tmp[CHACHA_BLOCK_SIZE], int used); static void process_random_ready_list(void); static void _get_random_bytes(void *buf, int nbytes); @@ -863,7 +862,7 @@ static int crng_fast_load(const char *cp, size_t len) } p = (unsigned char *) &primary_crng.state[4]; while (len > 0 && crng_init_cnt < CRNG_INIT_CNT_THRESH) { - p[crng_init_cnt % CHACHA20_KEY_SIZE] ^= *cp; + p[crng_init_cnt % CHACHA_KEY_SIZE] ^= *cp; cp++; crng_init_cnt++; len--; } spin_unlock_irqrestore(&primary_crng.lock, flags); @@ -895,7 +894,7 @@ static int crng_slow_load(const char *cp, size_t len) unsigned long flags; static unsigned char lfsr = 1; unsigned char tmp; - unsigned i, max = CHACHA20_KEY_SIZE; + unsigned i, max = CHACHA_KEY_SIZE; const char * src_buf = cp; char * dest_buf = (char *) &primary_crng.state[4]; @@ -913,8 +912,8 @@ static int crng_slow_load(const char *cp, size_t len) lfsr >>= 1; if (tmp & 1) lfsr ^= 0xE1; - tmp = dest_buf[i % CHACHA20_KEY_SIZE]; - dest_buf[i % CHACHA20_KEY_SIZE] ^= src_buf[i % len] ^ lfsr; + tmp = dest_buf[i % CHACHA_KEY_SIZE]; + dest_buf[i % CHACHA_KEY_SIZE] ^= src_buf[i % len] ^ lfsr; lfsr += (tmp << 3) | (tmp >> 5); } spin_unlock_irqrestore(&primary_crng.lock, flags); @@ -926,7 +925,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r) unsigned long flags; int i, num; union { - __u8 block[CHACHA20_BLOCK_SIZE]; + __u8 block[CHACHA_BLOCK_SIZE]; __u32 key[8]; } buf; @@ -937,7 +936,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r) } else { _extract_crng(&primary_crng, buf.block); _crng_backtrack_protect(&primary_crng, buf.block, - CHACHA20_KEY_SIZE); + CHACHA_KEY_SIZE); } spin_lock_irqsave(&crng->lock, flags); for (i = 0; i < 8; i++) { @@ -973,7 +972,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r) } static void _extract_crng(struct crng_state *crng, - __u8 out[CHACHA20_BLOCK_SIZE]) + __u8 out[CHACHA_BLOCK_SIZE]) { unsigned long v, flags; @@ -990,7 +989,7 @@ static void _extract_crng(struct crng_state *crng, spin_unlock_irqrestore(&crng->lock, flags); } -static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]) +static void extract_crng(__u8 out[CHACHA_BLOCK_SIZE]) { struct crng_state *crng = NULL; @@ -1008,14 +1007,14 @@ static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]) * enough) to mutate the CRNG key to provide backtracking protection. */ static void _crng_backtrack_protect(struct crng_state *crng, - __u8 tmp[CHACHA20_BLOCK_SIZE], int used) + __u8 tmp[CHACHA_BLOCK_SIZE], int used) { unsigned long flags; __u32 *s, *d; int i; used = round_up(used, sizeof(__u32)); - if (used + CHACHA20_KEY_SIZE > CHACHA20_BLOCK_SIZE) { + if (used + CHACHA_KEY_SIZE > CHACHA_BLOCK_SIZE) { extract_crng(tmp); used = 0; } @@ -1027,7 +1026,7 @@ static void _crng_backtrack_protect(struct crng_state *crng, spin_unlock_irqrestore(&crng->lock, flags); } -static void crng_backtrack_protect(__u8 tmp[CHACHA20_BLOCK_SIZE], int used) +static void crng_backtrack_protect(__u8 tmp[CHACHA_BLOCK_SIZE], int used) { struct crng_state *crng = NULL; @@ -1042,8 +1041,8 @@ static void crng_backtrack_protect(__u8 tmp[CHACHA20_BLOCK_SIZE], int used) static ssize_t extract_crng_user(void __user *buf, size_t nbytes) { - ssize_t ret = 0, i = CHACHA20_BLOCK_SIZE; - __u8 tmp[CHACHA20_BLOCK_SIZE] __aligned(4); + ssize_t ret = 0, i = CHACHA_BLOCK_SIZE; + __u8 tmp[CHACHA_BLOCK_SIZE] __aligned(4); int large_request = (nbytes > 256); while (nbytes) { @@ -1057,7 +1056,7 @@ static ssize_t extract_crng_user(void __user *buf, size_t nbytes) } extract_crng(tmp); - i = min_t(int, nbytes, CHACHA20_BLOCK_SIZE); + i = min_t(int, nbytes, CHACHA_BLOCK_SIZE); if (copy_to_user(buf, tmp, i)) { ret = -EFAULT; break; @@ -1622,14 +1621,14 @@ static void _warn_unseeded_randomness(const char *func_name, void *caller, */ static void _get_random_bytes(void *buf, int nbytes) { - __u8 tmp[CHACHA20_BLOCK_SIZE] __aligned(4); + __u8 tmp[CHACHA_BLOCK_SIZE] __aligned(4); trace_get_random_bytes(nbytes, _RET_IP_); - while (nbytes >= CHACHA20_BLOCK_SIZE) { + while (nbytes >= CHACHA_BLOCK_SIZE) { extract_crng(buf); - buf += CHACHA20_BLOCK_SIZE; - nbytes -= CHACHA20_BLOCK_SIZE; + buf += CHACHA_BLOCK_SIZE; + nbytes -= CHACHA_BLOCK_SIZE; } if (nbytes > 0) { @@ -1637,7 +1636,7 @@ static void _get_random_bytes(void *buf, int nbytes) memcpy(buf, tmp, nbytes); crng_backtrack_protect(tmp, nbytes); } else - crng_backtrack_protect(tmp, CHACHA20_BLOCK_SIZE); + crng_backtrack_protect(tmp, CHACHA_BLOCK_SIZE); memzero_explicit(tmp, sizeof(tmp)); } @@ -2208,8 +2207,8 @@ struct ctl_table random_table[] = { struct batched_entropy { union { - u64 entropy_u64[CHACHA20_BLOCK_SIZE / sizeof(u64)]; - u32 entropy_u32[CHACHA20_BLOCK_SIZE / sizeof(u32)]; + u64 entropy_u64[CHACHA_BLOCK_SIZE / sizeof(u64)]; + u32 entropy_u32[CHACHA_BLOCK_SIZE / sizeof(u32)]; }; unsigned int position; }; diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c index 523565ce0060..92e593e2069a 100644 --- a/drivers/crypto/caam/caamalg.c +++ b/drivers/crypto/caam/caamalg.c @@ -559,7 +559,7 @@ static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key, unsigned int ivsize = crypto_aead_ivsize(aead); unsigned int saltlen = CHACHAPOLY_IV_SIZE - ivsize; - if (keylen != CHACHA20_KEY_SIZE + saltlen) { + if (keylen != CHACHA_KEY_SIZE + saltlen) { crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } diff --git a/drivers/crypto/caam/caamalg_qi2.c b/drivers/crypto/caam/caamalg_qi2.c index 2598640aa98b..425d5d974613 100644 --- a/drivers/crypto/caam/caamalg_qi2.c +++ b/drivers/crypto/caam/caamalg_qi2.c @@ -591,7 +591,7 @@ static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key, unsigned int ivsize = crypto_aead_ivsize(aead); unsigned int saltlen = CHACHAPOLY_IV_SIZE - ivsize; - if (keylen != CHACHA20_KEY_SIZE + saltlen) { + if (keylen != CHACHA_KEY_SIZE + saltlen) { crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } @@ -1577,9 +1577,9 @@ static struct caam_skcipher_alg driver_algs[] = { .setkey = skcipher_setkey, .encrypt = skcipher_encrypt, .decrypt = skcipher_decrypt, - .min_keysize = CHACHA20_KEY_SIZE, - .max_keysize = CHACHA20_KEY_SIZE, - .ivsize = CHACHA20_IV_SIZE, + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, }, .caam.class1_alg_type = OP_ALG_ALGSEL_CHACHA20, }, diff --git a/drivers/crypto/caam/compat.h b/drivers/crypto/caam/compat.h index 8bde903f9f4a..87d9efe4c7aa 100644 --- a/drivers/crypto/caam/compat.h +++ b/drivers/crypto/caam/compat.h @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h new file mode 100644 index 000000000000..b722a23e54bb --- /dev/null +++ b/include/crypto/chacha.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common values and helper functions for the ChaCha and XChaCha stream ciphers. + * + * XChaCha extends ChaCha's nonce to 192 bits, while provably retaining ChaCha's + * security. Here they share the same key size, tfm context, and setkey + * function; only their IV size and encrypt/decrypt function differ. + */ + +#ifndef _CRYPTO_CHACHA_H +#define _CRYPTO_CHACHA_H + +#include +#include +#include + +/* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */ +#define CHACHA_IV_SIZE 16 + +#define CHACHA_KEY_SIZE 32 +#define CHACHA_BLOCK_SIZE 64 +#define CHACHAPOLY_IV_SIZE 12 + +/* 192-bit nonce, then 64-bit stream position */ +#define XCHACHA_IV_SIZE 32 + +struct chacha_ctx { + u32 key[8]; + int nrounds; +}; + +void chacha_block(u32 *state, u8 *stream, int nrounds); +static inline void chacha20_block(u32 *state, u8 *stream) +{ + chacha_block(state, stream, 20); +} +void hchacha_block(const u32 *in, u32 *out, int nrounds); + +void crypto_chacha_init(u32 *state, struct chacha_ctx *ctx, u8 *iv); + +int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keysize); + +int crypto_chacha_crypt(struct skcipher_request *req); +int crypto_xchacha_crypt(struct skcipher_request *req); + +#endif /* _CRYPTO_CHACHA_H */ diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h deleted file mode 100644 index c24b4ac03b85..000000000000 --- a/include/crypto/chacha20.h +++ /dev/null @@ -1,42 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Common values and helper functions for the ChaCha20 and XChaCha20 algorithms. - * - * XChaCha20 extends ChaCha20's nonce to 192 bits, while provably retaining - * ChaCha20's security. Here they share the same key size, tfm context, and - * setkey function; only their IV size and encrypt/decrypt function differ. - */ - -#ifndef _CRYPTO_CHACHA20_H -#define _CRYPTO_CHACHA20_H - -#include -#include -#include - -/* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */ -#define CHACHA20_IV_SIZE 16 - -#define CHACHA20_KEY_SIZE 32 -#define CHACHA20_BLOCK_SIZE 64 -#define CHACHAPOLY_IV_SIZE 12 - -/* 192-bit nonce, then 64-bit stream position */ -#define XCHACHA20_IV_SIZE 32 - -struct chacha20_ctx { - u32 key[8]; -}; - -void chacha20_block(u32 *state, u8 *stream); -void hchacha20_block(const u32 *in, u32 *out); - -void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv); - -int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keysize); - -int crypto_chacha20_crypt(struct skcipher_request *req); -int crypto_xchacha20_crypt(struct skcipher_request *req); - -#endif diff --git a/lib/Makefile b/lib/Makefile index db06d1237898..4c2b6fc5cde9 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -20,7 +20,7 @@ KCOV_INSTRUMENT_dynamic_debug.o := n lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o timerqueue.o xarray.o \ idr.o int_sqrt.o extable.o \ - sha1.o chacha20.o irq_regs.o argv_split.o \ + sha1.o chacha.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ diff --git a/lib/chacha.c b/lib/chacha.c new file mode 100644 index 000000000000..1bdc688c18df --- /dev/null +++ b/lib/chacha.c @@ -0,0 +1,117 @@ +/* + * The "hash function" used as the core of the ChaCha stream cipher (RFC7539) + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +static void chacha_permute(u32 *x, int nrounds) +{ + int i; + + /* whitelist the allowed round counts */ + WARN_ON_ONCE(nrounds != 20); + + for (i = 0; i < nrounds; i += 2) { + x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16); + x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16); + x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16); + x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16); + + x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12); + x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12); + x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12); + x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12); + + x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8); + x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8); + x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8); + x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8); + + x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7); + x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7); + x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7); + x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7); + + x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16); + x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16); + x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16); + x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16); + + x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12); + x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12); + x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12); + x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12); + + x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8); + x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8); + x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8); + x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8); + + x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7); + x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7); + x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7); + x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7); + } +} + +/** + * chacha_block - generate one keystream block and increment block counter + * @state: input state matrix (16 32-bit words) + * @stream: output keystream block (64 bytes) + * @nrounds: number of rounds (currently must be 20) + * + * This is the ChaCha core, a function from 64-byte strings to 64-byte strings. + * The caller has already converted the endianness of the input. This function + * also handles incrementing the block counter in the input matrix. + */ +void chacha_block(u32 *state, u8 *stream, int nrounds) +{ + u32 x[16]; + int i; + + memcpy(x, state, 64); + + chacha_permute(x, nrounds); + + for (i = 0; i < ARRAY_SIZE(x); i++) + put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]); + + state[12]++; +} +EXPORT_SYMBOL(chacha_block); + +/** + * hchacha_block - abbreviated ChaCha core, for XChaCha + * @in: input state matrix (16 32-bit words) + * @out: output (8 32-bit words) + * @nrounds: number of rounds (currently must be 20) + * + * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step + * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha + * skips the final addition of the initial state, and outputs only certain words + * of the state. It should not be used for streaming directly. + */ +void hchacha_block(const u32 *in, u32 *out, int nrounds) +{ + u32 x[16]; + + memcpy(x, in, 64); + + chacha_permute(x, nrounds); + + memcpy(&out[0], &x[0], 16); + memcpy(&out[4], &x[12], 16); +} +EXPORT_SYMBOL(hchacha_block); diff --git a/lib/chacha20.c b/lib/chacha20.c deleted file mode 100644 index 6a484e16171d..000000000000 --- a/lib/chacha20.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * The "hash function" used as the core of the ChaCha20 stream cipher (RFC7539) - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include - -static void chacha20_permute(u32 *x) -{ - int i; - - for (i = 0; i < 20; i += 2) { - x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16); - x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16); - x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16); - x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16); - - x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12); - x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12); - x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12); - x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12); - - x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8); - x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8); - x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8); - x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8); - - x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7); - x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7); - x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7); - x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7); - - x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16); - x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16); - x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16); - x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16); - - x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12); - x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12); - x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12); - x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12); - - x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8); - x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8); - x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8); - x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8); - - x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7); - x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7); - x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7); - x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7); - } -} - -/** - * chacha20_block - generate one keystream block and increment block counter - * @state: input state matrix (16 32-bit words) - * @stream: output keystream block (64 bytes) - * - * This is the ChaCha20 core, a function from 64-byte strings to 64-byte - * strings. The caller has already converted the endianness of the input. This - * function also handles incrementing the block counter in the input matrix. - */ -void chacha20_block(u32 *state, u8 *stream) -{ - u32 x[16]; - int i; - - memcpy(x, state, 64); - - chacha20_permute(x); - - for (i = 0; i < ARRAY_SIZE(x); i++) - put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]); - - state[12]++; -} -EXPORT_SYMBOL(chacha20_block); - -/** - * hchacha20_block - abbreviated ChaCha20 core, for XChaCha20 - * @in: input state matrix (16 32-bit words) - * @out: output (8 32-bit words) - * - * HChaCha20 is the ChaCha equivalent of HSalsa20 and is an intermediate step - * towards XChaCha20 (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). - * HChaCha20 skips the final addition of the initial state, and outputs only - * certain words of the state. It should not be used for streaming directly. - */ -void hchacha20_block(const u32 *in, u32 *out) -{ - u32 x[16]; - - memcpy(x, in, 64); - - chacha20_permute(x); - - memcpy(&out[0], &x[0], 16); - memcpy(&out[4], &x[12], 16); -} -EXPORT_SYMBOL(hchacha20_block); -- cgit v1.2.3-59-g8ed1b From aa7624093cb7fbf4fea95e612580d8d29a819f67 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Nov 2018 17:26:22 -0800 Subject: crypto: chacha - add XChaCha12 support Now that the generic implementation of ChaCha20 has been refactored to allow varying the number of rounds, add support for XChaCha12, which is the XSalsa construction applied to ChaCha12. ChaCha12 is one of the three ciphers specified by the original ChaCha paper (https://cr.yp.to/chacha/chacha-20080128.pdf: "ChaCha, a variant of Salsa20"), alongside ChaCha8 and ChaCha20. ChaCha12 is faster than ChaCha20 but has a lower, but still large, security margin. We need XChaCha12 support so that it can be used in the Adiantum encryption mode, which enables disk/file encryption on low-end mobile devices where AES-XTS is too slow as the CPUs lack AES instructions. We'd prefer XChaCha20 (the more popular variant), but it's too slow on some of our target devices, so at least in some cases we do need the XChaCha12-based version. In more detail, the problem is that Adiantum is still much slower than we're happy with, and encryption still has a quite noticeable effect on the feel of low-end devices. Users and vendors push back hard against encryption that degrades the user experience, which always risks encryption being disabled entirely. So we need to choose the fastest option that gives us a solid margin of security, and here that's XChaCha12. The best known attack on ChaCha breaks only 7 rounds and has 2^235 time complexity, so ChaCha12's security margin is still better than AES-256's. Much has been learned about cryptanalysis of ARX ciphers since Salsa20 was originally designed in 2005, and it now seems we can be comfortable with a smaller number of rounds. The eSTREAM project also suggests the 12-round version of Salsa20 as providing the best balance among the different variants: combining very good performance with a "comfortable margin of security". Note that it would be trivial to add vanilla ChaCha12 in addition to XChaCha12. However, it's unneeded for now and therefore is omitted. As discussed in the patch that introduced XChaCha20 support, I considered splitting the code into separate chacha-common, chacha20, xchacha20, and xchacha12 modules, so that these algorithms could be enabled/disabled independently. However, since nearly all the code is shared anyway, I ultimately decided there would have been little benefit to the added complexity. Reviewed-by: Ard Biesheuvel Acked-by: Martin Willi Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/Kconfig | 8 +- crypto/chacha_generic.c | 26 ++- crypto/testmgr.c | 6 + crypto/testmgr.h | 578 ++++++++++++++++++++++++++++++++++++++++++++++++ include/crypto/chacha.h | 7 + lib/chacha.c | 6 +- 6 files changed, 625 insertions(+), 6 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index 75ebd1a2746c..4431c0db56b7 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1403,10 +1403,10 @@ config CRYPTO_SALSA20 Bernstein . See config CRYPTO_CHACHA20 - tristate "ChaCha20 stream cipher algorithms" + tristate "ChaCha stream cipher algorithms" select CRYPTO_BLKCIPHER help - The ChaCha20 and XChaCha20 stream cipher algorithms. + The ChaCha20, XChaCha20, and XChaCha12 stream cipher algorithms. ChaCha20 is a 256-bit high-speed stream cipher designed by Daniel J. Bernstein and further specified in RFC7539 for use in IETF protocols. @@ -1419,6 +1419,10 @@ config CRYPTO_CHACHA20 while provably retaining ChaCha20's security. See also: + XChaCha12 is XChaCha20 reduced to 12 rounds, with correspondingly + reduced security margin but increased performance. It can be needed + in some performance-sensitive scenarios. + config CRYPTO_CHACHA20_X86_64 tristate "ChaCha20 cipher algorithm (x86_64/SSSE3/AVX2)" depends on X86 && 64BIT diff --git a/crypto/chacha_generic.c b/crypto/chacha_generic.c index 438f15a14054..35b583101f4f 100644 --- a/crypto/chacha_generic.c +++ b/crypto/chacha_generic.c @@ -1,5 +1,5 @@ /* - * ChaCha20 (RFC7539) and XChaCha20 stream cipher algorithms + * ChaCha and XChaCha stream ciphers, including ChaCha20 (RFC7539) * * Copyright (C) 2015 Martin Willi * Copyright (C) 2018 Google LLC @@ -106,6 +106,13 @@ int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, } EXPORT_SYMBOL_GPL(crypto_chacha20_setkey); +int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keysize) +{ + return chacha_setkey(tfm, key, keysize, 12); +} +EXPORT_SYMBOL_GPL(crypto_chacha12_setkey); + int crypto_chacha_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); @@ -168,6 +175,21 @@ static struct skcipher_alg algs[] = { .setkey = crypto_chacha20_setkey, .encrypt = crypto_xchacha_crypt, .decrypt = crypto_xchacha_crypt, + }, { + .base.cra_name = "xchacha12", + .base.cra_driver_name = "xchacha12-generic", + .base.cra_priority = 100, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha12_setkey, + .encrypt = crypto_xchacha_crypt, + .decrypt = crypto_xchacha_crypt, } }; @@ -191,3 +213,5 @@ MODULE_ALIAS_CRYPTO("chacha20"); MODULE_ALIAS_CRYPTO("chacha20-generic"); MODULE_ALIAS_CRYPTO("xchacha20"); MODULE_ALIAS_CRYPTO("xchacha20-generic"); +MODULE_ALIAS_CRYPTO("xchacha12"); +MODULE_ALIAS_CRYPTO("xchacha12-generic"); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 11f5c8b0f4dc..6ff60c3745f1 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -3576,6 +3576,12 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .hash = __VECS(aes_xcbc128_tv_template) } + }, { + .alg = "xchacha12", + .test = alg_test_skcipher, + .suite = { + .cipher = __VECS(xchacha12_tv_template) + }, }, { .alg = "xchacha20", .test = alg_test_skcipher, diff --git a/crypto/testmgr.h b/crypto/testmgr.h index df0dc44a9b7b..a23dca2b11d0 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -31571,6 +31571,584 @@ static const struct cipher_testvec xchacha20_tv_template[] = { }, }; +/* + * Same as XChaCha20 test vectors above, but recomputed the ciphertext with + * XChaCha12, using a modified libsodium. + */ +static const struct cipher_testvec xchacha12_tv_template[] = { + { + .key = "\x79\xc9\x97\x98\xac\x67\x30\x0b" + "\xbb\x27\x04\xc9\x5c\x34\x1e\x32" + "\x45\xf3\xdc\xb2\x17\x61\xb9\x8e" + "\x52\xff\x45\xb2\x4f\x30\x4f\xc4", + .klen = 32, + .iv = "\xb3\x3f\xfd\x30\x96\x47\x9b\xcf" + "\xbc\x9a\xee\x49\x41\x76\x88\xa0" + "\xa2\x55\x4f\x8d\x95\x38\x94\x19" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00", + .ctext = "\x1b\x78\x7f\xd7\xa1\x41\x68\xab" + "\x3d\x3f\xd1\x7b\x69\x56\xb2\xd5" + "\x43\xce\xeb\xaf\x36\xf0\x29\x9d" + "\x3a\xfb\x18\xae\x1b", + .len = 29, + }, { + .key = "\x9d\x23\xbd\x41\x49\xcb\x97\x9c" + "\xcf\x3c\x5c\x94\xdd\x21\x7e\x98" + "\x08\xcb\x0e\x50\xcd\x0f\x67\x81" + "\x22\x35\xea\xaf\x60\x1d\x62\x32", + .klen = 32, + .iv = "\xc0\x47\x54\x82\x66\xb7\xc3\x70" + "\xd3\x35\x66\xa2\x42\x5c\xbf\x30" + "\xd8\x2d\x1e\xaf\x52\x94\x10\x9e" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00", + .ctext = "\xfb\x32\x09\x1d\x83\x05\xae\x4c" + "\x13\x1f\x12\x71\xf2\xca\xb2\xeb" + "\x5b\x83\x14\x7d\x83\xf6\x57\x77" + "\x2e\x40\x1f\x92\x2c\xf9\xec\x35" + "\x34\x1f\x93\xdf\xfb\x30\xd7\x35" + "\x03\x05\x78\xc1\x20\x3b\x7a\xe3" + "\x62\xa3\x89\xdc\x11\x11\x45\xa8" + "\x82\x89\xa0\xf1\x4e\xc7\x0f\x11" + "\x69\xdd\x0c\x84\x2b\x89\x5c\xdc" + "\xf0\xde\x01\xef\xc5\x65\x79\x23" + "\x87\x67\xd6\x50\xd9\x8d\xd9\x92" + "\x54\x5b\x0e", + .len = 91, + }, { + .key = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .klen = 32, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x67\xc6\x69\x73" + "\x51\xff\x4a\xec\x29\xcd\xba\xab" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .ctext = "\xdf\x2d\xc6\x21\x2a\x9d\xa1\xbb" + "\xc2\x77\x66\x0c\x5c\x46\xef\xa7" + "\x79\x1b\xb9\xdf\x55\xe2\xf9\x61" + "\x4c\x7b\xa4\x52\x24\xaf\xa2\xda" + "\xd1\x8f\x8f\xa2\x9e\x53\x4d\xc4" + "\xb8\x55\x98\x08\x7c\x08\xd4\x18" + "\x67\x8f\xef\x50\xb1\x5f\xa5\x77" + "\x4c\x25\xe7\x86\x26\x42\xca\x44", + .len = 64, + }, { + .key = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x01", + .klen = 32, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x02\xf2\xfb\xe3\x46" + "\x7c\xc2\x54\xf8\x1b\xe8\xe7\x8d" + "\x01\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x41\x6e\x79\x20\x73\x75\x62\x6d" + "\x69\x73\x73\x69\x6f\x6e\x20\x74" + "\x6f\x20\x74\x68\x65\x20\x49\x45" + "\x54\x46\x20\x69\x6e\x74\x65\x6e" + "\x64\x65\x64\x20\x62\x79\x20\x74" + "\x68\x65\x20\x43\x6f\x6e\x74\x72" + "\x69\x62\x75\x74\x6f\x72\x20\x66" + "\x6f\x72\x20\x70\x75\x62\x6c\x69" + "\x63\x61\x74\x69\x6f\x6e\x20\x61" + "\x73\x20\x61\x6c\x6c\x20\x6f\x72" + "\x20\x70\x61\x72\x74\x20\x6f\x66" + "\x20\x61\x6e\x20\x49\x45\x54\x46" + "\x20\x49\x6e\x74\x65\x72\x6e\x65" + "\x74\x2d\x44\x72\x61\x66\x74\x20" + "\x6f\x72\x20\x52\x46\x43\x20\x61" + "\x6e\x64\x20\x61\x6e\x79\x20\x73" + "\x74\x61\x74\x65\x6d\x65\x6e\x74" + "\x20\x6d\x61\x64\x65\x20\x77\x69" + "\x74\x68\x69\x6e\x20\x74\x68\x65" + "\x20\x63\x6f\x6e\x74\x65\x78\x74" + "\x20\x6f\x66\x20\x61\x6e\x20\x49" + "\x45\x54\x46\x20\x61\x63\x74\x69" + "\x76\x69\x74\x79\x20\x69\x73\x20" + "\x63\x6f\x6e\x73\x69\x64\x65\x72" + "\x65\x64\x20\x61\x6e\x20\x22\x49" + "\x45\x54\x46\x20\x43\x6f\x6e\x74" + "\x72\x69\x62\x75\x74\x69\x6f\x6e" + "\x22\x2e\x20\x53\x75\x63\x68\x20" + "\x73\x74\x61\x74\x65\x6d\x65\x6e" + "\x74\x73\x20\x69\x6e\x63\x6c\x75" + "\x64\x65\x20\x6f\x72\x61\x6c\x20" + "\x73\x74\x61\x74\x65\x6d\x65\x6e" + "\x74\x73\x20\x69\x6e\x20\x49\x45" + "\x54\x46\x20\x73\x65\x73\x73\x69" + "\x6f\x6e\x73\x2c\x20\x61\x73\x20" + "\x77\x65\x6c\x6c\x20\x61\x73\x20" + "\x77\x72\x69\x74\x74\x65\x6e\x20" + "\x61\x6e\x64\x20\x65\x6c\x65\x63" + "\x74\x72\x6f\x6e\x69\x63\x20\x63" + "\x6f\x6d\x6d\x75\x6e\x69\x63\x61" + "\x74\x69\x6f\x6e\x73\x20\x6d\x61" + "\x64\x65\x20\x61\x74\x20\x61\x6e" + "\x79\x20\x74\x69\x6d\x65\x20\x6f" + "\x72\x20\x70\x6c\x61\x63\x65\x2c" + "\x20\x77\x68\x69\x63\x68\x20\x61" + "\x72\x65\x20\x61\x64\x64\x72\x65" + "\x73\x73\x65\x64\x20\x74\x6f", + .ctext = "\xe4\xa6\xc8\x30\xc4\x23\x13\xd6" + "\x08\x4d\xc9\xb7\xa5\x64\x7c\xb9" + "\x71\xe2\xab\x3e\xa8\x30\x8a\x1c" + "\x4a\x94\x6d\x9b\xe0\xb3\x6f\xf1" + "\xdc\xe3\x1b\xb3\xa9\x6d\x0d\xd6" + "\xd0\xca\x12\xef\xe7\x5f\xd8\x61" + "\x3c\x82\xd3\x99\x86\x3c\x6f\x66" + "\x02\x06\xdc\x55\xf9\xed\xdf\x38" + "\xb4\xa6\x17\x00\x7f\xef\xbf\x4f" + "\xf8\x36\xf1\x60\x7e\x47\xaf\xdb" + "\x55\x9b\x12\xcb\x56\x44\xa7\x1f" + "\xd3\x1a\x07\x3b\x00\xec\xe6\x4c" + "\xa2\x43\x27\xdf\x86\x19\x4f\x16" + "\xed\xf9\x4a\xf3\x63\x6f\xfa\x7f" + "\x78\x11\xf6\x7d\x97\x6f\xec\x6f" + "\x85\x0f\x5c\x36\x13\x8d\x87\xe0" + "\x80\xb1\x69\x0b\x98\x89\x9c\x4e" + "\xf8\xdd\xee\x5c\x0a\x85\xce\xd4" + "\xea\x1b\x48\xbe\x08\xf8\xe2\xa8" + "\xa5\xb0\x3c\x79\xb1\x15\xb4\xb9" + "\x75\x10\x95\x35\x81\x7e\x26\xe6" + "\x78\xa4\x88\xcf\xdb\x91\x34\x18" + "\xad\xd7\x8e\x07\x7d\xab\x39\xf9" + "\xa3\x9e\xa5\x1d\xbb\xed\x61\xfd" + "\xdc\xb7\x5a\x27\xfc\xb5\xc9\x10" + "\xa8\xcc\x52\x7f\x14\x76\x90\xe7" + "\x1b\x29\x60\x74\xc0\x98\x77\xbb" + "\xe0\x54\xbb\x27\x49\x59\x1e\x62" + "\x3d\xaf\x74\x06\xa4\x42\x6f\xc6" + "\x52\x97\xc4\x1d\xc4\x9f\xe2\xe5" + "\x38\x57\x91\xd1\xa2\x28\xcc\x40" + "\xcc\x70\x59\x37\xfc\x9f\x4b\xda" + "\xa0\xeb\x97\x9a\x7d\xed\x14\x5c" + "\x9c\xb7\x93\x26\x41\xa8\x66\xdd" + "\x87\x6a\xc0\xd3\xc2\xa9\x3e\xae" + "\xe9\x72\xfe\xd1\xb3\xac\x38\xea" + "\x4d\x15\xa9\xd5\x36\x61\xe9\x96" + "\x6c\x23\xf8\x43\xe4\x92\x29\xd9" + "\x8b\x78\xf7\x0a\x52\xe0\x19\x5b" + "\x59\x69\x5b\x5d\xa1\x53\xc4\x68" + "\xe1\xbb\xac\x89\x14\xe2\xe2\x85" + "\x41\x18\xf5\xb3\xd1\xfa\x68\x19" + "\x44\x78\xdc\xcf\xe7\x88\x2d\x52" + "\x5f\x40\xb5\x7e\xf8\x88\xa2\xae" + "\x4a\xb2\x07\x35\x9d\x9b\x07\x88" + "\xb7\x00\xd0\x0c\xb6\xa0\x47\x59" + "\xda\x4e\xc9\xab\x9b\x8a\x7b", + + .len = 375, + .also_non_np = 1, + .np = 3, + .tap = { 375 - 20, 4, 16 }, + + }, { + .key = "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a" + "\xf3\x33\x88\x86\x04\xf6\xb5\xf0" + "\x47\x39\x17\xc1\x40\x2b\x80\x09" + "\x9d\xca\x5c\xbc\x20\x70\x75\xc0", + .klen = 32, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x02\x76\x5a\x2e\x63" + "\x33\x9f\xc9\x9a\x66\x32\x0d\xb7" + "\x2a\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x27\x54\x77\x61\x73\x20\x62\x72" + "\x69\x6c\x6c\x69\x67\x2c\x20\x61" + "\x6e\x64\x20\x74\x68\x65\x20\x73" + "\x6c\x69\x74\x68\x79\x20\x74\x6f" + "\x76\x65\x73\x0a\x44\x69\x64\x20" + "\x67\x79\x72\x65\x20\x61\x6e\x64" + "\x20\x67\x69\x6d\x62\x6c\x65\x20" + "\x69\x6e\x20\x74\x68\x65\x20\x77" + "\x61\x62\x65\x3a\x0a\x41\x6c\x6c" + "\x20\x6d\x69\x6d\x73\x79\x20\x77" + "\x65\x72\x65\x20\x74\x68\x65\x20" + "\x62\x6f\x72\x6f\x67\x6f\x76\x65" + "\x73\x2c\x0a\x41\x6e\x64\x20\x74" + "\x68\x65\x20\x6d\x6f\x6d\x65\x20" + "\x72\x61\x74\x68\x73\x20\x6f\x75" + "\x74\x67\x72\x61\x62\x65\x2e", + .ctext = "\xb9\x68\xbc\x6a\x24\xbc\xcc\xd8" + "\x9b\x2a\x8d\x5b\x96\xaf\x56\xe3" + "\x11\x61\xe7\xa7\x9b\xce\x4e\x7d" + "\x60\x02\x48\xac\xeb\xd5\x3a\x26" + "\x9d\x77\x3b\xb5\x32\x13\x86\x8e" + "\x20\x82\x26\x72\xae\x64\x1b\x7e" + "\x2e\x01\x68\xb4\x87\x45\xa1\x24" + "\xe4\x48\x40\xf0\xaa\xac\xee\xa9" + "\xfc\x31\xad\x9d\x89\xa3\xbb\xd2" + "\xe4\x25\x13\xad\x0f\x5e\xdf\x3c" + "\x27\xab\xb8\x62\x46\x22\x30\x48" + "\x55\x2c\x4e\x84\x78\x1d\x0d\x34" + "\x8d\x3c\x91\x0a\x7f\x5b\x19\x9f" + "\x97\x05\x4c\xa7\x62\x47\x8b\xc5" + "\x44\x2e\x20\x33\xdd\xa0\x82\xa9" + "\x25\x76\x37\xe6\x3c\x67\x5b", + .len = 127, + }, { + .key = "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a" + "\xf3\x33\x88\x86\x04\xf6\xb5\xf0" + "\x47\x39\x17\xc1\x40\x2b\x80\x09" + "\x9d\xca\x5c\xbc\x20\x70\x75\xc0", + .klen = 32, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x01\x31\x58\xa3\x5a" + "\x25\x5d\x05\x17\x58\xe9\x5e\xd4" + "\x1c\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x49\xee\xe0\xdc\x24\x90\x40\xcd" + "\xc5\x40\x8f\x47\x05\xbc\xdd\x81" + "\x47\xc6\x8d\xe6\xb1\x8f\xd7\xcb" + "\x09\x0e\x6e\x22\x48\x1f\xbf\xb8" + "\x5c\xf7\x1e\x8a\xc1\x23\xf2\xd4" + "\x19\x4b\x01\x0f\x4e\xa4\x43\xce" + "\x01\xc6\x67\xda\x03\x91\x18\x90" + "\xa5\xa4\x8e\x45\x03\xb3\x2d\xac" + "\x74\x92\xd3\x53\x47\xc8\xdd\x25" + "\x53\x6c\x02\x03\x87\x0d\x11\x0c" + "\x58\xe3\x12\x18\xfd\x2a\x5b\x40" + "\x0c\x30\xf0\xb8\x3f\x43\xce\xae" + "\x65\x3a\x7d\x7c\xf4\x54\xaa\xcc" + "\x33\x97\xc3\x77\xba\xc5\x70\xde" + "\xd7\xd5\x13\xa5\x65\xc4\x5f\x0f" + "\x46\x1a\x0d\x97\xb5\xf3\xbb\x3c" + "\x84\x0f\x2b\xc5\xaa\xea\xf2\x6c" + "\xc9\xb5\x0c\xee\x15\xf3\x7d\xbe" + "\x9f\x7b\x5a\xa6\xae\x4f\x83\xb6" + "\x79\x49\x41\xf4\x58\x18\xcb\x86" + "\x7f\x30\x0e\xf8\x7d\x44\x36\xea" + "\x75\xeb\x88\x84\x40\x3c\xad\x4f" + "\x6f\x31\x6b\xaa\x5d\xe5\xa5\xc5" + "\x21\x66\xe9\xa7\xe3\xb2\x15\x88" + "\x78\xf6\x79\xa1\x59\x47\x12\x4e" + "\x9f\x9f\x64\x1a\xa0\x22\x5b\x08" + "\xbe\x7c\x36\xc2\x2b\x66\x33\x1b" + "\xdd\x60\x71\xf7\x47\x8c\x61\xc3" + "\xda\x8a\x78\x1e\x16\xfa\x1e\x86" + "\x81\xa6\x17\x2a\xa7\xb5\xc2\xe7" + "\xa4\xc7\x42\xf1\xcf\x6a\xca\xb4" + "\x45\xcf\xf3\x93\xf0\xe7\xea\xf6" + "\xf4\xe6\x33\x43\x84\x93\xa5\x67" + "\x9b\x16\x58\x58\x80\x0f\x2b\x5c" + "\x24\x74\x75\x7f\x95\x81\xb7\x30" + "\x7a\x33\xa7\xf7\x94\x87\x32\x27" + "\x10\x5d\x14\x4c\x43\x29\xdd\x26" + "\xbd\x3e\x3c\x0e\xfe\x0e\xa5\x10" + "\xea\x6b\x64\xfd\x73\xc6\xed\xec" + "\xa8\xc9\xbf\xb3\xba\x0b\x4d\x07" + "\x70\xfc\x16\xfd\x79\x1e\xd7\xc5" + "\x49\x4e\x1c\x8b\x8d\x79\x1b\xb1" + "\xec\xca\x60\x09\x4c\x6a\xd5\x09" + "\x49\x46\x00\x88\x22\x8d\xce\xea" + "\xb1\x17\x11\xde\x42\xd2\x23\xc1" + "\x72\x11\xf5\x50\x73\x04\x40\x47" + "\xf9\x5d\xe7\xa7\x26\xb1\x7e\xb0" + "\x3f\x58\xc1\x52\xab\x12\x67\x9d" + "\x3f\x43\x4b\x68\xd4\x9c\x68\x38" + "\x07\x8a\x2d\x3e\xf3\xaf\x6a\x4b" + "\xf9\xe5\x31\x69\x22\xf9\xa6\x69" + "\xc6\x9c\x96\x9a\x12\x35\x95\x1d" + "\x95\xd5\xdd\xbe\xbf\x93\x53\x24" + "\xfd\xeb\xc2\x0a\x64\xb0\x77\x00" + "\x6f\x88\xc4\x37\x18\x69\x7c\xd7" + "\x41\x92\x55\x4c\x03\xa1\x9a\x4b" + "\x15\xe5\xdf\x7f\x37\x33\x72\xc1" + "\x8b\x10\x67\xa3\x01\x57\x94\x25" + "\x7b\x38\x71\x7e\xdd\x1e\xcc\x73" + "\x55\xd2\x8e\xeb\x07\xdd\xf1\xda" + "\x58\xb1\x47\x90\xfe\x42\x21\x72" + "\xa3\x54\x7a\xa0\x40\xec\x9f\xdd" + "\xc6\x84\x6e\xca\xae\xe3\x68\xb4" + "\x9d\xe4\x78\xff\x57\xf2\xf8\x1b" + "\x03\xa1\x31\xd9\xde\x8d\xf5\x22" + "\x9c\xdd\x20\xa4\x1e\x27\xb1\x76" + "\x4f\x44\x55\xe2\x9b\xa1\x9c\xfe" + "\x54\xf7\x27\x1b\xf4\xde\x02\xf5" + "\x1b\x55\x48\x5c\xdc\x21\x4b\x9e" + "\x4b\x6e\xed\x46\x23\xdc\x65\xb2" + "\xcf\x79\x5f\x28\xe0\x9e\x8b\xe7" + "\x4c\x9d\x8a\xff\xc1\xa6\x28\xb8" + "\x65\x69\x8a\x45\x29\xef\x74\x85" + "\xde\x79\xc7\x08\xae\x30\xb0\xf4" + "\xa3\x1d\x51\x41\xab\xce\xcb\xf6" + "\xb5\xd8\x6d\xe0\x85\xe1\x98\xb3" + "\x43\xbb\x86\x83\x0a\xa0\xf5\xb7" + "\x04\x0b\xfa\x71\x1f\xb0\xf6\xd9" + "\x13\x00\x15\xf0\xc7\xeb\x0d\x5a" + "\x9f\xd7\xb9\x6c\x65\x14\x22\x45" + "\x6e\x45\x32\x3e\x7e\x60\x1a\x12" + "\x97\x82\x14\xfb\xaa\x04\x22\xfa" + "\xa0\xe5\x7e\x8c\x78\x02\x48\x5d" + "\x78\x33\x5a\x7c\xad\xdb\x29\xce" + "\xbb\x8b\x61\xa4\xb7\x42\xe2\xac" + "\x8b\x1a\xd9\x2f\x0b\x8b\x62\x21" + "\x83\x35\x7e\xad\x73\xc2\xb5\x6c" + "\x10\x26\x38\x07\xe5\xc7\x36\x80" + "\xe2\x23\x12\x61\xf5\x48\x4b\x2b" + "\xc5\xdf\x15\xd9\x87\x01\xaa\xac" + "\x1e\x7c\xad\x73\x78\x18\x63\xe0" + "\x8b\x9f\x81\xd8\x12\x6a\x28\x10" + "\xbe\x04\x68\x8a\x09\x7c\x1b\x1c" + "\x83\x66\x80\x47\x80\xe8\xfd\x35" + "\x1c\x97\x6f\xae\x49\x10\x66\xcc" + "\xc6\xd8\xcc\x3a\x84\x91\x20\x77" + "\x72\xe4\x24\xd2\x37\x9f\xc5\xc9" + "\x25\x94\x10\x5f\x40\x00\x64\x99" + "\xdc\xae\xd7\x21\x09\x78\x50\x15" + "\xac\x5f\xc6\x2c\xa2\x0b\xa9\x39" + "\x87\x6e\x6d\xab\xde\x08\x51\x16" + "\xc7\x13\xe9\xea\xed\x06\x8e\x2c" + "\xf8\x37\x8c\xf0\xa6\x96\x8d\x43" + "\xb6\x98\x37\xb2\x43\xed\xde\xdf" + "\x89\x1a\xe7\xeb\x9d\xa1\x7b\x0b" + "\x77\xb0\xe2\x75\xc0\xf1\x98\xd9" + "\x80\x55\xc9\x34\x91\xd1\x59\xe8" + "\x4b\x0f\xc1\xa9\x4b\x7a\x84\x06" + "\x20\xa8\x5d\xfa\xd1\xde\x70\x56" + "\x2f\x9e\x91\x9c\x20\xb3\x24\xd8" + "\x84\x3d\xe1\x8c\x7e\x62\x52\xe5" + "\x44\x4b\x9f\xc2\x93\x03\xea\x2b" + "\x59\xc5\xfa\x3f\x91\x2b\xbb\x23" + "\xf5\xb2\x7b\xf5\x38\xaf\xb3\xee" + "\x63\xdc\x7b\xd1\xff\xaa\x8b\xab" + "\x82\x6b\x37\x04\xeb\x74\xbe\x79" + "\xb9\x83\x90\xef\x20\x59\x46\xff" + "\xe9\x97\x3e\x2f\xee\xb6\x64\x18" + "\x38\x4c\x7a\x4a\xf9\x61\xe8\x9a" + "\xa1\xb5\x01\xa6\x47\xd3\x11\xd4" + "\xce\xd3\x91\x49\x88\xc7\xb8\x4d" + "\xb1\xb9\x07\x6d\x16\x72\xae\x46" + "\x5e\x03\xa1\x4b\xb6\x02\x30\xa8" + "\x3d\xa9\x07\x2a\x7c\x19\xe7\x62" + "\x87\xe3\x82\x2f\x6f\xe1\x09\xd9" + "\x94\x97\xea\xdd\x58\x9e\xae\x76" + "\x7e\x35\xe5\xb4\xda\x7e\xf4\xde" + "\xf7\x32\x87\xcd\x93\xbf\x11\x56" + "\x11\xbe\x08\x74\xe1\x69\xad\xe2" + "\xd7\xf8\x86\x75\x8a\x3c\xa4\xbe" + "\x70\xa7\x1b\xfc\x0b\x44\x2a\x76" + "\x35\xea\x5d\x85\x81\xaf\x85\xeb" + "\xa0\x1c\x61\xc2\xf7\x4f\xa5\xdc" + "\x02\x7f\xf6\x95\x40\x6e\x8a\x9a" + "\xf3\x5d\x25\x6e\x14\x3a\x22\xc9" + "\x37\x1c\xeb\x46\x54\x3f\xa5\x91" + "\xc2\xb5\x8c\xfe\x53\x08\x97\x32" + "\x1b\xb2\x30\x27\xfe\x25\x5d\xdc" + "\x08\x87\xd0\xe5\x94\x1a\xd4\xf1" + "\xfe\xd6\xb4\xa3\xe6\x74\x81\x3c" + "\x1b\xb7\x31\xa7\x22\xfd\xd4\xdd" + "\x20\x4e\x7c\x51\xb0\x60\x73\xb8" + "\x9c\xac\x91\x90\x7e\x01\xb0\xe1" + "\x8a\x2f\x75\x1c\x53\x2a\x98\x2a" + "\x06\x52\x95\x52\xb2\xe9\x25\x2e" + "\x4c\xe2\x5a\x00\xb2\x13\x81\x03" + "\x77\x66\x0d\xa5\x99\xda\x4e\x8c" + "\xac\xf3\x13\x53\x27\x45\xaf\x64" + "\x46\xdc\xea\x23\xda\x97\xd1\xab" + "\x7d\x6c\x30\x96\x1f\xbc\x06\x34" + "\x18\x0b\x5e\x21\x35\x11\x8d\x4c" + "\xe0\x2d\xe9\x50\x16\x74\x81\xa8" + "\xb4\x34\xb9\x72\x42\xa6\xcc\xbc" + "\xca\x34\x83\x27\x10\x5b\x68\x45" + "\x8f\x52\x22\x0c\x55\x3d\x29\x7c" + "\xe3\xc0\x66\x05\x42\x91\x5f\x58" + "\xfe\x4a\x62\xd9\x8c\xa9\x04\x19" + "\x04\xa9\x08\x4b\x57\xfc\x67\x53" + "\x08\x7c\xbc\x66\x8a\xb0\xb6\x9f" + "\x92\xd6\x41\x7c\x5b\x2a\x00\x79" + "\x72", + .ctext = "\xe1\xb6\x8b\x5c\x80\xb8\xcc\x08" + "\x1b\x84\xb2\xd1\xad\xa4\x70\xac" + "\x67\xa9\x39\x27\xac\xb4\x5b\xb7" + "\x4c\x26\x77\x23\x1d\xce\x0a\xbe" + "\x18\x9e\x42\x8b\xbd\x7f\xd6\xf1" + "\xf1\x6b\xe2\x6d\x7f\x92\x0e\xcb" + "\xb8\x79\xba\xb4\xac\x7e\x2d\xc0" + "\x9e\x83\x81\x91\xd5\xea\xc3\x12" + "\x8d\xa4\x26\x70\xa4\xf9\x71\x0b" + "\xbd\x2e\xe1\xb3\x80\x42\x25\xb3" + "\x0b\x31\x99\xe1\x0d\xde\xa6\x90" + "\xf2\xa3\x10\xf7\xe5\xf3\x83\x1e" + "\x2c\xfb\x4d\xf0\x45\x3d\x28\x3c" + "\xb8\xf1\xcb\xbf\x67\xd8\x43\x5a" + "\x9d\x7b\x73\x29\x88\x0f\x13\x06" + "\x37\x50\x0d\x7c\xe6\x9b\x07\xdd" + "\x7e\x01\x1f\x81\x90\x10\x69\xdb" + "\xa4\xad\x8a\x5e\xac\x30\x72\xf2" + "\x36\xcd\xe3\x23\x49\x02\x93\xfa" + "\x3d\xbb\xe2\x98\x83\xeb\xe9\x8d" + "\xb3\x8f\x11\xaa\x53\xdb\xaf\x2e" + "\x95\x13\x99\x3d\x71\xbd\x32\x92" + "\xdd\xfc\x9d\x5e\x6f\x63\x2c\xee" + "\x91\x1f\x4c\x64\x3d\x87\x55\x0f" + "\xcc\x3d\x89\x61\x53\x02\x57\x8f" + "\xe4\x77\x29\x32\xaf\xa6\x2f\x0a" + "\xae\x3c\x3f\x3f\xf4\xfb\x65\x52" + "\xc5\xc1\x78\x78\x53\x28\xad\xed" + "\xd1\x67\x37\xc7\x59\x70\xcd\x0a" + "\xb8\x0f\x80\x51\x9f\xc0\x12\x5e" + "\x06\x0a\x7e\xec\x24\x5f\x73\x00" + "\xb1\x0b\x31\x47\x4f\x73\x8d\xb4" + "\xce\xf3\x55\x45\x6c\x84\x27\xba" + "\xb9\x6f\x03\x4a\xeb\x98\x88\x6e" + "\x53\xed\x25\x19\x0d\x8f\xfe\xca" + "\x60\xe5\x00\x93\x6e\x3c\xff\x19" + "\xae\x08\x3b\x8a\xa6\x84\x05\xfe" + "\x9b\x59\xa0\x8c\xc8\x05\x45\xf5" + "\x05\x37\xdc\x45\x6f\x8b\x95\x8c" + "\x4e\x11\x45\x7a\xce\x21\xa5\xf7" + "\x71\x67\xb9\xce\xd7\xf9\xe9\x5e" + "\x60\xf5\x53\x7a\xa8\x85\x14\x03" + "\xa0\x92\xec\xf3\x51\x80\x84\xc4" + "\xdc\x11\x9e\x57\xce\x4b\x45\xcf" + "\x90\x95\x85\x0b\x96\xe9\xee\x35" + "\x10\xb8\x9b\xf2\x59\x4a\xc6\x7e" + "\x85\xe5\x6f\x38\x51\x93\x40\x0c" + "\x99\xd7\x7f\x32\xa8\x06\x27\xd1" + "\x2b\xd5\xb5\x3a\x1a\xe1\x5e\xda" + "\xcd\x5a\x50\x30\x3c\xc7\xe7\x65" + "\xa6\x07\x0b\x98\x91\xc6\x20\x27" + "\x2a\x03\x63\x1b\x1e\x3d\xaf\xc8" + "\x71\x48\x46\x6a\x64\x28\xf9\x3d" + "\xd1\x1d\xab\xc8\x40\x76\xc2\x39" + "\x4e\x00\x75\xd2\x0e\x82\x58\x8c" + "\xd3\x73\x5a\xea\x46\x89\xbe\xfd" + "\x4e\x2c\x0d\x94\xaa\x9b\x68\xac" + "\x86\x87\x30\x7e\xa9\x16\xcd\x59" + "\xd2\xa6\xbe\x0a\xd8\xf5\xfd\x2d" + "\x49\x69\xd2\x1a\x90\xd2\x1b\xed" + "\xff\x71\x04\x87\x87\x21\xc4\xb8" + "\x1f\x5b\x51\x33\xd0\xd6\x59\x9a" + "\x03\x0e\xd3\x8b\xfb\x57\x73\xfd" + "\x5a\x52\x63\x82\xc8\x85\x2f\xcb" + "\x74\x6d\x4e\xd9\x68\x37\x85\x6a" + "\xd4\xfb\x94\xed\x8d\xd1\x1a\xaf" + "\x76\xa7\xb7\x88\xd0\x2b\x4e\xda" + "\xec\x99\x94\x27\x6f\x87\x8c\xdf" + "\x4b\x5e\xa6\x66\xdd\xcb\x33\x7b" + "\x64\x94\x31\xa8\x37\xa6\x1d\xdb" + "\x0d\x5c\x93\xa4\x40\xf9\x30\x53" + "\x4b\x74\x8d\xdd\xf6\xde\x3c\xac" + "\x5c\x80\x01\x3a\xef\xb1\x9a\x02" + "\x0c\x22\x8e\xe7\x44\x09\x74\x4c" + "\xf2\x9a\x27\x69\x7f\x12\x32\x36" + "\xde\x92\xdf\xde\x8f\x5b\x31\xab" + "\x4a\x01\x26\xe0\xb1\xda\xe8\x37" + "\x21\x64\xe8\xff\x69\xfc\x9e\x41" + "\xd2\x96\x2d\x18\x64\x98\x33\x78" + "\x24\x61\x73\x9b\x47\x29\xf1\xa7" + "\xcb\x27\x0f\xf0\x85\x6d\x8c\x9d" + "\x2c\x95\x9e\xe5\xb2\x8e\x30\x29" + "\x78\x8a\x9d\x65\xb4\x8e\xde\x7b" + "\xd9\x00\x50\xf5\x7f\x81\xc3\x1b" + "\x25\x85\xeb\xc2\x8c\x33\x22\x1e" + "\x68\x38\x22\x30\xd8\x2e\x00\x98" + "\x85\x16\x06\x56\xb4\x81\x74\x20" + "\x95\xdb\x1c\x05\x19\xe8\x23\x4d" + "\x65\x5d\xcc\xd8\x7f\xc4\x2d\x0f" + "\x57\x26\x71\x07\xad\xaa\x71\x9f" + "\x19\x76\x2f\x25\x51\x88\xe4\xc0" + "\x82\x6e\x08\x05\x37\x04\xee\x25" + "\x23\x90\xe9\x4e\xce\x9b\x16\xc1" + "\x31\xe7\x6e\x2c\x1b\xe1\x85\x9a" + "\x0c\x8c\xbb\x12\x1e\x68\x7b\x93" + "\xa9\x3c\x39\x56\x23\x3e\x6e\xc7" + "\x77\x84\xd3\xe0\x86\x59\xaa\xb9" + "\xd5\x53\x58\xc9\x0a\x83\x5f\x85" + "\xd8\x47\x14\x67\x8a\x3c\x17\xe0" + "\xab\x02\x51\xea\xf1\xf0\x4f\x30" + "\x7d\xe0\x92\xc2\x5f\xfb\x19\x5a" + "\x3f\xbd\xf4\x39\xa4\x31\x0c\x39" + "\xd1\xae\x4e\xf7\x65\x7f\x1f\xce" + "\xc2\x39\xd1\x84\xd4\xe5\x02\xe0" + "\x58\xaa\xf1\x5e\x81\xaf\x7f\x72" + "\x0f\x08\x99\x43\xb9\xd8\xac\x41" + "\x35\x55\xf2\xb2\xd4\x98\xb8\x3b" + "\x2b\x3c\x3e\x16\x06\x31\xfc\x79" + "\x47\x38\x63\x51\xc5\xd0\x26\xd7" + "\x43\xb4\x2b\xd9\xc5\x05\xf2\x9d" + "\x18\xc9\x26\x82\x56\xd2\x11\x05" + "\xb6\x89\xb4\x43\x9c\xb5\x9d\x11" + "\x6c\x83\x37\x71\x27\x1c\xae\xbf" + "\xcd\x57\xd2\xee\x0d\x5a\x15\x26" + "\x67\x88\x80\x80\x1b\xdc\xc1\x62" + "\xdd\x4c\xff\x92\x5c\x6c\xe1\xa0" + "\xe3\x79\xa9\x65\x8c\x8c\x14\x42" + "\xe5\x11\xd2\x1a\xad\xa9\x56\x6f" + "\x98\xfc\x8a\x7b\x56\x1f\xc6\xc1" + "\x52\x12\x92\x9b\x41\x0f\x4b\xae" + "\x1b\x4a\xbc\xfe\x23\xb6\x94\x70" + "\x04\x30\x9e\x69\x47\xbe\xb8\x8f" + "\xca\x45\xd7\x8a\xf4\x78\x3e\xaa" + "\x71\x17\xd8\x1e\xb8\x11\x8f\xbc" + "\xc8\x1a\x65\x7b\x41\x89\x72\xc7" + "\x5f\xbe\xc5\x2a\xdb\x5c\x54\xf9" + "\x25\xa3\x7a\x80\x56\x9c\x8c\xab" + "\x26\x19\x10\x36\xa6\xf3\x14\x79" + "\x40\x98\x70\x68\xb7\x35\xd9\xb9" + "\x27\xd4\xe7\x74\x5b\x3d\x97\xb4" + "\xd9\xaa\xd9\xf2\xb5\x14\x84\x1f" + "\xa9\xde\x12\x44\x5b\x00\xc0\xbc" + "\xc8\x11\x25\x1b\x67\x7a\x15\x72" + "\xa6\x31\x6f\xf4\x68\x7a\x86\x9d" + "\x43\x1c\x5f\x16\xd3\xad\x2e\x52" + "\xf3\xb4\xc3\xfa\x27\x2e\x68\x6c" + "\x06\xe7\x4c\x4f\xa2\xe0\xe4\x21" + "\x5d\x9e\x33\x58\x8d\xbf\xd5\x70" + "\xf8\x80\xa5\xdd\xe7\x18\x79\xfa" + "\x7b\xfd\x09\x69\x2c\x37\x32\xa8" + "\x65\xfa\x8d\x8b\x5c\xcc\xe8\xf3" + "\x37\xf6\xa6\xc6\x5c\xa2\x66\x79" + "\xfa\x8a\xa7\xd1\x0b\x2e\x1b\x5e" + "\x95\x35\x00\x76\xae\x42\xf7\x50" + "\x51\x78\xfb\xb4\x28\x24\xde\x1a" + "\x70\x8b\xed\xca\x3c\x5e\xe4\xbd" + "\x28\xb5\xf3\x76\x4f\x67\x5d\x81" + "\xb2\x60\x87\xd9\x7b\x19\x1a\xa7" + "\x79\xa2\xfa\x3f\x9e\xa9\xd7\x25" + "\x61\xe1\x74\x31\xa2\x77\xa0\x1b" + "\xf6\xf7\xcb\xc5\xaa\x9e\xce\xf9" + "\x9b\x96\xef\x51\xc3\x1a\x44\x96" + "\xae\x17\x50\xab\x29\x08\xda\xcc" + "\x1a\xb3\x12\xd0\x24\xe4\xe2\xe0" + "\xc6\xe3\xcc\x82\xd0\xba\x47\x4c" + "\x3f\x49\xd7\xe8\xb6\x61\xaa\x65" + "\x25\x18\x40\x2d\x62\x25\x02\x71" + "\x61\xa2\xc1\xb2\x13\xd2\x71\x3f" + "\x43\x1a\xc9\x09\x92\xff\xd5\x57" + "\xf0\xfc\x5e\x1c\xf1\xf5\xf9\xf3" + "\x5b", + .len = 1281, + .also_non_np = 1, + .np = 3, + .tap = { 1200, 1, 80 }, + }, +}; + /* * CTS (Cipher Text Stealing) mode tests */ diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h index b722a23e54bb..1fc70a69d550 100644 --- a/include/crypto/chacha.h +++ b/include/crypto/chacha.h @@ -5,6 +5,11 @@ * XChaCha extends ChaCha's nonce to 192 bits, while provably retaining ChaCha's * security. Here they share the same key size, tfm context, and setkey * function; only their IV size and encrypt/decrypt function differ. + * + * The ChaCha paper specifies 20, 12, and 8-round variants. In general, it is + * recommended to use the 20-round variant ChaCha20. However, the other + * variants can be needed in some performance-sensitive scenarios. The generic + * ChaCha code currently allows only the 20 and 12-round variants. */ #ifndef _CRYPTO_CHACHA_H @@ -40,6 +45,8 @@ void crypto_chacha_init(u32 *state, struct chacha_ctx *ctx, u8 *iv); int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize); +int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keysize); int crypto_chacha_crypt(struct skcipher_request *req); int crypto_xchacha_crypt(struct skcipher_request *req); diff --git a/lib/chacha.c b/lib/chacha.c index 1bdc688c18df..a46d2832dbab 100644 --- a/lib/chacha.c +++ b/lib/chacha.c @@ -21,7 +21,7 @@ static void chacha_permute(u32 *x, int nrounds) int i; /* whitelist the allowed round counts */ - WARN_ON_ONCE(nrounds != 20); + WARN_ON_ONCE(nrounds != 20 && nrounds != 12); for (i = 0; i < nrounds; i += 2) { x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16); @@ -70,7 +70,7 @@ static void chacha_permute(u32 *x, int nrounds) * chacha_block - generate one keystream block and increment block counter * @state: input state matrix (16 32-bit words) * @stream: output keystream block (64 bytes) - * @nrounds: number of rounds (currently must be 20) + * @nrounds: number of rounds (20 or 12; 20 is recommended) * * This is the ChaCha core, a function from 64-byte strings to 64-byte strings. * The caller has already converted the endianness of the input. This function @@ -96,7 +96,7 @@ EXPORT_SYMBOL(chacha_block); * hchacha_block - abbreviated ChaCha core, for XChaCha * @in: input state matrix (16 32-bit words) * @out: output (8 32-bit words) - * @nrounds: number of rounds (currently must be 20) + * @nrounds: number of rounds (20 or 12; 20 is recommended) * * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha -- cgit v1.2.3-59-g8ed1b From be2830b15b60011845ad701076511e8b93b2fd76 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Nov 2018 17:26:23 -0800 Subject: crypto: arm/chacha20 - limit the preemption-disabled section To improve responsivesess, disable preemption for each step of the walk (which is at most PAGE_SIZE) rather than for the entire encryption/decryption operation. Suggested-by: Ard Biesheuvel Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm/crypto/chacha20-neon-glue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/crypto/chacha20-neon-glue.c b/arch/arm/crypto/chacha20-neon-glue.c index 7386eb1c1889..2bc035cb8f23 100644 --- a/arch/arm/crypto/chacha20-neon-glue.c +++ b/arch/arm/crypto/chacha20-neon-glue.c @@ -68,22 +68,22 @@ static int chacha20_neon(struct skcipher_request *req) if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) return crypto_chacha_crypt(req); - err = skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); crypto_chacha_init(state, ctx, walk.iv); - kernel_neon_begin(); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; if (nbytes < walk.total) nbytes = round_down(nbytes, walk.stride); + kernel_neon_begin(); chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr, nbytes); + kernel_neon_end(); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } - kernel_neon_end(); return err; } -- cgit v1.2.3-59-g8ed1b From d97a94309d764ed907d4281da6246f5d935166f8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Nov 2018 17:26:24 -0800 Subject: crypto: arm/chacha20 - add XChaCha20 support Add an XChaCha20 implementation that is hooked up to the ARM NEON implementation of ChaCha20. This is needed for use in the Adiantum encryption mode; see the generic code patch, "crypto: chacha20-generic - add XChaCha20 support", for more details. We also update the NEON code to support HChaCha20 on one block, so we can use that in XChaCha20 rather than calling the generic HChaCha20. This required factoring the permutation out into its own macro. Reviewed-by: Ard Biesheuvel Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm/crypto/Kconfig | 2 +- arch/arm/crypto/chacha20-neon-core.S | 70 ++++++++++++++++-------- arch/arm/crypto/chacha20-neon-glue.c | 103 ++++++++++++++++++++++++++--------- 3 files changed, 126 insertions(+), 49 deletions(-) diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index 0473a8f68389..a08759c32cb9 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -126,7 +126,7 @@ config CRYPTO_CRC32_ARM_CE select CRYPTO_HASH config CRYPTO_CHACHA20_NEON - tristate "NEON accelerated ChaCha20 symmetric cipher" + tristate "NEON accelerated ChaCha20 stream cipher algorithms" depends on KERNEL_MODE_NEON select CRYPTO_BLKCIPHER select CRYPTO_CHACHA20 diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S index 50e7b9896818..2335e5055d2b 100644 --- a/arch/arm/crypto/chacha20-neon-core.S +++ b/arch/arm/crypto/chacha20-neon-core.S @@ -52,27 +52,16 @@ .fpu neon .align 5 -ENTRY(chacha20_block_xor_neon) - // r0: Input state matrix, s - // r1: 1 data block output, o - // r2: 1 data block input, i - - // - // This function encrypts one ChaCha20 block by loading the state matrix - // in four NEON registers. It performs matrix operation on four words in - // parallel, but requireds shuffling to rearrange the words after each - // round. - // - - // x0..3 = s0..3 - add ip, r0, #0x20 - vld1.32 {q0-q1}, [r0] - vld1.32 {q2-q3}, [ip] - - vmov q8, q0 - vmov q9, q1 - vmov q10, q2 - vmov q11, q3 +/* + * chacha20_permute - permute one block + * + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers q0-q3. It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + * + * Clobbers: r3, ip, q4-q5 + */ +chacha20_permute: adr ip, .Lrol8_table mov r3, #10 @@ -142,6 +131,27 @@ ENTRY(chacha20_block_xor_neon) subs r3, r3, #1 bne .Ldoubleround + bx lr +ENDPROC(chacha20_permute) + +ENTRY(chacha20_block_xor_neon) + // r0: Input state matrix, s + // r1: 1 data block output, o + // r2: 1 data block input, i + push {lr} + + // x0..3 = s0..3 + add ip, r0, #0x20 + vld1.32 {q0-q1}, [r0] + vld1.32 {q2-q3}, [ip] + + vmov q8, q0 + vmov q9, q1 + vmov q10, q2 + vmov q11, q3 + + bl chacha20_permute + add ip, r2, #0x20 vld1.8 {q4-q5}, [r2] vld1.8 {q6-q7}, [ip] @@ -166,9 +176,25 @@ ENTRY(chacha20_block_xor_neon) vst1.8 {q0-q1}, [r1] vst1.8 {q2-q3}, [ip] - bx lr + pop {pc} ENDPROC(chacha20_block_xor_neon) +ENTRY(hchacha20_block_neon) + // r0: Input state matrix, s + // r1: output (8 32-bit words) + push {lr} + + vld1.32 {q0-q1}, [r0]! + vld1.32 {q2-q3}, [r0] + + bl chacha20_permute + + vst1.32 {q0}, [r1]! + vst1.32 {q3}, [r1] + + pop {pc} +ENDPROC(hchacha20_block_neon) + .align 4 .Lctrinc: .word 0, 1, 2, 3 .Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6 diff --git a/arch/arm/crypto/chacha20-neon-glue.c b/arch/arm/crypto/chacha20-neon-glue.c index 2bc035cb8f23..f2d3b0f70a8d 100644 --- a/arch/arm/crypto/chacha20-neon-glue.c +++ b/arch/arm/crypto/chacha20-neon-glue.c @@ -1,5 +1,5 @@ /* - * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions + * ChaCha20 (RFC7539) and XChaCha20 stream ciphers, NEON accelerated * * Copyright (C) 2016 Linaro, Ltd. * @@ -30,6 +30,7 @@ asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); +asmlinkage void hchacha20_block_neon(const u32 *state, u32 *out); static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src, unsigned int bytes) @@ -57,20 +58,16 @@ static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src, } } -static int chacha20_neon(struct skcipher_request *req) +static int chacha20_neon_stream_xor(struct skcipher_request *req, + struct chacha_ctx *ctx, u8 *iv) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; u32 state[16]; int err; - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) - return crypto_chacha_crypt(req); - err = skcipher_walk_virt(&walk, req, false); - crypto_chacha_init(state, ctx, walk.iv); + crypto_chacha_init(state, ctx, iv); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; @@ -88,22 +85,73 @@ static int chacha20_neon(struct skcipher_request *req) return err; } -static struct skcipher_alg alg = { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 4 * CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = chacha20_neon, - .decrypt = chacha20_neon, +static int chacha20_neon(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + return crypto_chacha_crypt(req); + + return chacha20_neon_stream_xor(req, ctx, req->iv); +} + +static int xchacha20_neon(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 state[16]; + u8 real_iv[16]; + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + return crypto_xchacha_crypt(req); + + crypto_chacha_init(state, ctx, req->iv); + + kernel_neon_begin(); + hchacha20_block_neon(state, subctx.key); + kernel_neon_end(); + + memcpy(&real_iv[0], req->iv + 24, 8); + memcpy(&real_iv[8], req->iv + 16, 8); + return chacha20_neon_stream_xor(req, &subctx, real_iv); +} + +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = chacha20_neon, + .decrypt = chacha20_neon, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = xchacha20_neon, + .decrypt = xchacha20_neon, + } }; static int __init chacha20_simd_mod_init(void) @@ -111,12 +159,12 @@ static int __init chacha20_simd_mod_init(void) if (!(elf_hwcap & HWCAP_NEON)) return -ENODEV; - return crypto_register_skcipher(&alg); + return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); } static void __exit chacha20_simd_mod_fini(void) { - crypto_unregister_skcipher(&alg); + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } module_init(chacha20_simd_mod_init); @@ -125,3 +173,6 @@ module_exit(chacha20_simd_mod_fini); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-neon"); +MODULE_ALIAS_CRYPTO("xchacha20"); +MODULE_ALIAS_CRYPTO("xchacha20-neon"); -- cgit v1.2.3-59-g8ed1b From 3cc215198eac75cc4130729ddd94a5cdbdb4d300 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Nov 2018 17:26:25 -0800 Subject: crypto: arm/chacha20 - refactor to allow varying number of rounds In preparation for adding XChaCha12 support, rename/refactor the NEON implementation of ChaCha20 to support different numbers of rounds. Reviewed-by: Ard Biesheuvel Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm/crypto/Makefile | 4 +- arch/arm/crypto/chacha-neon-core.S | 560 +++++++++++++++++++++++++++++++++++ arch/arm/crypto/chacha-neon-glue.c | 182 ++++++++++++ arch/arm/crypto/chacha20-neon-core.S | 556 ---------------------------------- arch/arm/crypto/chacha20-neon-glue.c | 178 ----------- 5 files changed, 744 insertions(+), 736 deletions(-) create mode 100644 arch/arm/crypto/chacha-neon-core.S create mode 100644 arch/arm/crypto/chacha-neon-glue.c delete mode 100644 arch/arm/crypto/chacha20-neon-core.S delete mode 100644 arch/arm/crypto/chacha20-neon-glue.c diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index bd5bceef0605..005482ff9504 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o +obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o @@ -52,7 +52,7 @@ aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o -chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o +chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o ifdef REGENERATE_ARM_CRYPTO quiet_cmd_perl = PERL $@ diff --git a/arch/arm/crypto/chacha-neon-core.S b/arch/arm/crypto/chacha-neon-core.S new file mode 100644 index 000000000000..eb22926d4912 --- /dev/null +++ b/arch/arm/crypto/chacha-neon-core.S @@ -0,0 +1,560 @@ +/* + * ChaCha/XChaCha NEON helper functions + * + * Copyright (C) 2016 Linaro, Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + + /* + * NEON doesn't have a rotate instruction. The alternatives are, more or less: + * + * (a) vshl.u32 + vsri.u32 (needs temporary register) + * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register) + * (c) vrev32.16 (16-bit rotations only) + * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only, + * needs index vector) + * + * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations, + * the only choices are (a) and (b). We use (a) since it takes two-thirds the + * cycles of (b) on both Cortex-A7 and Cortex-A53. + * + * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest + * and doesn't need a temporary register. + * + * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence + * is twice as fast as (a), even when doing (a) on multiple registers + * simultaneously to eliminate the stall between vshl and vsri. Also, it + * parallelizes better when temporary registers are scarce. + * + * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as + * (a), so the need to load the rotation table actually makes the vtbl method + * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it + * seems to be a good compromise to get a more significant speed boost on some + * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7. + */ + +#include + + .text + .fpu neon + .align 5 + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers q0-q3. It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + * + * The round count is given in r3. + * + * Clobbers: r3, ip, q4-q5 + */ +chacha_permute: + + adr ip, .Lrol8_table + vld1.8 {d10}, [ip, :64] + +.Ldoubleround: + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vrev32.16 q3, q3 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #12 + vsri.u32 q1, q4, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vtbl.8 d6, {d6}, d10 + vtbl.8 d7, {d7}, d10 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #7 + vsri.u32 q1, q4, #25 + + // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vext.8 q1, q1, q1, #4 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vext.8 q2, q2, q2, #8 + // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vext.8 q3, q3, q3, #12 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vrev32.16 q3, q3 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #12 + vsri.u32 q1, q4, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vtbl.8 d6, {d6}, d10 + vtbl.8 d7, {d7}, d10 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #7 + vsri.u32 q1, q4, #25 + + // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vext.8 q1, q1, q1, #12 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vext.8 q2, q2, q2, #8 + // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vext.8 q3, q3, q3, #4 + + subs r3, r3, #2 + bne .Ldoubleround + + bx lr +ENDPROC(chacha_permute) + +ENTRY(chacha_block_xor_neon) + // r0: Input state matrix, s + // r1: 1 data block output, o + // r2: 1 data block input, i + // r3: nrounds + push {lr} + + // x0..3 = s0..3 + add ip, r0, #0x20 + vld1.32 {q0-q1}, [r0] + vld1.32 {q2-q3}, [ip] + + vmov q8, q0 + vmov q9, q1 + vmov q10, q2 + vmov q11, q3 + + bl chacha_permute + + add ip, r2, #0x20 + vld1.8 {q4-q5}, [r2] + vld1.8 {q6-q7}, [ip] + + // o0 = i0 ^ (x0 + s0) + vadd.i32 q0, q0, q8 + veor q0, q0, q4 + + // o1 = i1 ^ (x1 + s1) + vadd.i32 q1, q1, q9 + veor q1, q1, q5 + + // o2 = i2 ^ (x2 + s2) + vadd.i32 q2, q2, q10 + veor q2, q2, q6 + + // o3 = i3 ^ (x3 + s3) + vadd.i32 q3, q3, q11 + veor q3, q3, q7 + + add ip, r1, #0x20 + vst1.8 {q0-q1}, [r1] + vst1.8 {q2-q3}, [ip] + + pop {pc} +ENDPROC(chacha_block_xor_neon) + +ENTRY(hchacha_block_neon) + // r0: Input state matrix, s + // r1: output (8 32-bit words) + // r2: nrounds + push {lr} + + vld1.32 {q0-q1}, [r0]! + vld1.32 {q2-q3}, [r0] + + mov r3, r2 + bl chacha_permute + + vst1.32 {q0}, [r1]! + vst1.32 {q3}, [r1] + + pop {pc} +ENDPROC(hchacha_block_neon) + + .align 4 +.Lctrinc: .word 0, 1, 2, 3 +.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6 + + .align 5 +ENTRY(chacha_4block_xor_neon) + push {r4-r5} + mov r4, sp // preserve the stack pointer + sub ip, sp, #0x20 // allocate a 32 byte buffer + bic ip, ip, #0x1f // aligned to 32 bytes + mov sp, ip + + // r0: Input state matrix, s + // r1: 4 data blocks output, o + // r2: 4 data blocks input, i + // r3: nrounds + + // + // This function encrypts four consecutive ChaCha blocks by loading + // the state matrix in NEON registers four times. The algorithm performs + // each operation on the corresponding word of each state matrix, hence + // requires no word shuffling. The words are re-interleaved before the + // final addition of the original state and the XORing step. + // + + // x0..15[0-3] = s0..15[0-3] + add ip, r0, #0x20 + vld1.32 {q0-q1}, [r0] + vld1.32 {q2-q3}, [ip] + + adr r5, .Lctrinc + vdup.32 q15, d7[1] + vdup.32 q14, d7[0] + vld1.32 {q4}, [r5, :128] + vdup.32 q13, d6[1] + vdup.32 q12, d6[0] + vdup.32 q11, d5[1] + vdup.32 q10, d5[0] + vadd.u32 q12, q12, q4 // x12 += counter values 0-3 + vdup.32 q9, d4[1] + vdup.32 q8, d4[0] + vdup.32 q7, d3[1] + vdup.32 q6, d3[0] + vdup.32 q5, d2[1] + vdup.32 q4, d2[0] + vdup.32 q3, d1[1] + vdup.32 q2, d1[0] + vdup.32 q1, d0[1] + vdup.32 q0, d0[0] + + adr ip, .Lrol8_table + b 1f + +.Ldoubleround4: + vld1.32 {q8-q9}, [sp, :256] +1: + // x0 += x4, x12 = rotl32(x12 ^ x0, 16) + // x1 += x5, x13 = rotl32(x13 ^ x1, 16) + // x2 += x6, x14 = rotl32(x14 ^ x2, 16) + // x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vadd.i32 q0, q0, q4 + vadd.i32 q1, q1, q5 + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + + veor q12, q12, q0 + veor q13, q13, q1 + veor q14, q14, q2 + veor q15, q15, q3 + + vrev32.16 q12, q12 + vrev32.16 q13, q13 + vrev32.16 q14, q14 + vrev32.16 q15, q15 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 12) + // x9 += x13, x5 = rotl32(x5 ^ x9, 12) + // x10 += x14, x6 = rotl32(x6 ^ x10, 12) + // x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vadd.i32 q10, q10, q14 + vadd.i32 q11, q11, q15 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q4, q8 + veor q9, q5, q9 + vshl.u32 q4, q8, #12 + vshl.u32 q5, q9, #12 + vsri.u32 q4, q8, #20 + vsri.u32 q5, q9, #20 + + veor q8, q6, q10 + veor q9, q7, q11 + vshl.u32 q6, q8, #12 + vshl.u32 q7, q9, #12 + vsri.u32 q6, q8, #20 + vsri.u32 q7, q9, #20 + + // x0 += x4, x12 = rotl32(x12 ^ x0, 8) + // x1 += x5, x13 = rotl32(x13 ^ x1, 8) + // x2 += x6, x14 = rotl32(x14 ^ x2, 8) + // x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vld1.8 {d16}, [ip, :64] + vadd.i32 q0, q0, q4 + vadd.i32 q1, q1, q5 + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + + veor q12, q12, q0 + veor q13, q13, q1 + veor q14, q14, q2 + veor q15, q15, q3 + + vtbl.8 d24, {d24}, d16 + vtbl.8 d25, {d25}, d16 + vtbl.8 d26, {d26}, d16 + vtbl.8 d27, {d27}, d16 + vtbl.8 d28, {d28}, d16 + vtbl.8 d29, {d29}, d16 + vtbl.8 d30, {d30}, d16 + vtbl.8 d31, {d31}, d16 + + vld1.32 {q8-q9}, [sp, :256] + + // x8 += x12, x4 = rotl32(x4 ^ x8, 7) + // x9 += x13, x5 = rotl32(x5 ^ x9, 7) + // x10 += x14, x6 = rotl32(x6 ^ x10, 7) + // x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vadd.i32 q10, q10, q14 + vadd.i32 q11, q11, q15 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q4, q8 + veor q9, q5, q9 + vshl.u32 q4, q8, #7 + vshl.u32 q5, q9, #7 + vsri.u32 q4, q8, #25 + vsri.u32 q5, q9, #25 + + veor q8, q6, q10 + veor q9, q7, q11 + vshl.u32 q6, q8, #7 + vshl.u32 q7, q9, #7 + vsri.u32 q6, q8, #25 + vsri.u32 q7, q9, #25 + + vld1.32 {q8-q9}, [sp, :256] + + // x0 += x5, x15 = rotl32(x15 ^ x0, 16) + // x1 += x6, x12 = rotl32(x12 ^ x1, 16) + // x2 += x7, x13 = rotl32(x13 ^ x2, 16) + // x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vadd.i32 q0, q0, q5 + vadd.i32 q1, q1, q6 + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q4 + + veor q15, q15, q0 + veor q12, q12, q1 + veor q13, q13, q2 + veor q14, q14, q3 + + vrev32.16 q15, q15 + vrev32.16 q12, q12 + vrev32.16 q13, q13 + vrev32.16 q14, q14 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 12) + // x11 += x12, x6 = rotl32(x6 ^ x11, 12) + // x8 += x13, x7 = rotl32(x7 ^ x8, 12) + // x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vadd.i32 q10, q10, q15 + vadd.i32 q11, q11, q12 + vadd.i32 q8, q8, q13 + vadd.i32 q9, q9, q14 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q7, q8 + veor q9, q4, q9 + vshl.u32 q7, q8, #12 + vshl.u32 q4, q9, #12 + vsri.u32 q7, q8, #20 + vsri.u32 q4, q9, #20 + + veor q8, q5, q10 + veor q9, q6, q11 + vshl.u32 q5, q8, #12 + vshl.u32 q6, q9, #12 + vsri.u32 q5, q8, #20 + vsri.u32 q6, q9, #20 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 8) + // x1 += x6, x12 = rotl32(x12 ^ x1, 8) + // x2 += x7, x13 = rotl32(x13 ^ x2, 8) + // x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vld1.8 {d16}, [ip, :64] + vadd.i32 q0, q0, q5 + vadd.i32 q1, q1, q6 + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q4 + + veor q15, q15, q0 + veor q12, q12, q1 + veor q13, q13, q2 + veor q14, q14, q3 + + vtbl.8 d30, {d30}, d16 + vtbl.8 d31, {d31}, d16 + vtbl.8 d24, {d24}, d16 + vtbl.8 d25, {d25}, d16 + vtbl.8 d26, {d26}, d16 + vtbl.8 d27, {d27}, d16 + vtbl.8 d28, {d28}, d16 + vtbl.8 d29, {d29}, d16 + + vld1.32 {q8-q9}, [sp, :256] + + // x10 += x15, x5 = rotl32(x5 ^ x10, 7) + // x11 += x12, x6 = rotl32(x6 ^ x11, 7) + // x8 += x13, x7 = rotl32(x7 ^ x8, 7) + // x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vadd.i32 q10, q10, q15 + vadd.i32 q11, q11, q12 + vadd.i32 q8, q8, q13 + vadd.i32 q9, q9, q14 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q7, q8 + veor q9, q4, q9 + vshl.u32 q7, q8, #7 + vshl.u32 q4, q9, #7 + vsri.u32 q7, q8, #25 + vsri.u32 q4, q9, #25 + + veor q8, q5, q10 + veor q9, q6, q11 + vshl.u32 q5, q8, #7 + vshl.u32 q6, q9, #7 + vsri.u32 q5, q8, #25 + vsri.u32 q6, q9, #25 + + subs r3, r3, #2 + bne .Ldoubleround4 + + // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15. + // x8..9[0-3] are on the stack. + + // Re-interleave the words in the first two rows of each block (x0..7). + // Also add the counter values 0-3 to x12[0-3]. + vld1.32 {q8}, [r5, :128] // load counter values 0-3 + vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1) + vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3) + vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5) + vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7) + vadd.u32 q12, q8 // x12 += counter values 0-3 + vswp d1, d4 + vswp d3, d6 + vld1.32 {q8-q9}, [r0]! // load s0..7 + vswp d9, d12 + vswp d11, d14 + + // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1) + // after XORing the first 32 bytes. + vswp q1, q4 + + // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7) + + // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block) + vadd.u32 q0, q0, q8 + vadd.u32 q2, q2, q8 + vadd.u32 q4, q4, q8 + vadd.u32 q3, q3, q8 + + // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block) + vadd.u32 q1, q1, q9 + vadd.u32 q6, q6, q9 + vadd.u32 q5, q5, q9 + vadd.u32 q7, q7, q9 + + // XOR first 32 bytes using keystream from first two rows of first block + vld1.8 {q8-q9}, [r2]! + veor q8, q8, q0 + veor q9, q9, q1 + vst1.8 {q8-q9}, [r1]! + + // Re-interleave the words in the last two rows of each block (x8..15). + vld1.32 {q8-q9}, [sp, :256] + vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13) + vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15) + vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9) + vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11) + vld1.32 {q0-q1}, [r0] // load s8..15 + vswp d25, d28 + vswp d27, d30 + vswp d17, d20 + vswp d19, d22 + + // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15) + + // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block) + vadd.u32 q8, q8, q0 + vadd.u32 q10, q10, q0 + vadd.u32 q9, q9, q0 + vadd.u32 q11, q11, q0 + + // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block) + vadd.u32 q12, q12, q1 + vadd.u32 q14, q14, q1 + vadd.u32 q13, q13, q1 + vadd.u32 q15, q15, q1 + + // XOR the rest of the data with the keystream + + vld1.8 {q0-q1}, [r2]! + veor q0, q0, q8 + veor q1, q1, q12 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + veor q0, q0, q2 + veor q1, q1, q6 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + veor q0, q0, q10 + veor q1, q1, q14 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + veor q0, q0, q4 + veor q1, q1, q5 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + veor q0, q0, q9 + veor q1, q1, q13 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + veor q0, q0, q3 + veor q1, q1, q7 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2] + mov sp, r4 // restore original stack pointer + veor q0, q0, q11 + veor q1, q1, q15 + vst1.8 {q0-q1}, [r1] + + pop {r4-r5} + bx lr +ENDPROC(chacha_4block_xor_neon) diff --git a/arch/arm/crypto/chacha-neon-glue.c b/arch/arm/crypto/chacha-neon-glue.c new file mode 100644 index 000000000000..385557d38634 --- /dev/null +++ b/arch/arm/crypto/chacha-neon-glue.c @@ -0,0 +1,182 @@ +/* + * ChaCha20 (RFC7539) and XChaCha20 stream ciphers, NEON accelerated + * + * Copyright (C) 2016 Linaro, Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src, + int nrounds); +asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src, + int nrounds); +asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); + +static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + u8 buf[CHACHA_BLOCK_SIZE]; + + while (bytes >= CHACHA_BLOCK_SIZE * 4) { + chacha_4block_xor_neon(state, dst, src, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 4; + src += CHACHA_BLOCK_SIZE * 4; + dst += CHACHA_BLOCK_SIZE * 4; + state[12] += 4; + } + while (bytes >= CHACHA_BLOCK_SIZE) { + chacha_block_xor_neon(state, dst, src, nrounds); + bytes -= CHACHA_BLOCK_SIZE; + src += CHACHA_BLOCK_SIZE; + dst += CHACHA_BLOCK_SIZE; + state[12]++; + } + if (bytes) { + memcpy(buf, src, bytes); + chacha_block_xor_neon(state, buf, buf, nrounds); + memcpy(dst, buf, bytes); + } +} + +static int chacha_neon_stream_xor(struct skcipher_request *req, + struct chacha_ctx *ctx, u8 *iv) +{ + struct skcipher_walk walk; + u32 state[16]; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + crypto_chacha_init(state, ctx, iv); + + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); + + kernel_neon_begin(); + chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr, + nbytes, ctx->nrounds); + kernel_neon_end(); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + return err; +} + +static int chacha_neon(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + return crypto_chacha_crypt(req); + + return chacha_neon_stream_xor(req, ctx, req->iv); +} + +static int xchacha_neon(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 state[16]; + u8 real_iv[16]; + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + return crypto_xchacha_crypt(req); + + crypto_chacha_init(state, ctx, req->iv); + + kernel_neon_begin(); + hchacha_block_neon(state, subctx.key, ctx->nrounds); + kernel_neon_end(); + subctx.nrounds = ctx->nrounds; + + memcpy(&real_iv[0], req->iv + 24, 8); + memcpy(&real_iv[8], req->iv + 16, 8); + return chacha_neon_stream_xor(req, &subctx, real_iv); +} + +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = chacha_neon, + .decrypt = chacha_neon, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = xchacha_neon, + .decrypt = xchacha_neon, + } +}; + +static int __init chacha_simd_mod_init(void) +{ + if (!(elf_hwcap & HWCAP_NEON)) + return -ENODEV; + + return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); +} + +static void __exit chacha_simd_mod_fini(void) +{ + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); +} + +module_init(chacha_simd_mod_init); +module_exit(chacha_simd_mod_fini); + +MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)"); +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-neon"); +MODULE_ALIAS_CRYPTO("xchacha20"); +MODULE_ALIAS_CRYPTO("xchacha20-neon"); diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S deleted file mode 100644 index 2335e5055d2b..000000000000 --- a/arch/arm/crypto/chacha20-neon-core.S +++ /dev/null @@ -1,556 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions - * - * Copyright (C) 2016 Linaro, Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - - /* - * NEON doesn't have a rotate instruction. The alternatives are, more or less: - * - * (a) vshl.u32 + vsri.u32 (needs temporary register) - * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register) - * (c) vrev32.16 (16-bit rotations only) - * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only, - * needs index vector) - * - * ChaCha20 has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit - * rotations, the only choices are (a) and (b). We use (a) since it takes - * two-thirds the cycles of (b) on both Cortex-A7 and Cortex-A53. - * - * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest - * and doesn't need a temporary register. - * - * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence - * is twice as fast as (a), even when doing (a) on multiple registers - * simultaneously to eliminate the stall between vshl and vsri. Also, it - * parallelizes better when temporary registers are scarce. - * - * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as - * (a), so the need to load the rotation table actually makes the vtbl method - * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it - * seems to be a good compromise to get a more significant speed boost on some - * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7. - */ - -#include - - .text - .fpu neon - .align 5 - -/* - * chacha20_permute - permute one block - * - * Permute one 64-byte block where the state matrix is stored in the four NEON - * registers q0-q3. It performs matrix operations on four words in parallel, - * but requires shuffling to rearrange the words after each round. - * - * Clobbers: r3, ip, q4-q5 - */ -chacha20_permute: - - adr ip, .Lrol8_table - mov r3, #10 - vld1.8 {d10}, [ip, :64] - -.Ldoubleround: - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vadd.i32 q0, q0, q1 - veor q3, q3, q0 - vrev32.16 q3, q3 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #12 - vsri.u32 q1, q4, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vadd.i32 q0, q0, q1 - veor q3, q3, q0 - vtbl.8 d6, {d6}, d10 - vtbl.8 d7, {d7}, d10 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #7 - vsri.u32 q1, q4, #25 - - // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vext.8 q1, q1, q1, #4 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vext.8 q2, q2, q2, #8 - // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vext.8 q3, q3, q3, #12 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vadd.i32 q0, q0, q1 - veor q3, q3, q0 - vrev32.16 q3, q3 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #12 - vsri.u32 q1, q4, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vadd.i32 q0, q0, q1 - veor q3, q3, q0 - vtbl.8 d6, {d6}, d10 - vtbl.8 d7, {d7}, d10 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #7 - vsri.u32 q1, q4, #25 - - // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vext.8 q1, q1, q1, #12 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vext.8 q2, q2, q2, #8 - // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vext.8 q3, q3, q3, #4 - - subs r3, r3, #1 - bne .Ldoubleround - - bx lr -ENDPROC(chacha20_permute) - -ENTRY(chacha20_block_xor_neon) - // r0: Input state matrix, s - // r1: 1 data block output, o - // r2: 1 data block input, i - push {lr} - - // x0..3 = s0..3 - add ip, r0, #0x20 - vld1.32 {q0-q1}, [r0] - vld1.32 {q2-q3}, [ip] - - vmov q8, q0 - vmov q9, q1 - vmov q10, q2 - vmov q11, q3 - - bl chacha20_permute - - add ip, r2, #0x20 - vld1.8 {q4-q5}, [r2] - vld1.8 {q6-q7}, [ip] - - // o0 = i0 ^ (x0 + s0) - vadd.i32 q0, q0, q8 - veor q0, q0, q4 - - // o1 = i1 ^ (x1 + s1) - vadd.i32 q1, q1, q9 - veor q1, q1, q5 - - // o2 = i2 ^ (x2 + s2) - vadd.i32 q2, q2, q10 - veor q2, q2, q6 - - // o3 = i3 ^ (x3 + s3) - vadd.i32 q3, q3, q11 - veor q3, q3, q7 - - add ip, r1, #0x20 - vst1.8 {q0-q1}, [r1] - vst1.8 {q2-q3}, [ip] - - pop {pc} -ENDPROC(chacha20_block_xor_neon) - -ENTRY(hchacha20_block_neon) - // r0: Input state matrix, s - // r1: output (8 32-bit words) - push {lr} - - vld1.32 {q0-q1}, [r0]! - vld1.32 {q2-q3}, [r0] - - bl chacha20_permute - - vst1.32 {q0}, [r1]! - vst1.32 {q3}, [r1] - - pop {pc} -ENDPROC(hchacha20_block_neon) - - .align 4 -.Lctrinc: .word 0, 1, 2, 3 -.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6 - - .align 5 -ENTRY(chacha20_4block_xor_neon) - push {r4-r5} - mov r4, sp // preserve the stack pointer - sub ip, sp, #0x20 // allocate a 32 byte buffer - bic ip, ip, #0x1f // aligned to 32 bytes - mov sp, ip - - // r0: Input state matrix, s - // r1: 4 data blocks output, o - // r2: 4 data blocks input, i - - // - // This function encrypts four consecutive ChaCha20 blocks by loading - // the state matrix in NEON registers four times. The algorithm performs - // each operation on the corresponding word of each state matrix, hence - // requires no word shuffling. The words are re-interleaved before the - // final addition of the original state and the XORing step. - // - - // x0..15[0-3] = s0..15[0-3] - add ip, r0, #0x20 - vld1.32 {q0-q1}, [r0] - vld1.32 {q2-q3}, [ip] - - adr r5, .Lctrinc - vdup.32 q15, d7[1] - vdup.32 q14, d7[0] - vld1.32 {q4}, [r5, :128] - vdup.32 q13, d6[1] - vdup.32 q12, d6[0] - vdup.32 q11, d5[1] - vdup.32 q10, d5[0] - vadd.u32 q12, q12, q4 // x12 += counter values 0-3 - vdup.32 q9, d4[1] - vdup.32 q8, d4[0] - vdup.32 q7, d3[1] - vdup.32 q6, d3[0] - vdup.32 q5, d2[1] - vdup.32 q4, d2[0] - vdup.32 q3, d1[1] - vdup.32 q2, d1[0] - vdup.32 q1, d0[1] - vdup.32 q0, d0[0] - - adr ip, .Lrol8_table - mov r3, #10 - b 1f - -.Ldoubleround4: - vld1.32 {q8-q9}, [sp, :256] -1: - // x0 += x4, x12 = rotl32(x12 ^ x0, 16) - // x1 += x5, x13 = rotl32(x13 ^ x1, 16) - // x2 += x6, x14 = rotl32(x14 ^ x2, 16) - // x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vadd.i32 q0, q0, q4 - vadd.i32 q1, q1, q5 - vadd.i32 q2, q2, q6 - vadd.i32 q3, q3, q7 - - veor q12, q12, q0 - veor q13, q13, q1 - veor q14, q14, q2 - veor q15, q15, q3 - - vrev32.16 q12, q12 - vrev32.16 q13, q13 - vrev32.16 q14, q14 - vrev32.16 q15, q15 - - // x8 += x12, x4 = rotl32(x4 ^ x8, 12) - // x9 += x13, x5 = rotl32(x5 ^ x9, 12) - // x10 += x14, x6 = rotl32(x6 ^ x10, 12) - // x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vadd.i32 q8, q8, q12 - vadd.i32 q9, q9, q13 - vadd.i32 q10, q10, q14 - vadd.i32 q11, q11, q15 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q4, q8 - veor q9, q5, q9 - vshl.u32 q4, q8, #12 - vshl.u32 q5, q9, #12 - vsri.u32 q4, q8, #20 - vsri.u32 q5, q9, #20 - - veor q8, q6, q10 - veor q9, q7, q11 - vshl.u32 q6, q8, #12 - vshl.u32 q7, q9, #12 - vsri.u32 q6, q8, #20 - vsri.u32 q7, q9, #20 - - // x0 += x4, x12 = rotl32(x12 ^ x0, 8) - // x1 += x5, x13 = rotl32(x13 ^ x1, 8) - // x2 += x6, x14 = rotl32(x14 ^ x2, 8) - // x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vld1.8 {d16}, [ip, :64] - vadd.i32 q0, q0, q4 - vadd.i32 q1, q1, q5 - vadd.i32 q2, q2, q6 - vadd.i32 q3, q3, q7 - - veor q12, q12, q0 - veor q13, q13, q1 - veor q14, q14, q2 - veor q15, q15, q3 - - vtbl.8 d24, {d24}, d16 - vtbl.8 d25, {d25}, d16 - vtbl.8 d26, {d26}, d16 - vtbl.8 d27, {d27}, d16 - vtbl.8 d28, {d28}, d16 - vtbl.8 d29, {d29}, d16 - vtbl.8 d30, {d30}, d16 - vtbl.8 d31, {d31}, d16 - - vld1.32 {q8-q9}, [sp, :256] - - // x8 += x12, x4 = rotl32(x4 ^ x8, 7) - // x9 += x13, x5 = rotl32(x5 ^ x9, 7) - // x10 += x14, x6 = rotl32(x6 ^ x10, 7) - // x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vadd.i32 q8, q8, q12 - vadd.i32 q9, q9, q13 - vadd.i32 q10, q10, q14 - vadd.i32 q11, q11, q15 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q4, q8 - veor q9, q5, q9 - vshl.u32 q4, q8, #7 - vshl.u32 q5, q9, #7 - vsri.u32 q4, q8, #25 - vsri.u32 q5, q9, #25 - - veor q8, q6, q10 - veor q9, q7, q11 - vshl.u32 q6, q8, #7 - vshl.u32 q7, q9, #7 - vsri.u32 q6, q8, #25 - vsri.u32 q7, q9, #25 - - vld1.32 {q8-q9}, [sp, :256] - - // x0 += x5, x15 = rotl32(x15 ^ x0, 16) - // x1 += x6, x12 = rotl32(x12 ^ x1, 16) - // x2 += x7, x13 = rotl32(x13 ^ x2, 16) - // x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vadd.i32 q0, q0, q5 - vadd.i32 q1, q1, q6 - vadd.i32 q2, q2, q7 - vadd.i32 q3, q3, q4 - - veor q15, q15, q0 - veor q12, q12, q1 - veor q13, q13, q2 - veor q14, q14, q3 - - vrev32.16 q15, q15 - vrev32.16 q12, q12 - vrev32.16 q13, q13 - vrev32.16 q14, q14 - - // x10 += x15, x5 = rotl32(x5 ^ x10, 12) - // x11 += x12, x6 = rotl32(x6 ^ x11, 12) - // x8 += x13, x7 = rotl32(x7 ^ x8, 12) - // x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vadd.i32 q10, q10, q15 - vadd.i32 q11, q11, q12 - vadd.i32 q8, q8, q13 - vadd.i32 q9, q9, q14 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q7, q8 - veor q9, q4, q9 - vshl.u32 q7, q8, #12 - vshl.u32 q4, q9, #12 - vsri.u32 q7, q8, #20 - vsri.u32 q4, q9, #20 - - veor q8, q5, q10 - veor q9, q6, q11 - vshl.u32 q5, q8, #12 - vshl.u32 q6, q9, #12 - vsri.u32 q5, q8, #20 - vsri.u32 q6, q9, #20 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 8) - // x1 += x6, x12 = rotl32(x12 ^ x1, 8) - // x2 += x7, x13 = rotl32(x13 ^ x2, 8) - // x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vld1.8 {d16}, [ip, :64] - vadd.i32 q0, q0, q5 - vadd.i32 q1, q1, q6 - vadd.i32 q2, q2, q7 - vadd.i32 q3, q3, q4 - - veor q15, q15, q0 - veor q12, q12, q1 - veor q13, q13, q2 - veor q14, q14, q3 - - vtbl.8 d30, {d30}, d16 - vtbl.8 d31, {d31}, d16 - vtbl.8 d24, {d24}, d16 - vtbl.8 d25, {d25}, d16 - vtbl.8 d26, {d26}, d16 - vtbl.8 d27, {d27}, d16 - vtbl.8 d28, {d28}, d16 - vtbl.8 d29, {d29}, d16 - - vld1.32 {q8-q9}, [sp, :256] - - // x10 += x15, x5 = rotl32(x5 ^ x10, 7) - // x11 += x12, x6 = rotl32(x6 ^ x11, 7) - // x8 += x13, x7 = rotl32(x7 ^ x8, 7) - // x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vadd.i32 q10, q10, q15 - vadd.i32 q11, q11, q12 - vadd.i32 q8, q8, q13 - vadd.i32 q9, q9, q14 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q7, q8 - veor q9, q4, q9 - vshl.u32 q7, q8, #7 - vshl.u32 q4, q9, #7 - vsri.u32 q7, q8, #25 - vsri.u32 q4, q9, #25 - - veor q8, q5, q10 - veor q9, q6, q11 - vshl.u32 q5, q8, #7 - vshl.u32 q6, q9, #7 - vsri.u32 q5, q8, #25 - vsri.u32 q6, q9, #25 - - subs r3, r3, #1 - bne .Ldoubleround4 - - // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15. - // x8..9[0-3] are on the stack. - - // Re-interleave the words in the first two rows of each block (x0..7). - // Also add the counter values 0-3 to x12[0-3]. - vld1.32 {q8}, [r5, :128] // load counter values 0-3 - vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1) - vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3) - vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5) - vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7) - vadd.u32 q12, q8 // x12 += counter values 0-3 - vswp d1, d4 - vswp d3, d6 - vld1.32 {q8-q9}, [r0]! // load s0..7 - vswp d9, d12 - vswp d11, d14 - - // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1) - // after XORing the first 32 bytes. - vswp q1, q4 - - // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7) - - // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block) - vadd.u32 q0, q0, q8 - vadd.u32 q2, q2, q8 - vadd.u32 q4, q4, q8 - vadd.u32 q3, q3, q8 - - // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block) - vadd.u32 q1, q1, q9 - vadd.u32 q6, q6, q9 - vadd.u32 q5, q5, q9 - vadd.u32 q7, q7, q9 - - // XOR first 32 bytes using keystream from first two rows of first block - vld1.8 {q8-q9}, [r2]! - veor q8, q8, q0 - veor q9, q9, q1 - vst1.8 {q8-q9}, [r1]! - - // Re-interleave the words in the last two rows of each block (x8..15). - vld1.32 {q8-q9}, [sp, :256] - vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13) - vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15) - vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9) - vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11) - vld1.32 {q0-q1}, [r0] // load s8..15 - vswp d25, d28 - vswp d27, d30 - vswp d17, d20 - vswp d19, d22 - - // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15) - - // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block) - vadd.u32 q8, q8, q0 - vadd.u32 q10, q10, q0 - vadd.u32 q9, q9, q0 - vadd.u32 q11, q11, q0 - - // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block) - vadd.u32 q12, q12, q1 - vadd.u32 q14, q14, q1 - vadd.u32 q13, q13, q1 - vadd.u32 q15, q15, q1 - - // XOR the rest of the data with the keystream - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q8 - veor q1, q1, q12 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q2 - veor q1, q1, q6 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q10 - veor q1, q1, q14 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q4 - veor q1, q1, q5 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q9 - veor q1, q1, q13 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q3 - veor q1, q1, q7 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2] - mov sp, r4 // restore original stack pointer - veor q0, q0, q11 - veor q1, q1, q15 - vst1.8 {q0-q1}, [r1] - - pop {r4-r5} - bx lr -ENDPROC(chacha20_4block_xor_neon) diff --git a/arch/arm/crypto/chacha20-neon-glue.c b/arch/arm/crypto/chacha20-neon-glue.c deleted file mode 100644 index f2d3b0f70a8d..000000000000 --- a/arch/arm/crypto/chacha20-neon-glue.c +++ /dev/null @@ -1,178 +0,0 @@ -/* - * ChaCha20 (RFC7539) and XChaCha20 stream ciphers, NEON accelerated - * - * Copyright (C) 2016 Linaro, Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); -asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); -asmlinkage void hchacha20_block_neon(const u32 *state, u32 *out); - -static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes) -{ - u8 buf[CHACHA_BLOCK_SIZE]; - - while (bytes >= CHACHA_BLOCK_SIZE * 4) { - chacha20_4block_xor_neon(state, dst, src); - bytes -= CHACHA_BLOCK_SIZE * 4; - src += CHACHA_BLOCK_SIZE * 4; - dst += CHACHA_BLOCK_SIZE * 4; - state[12] += 4; - } - while (bytes >= CHACHA_BLOCK_SIZE) { - chacha20_block_xor_neon(state, dst, src); - bytes -= CHACHA_BLOCK_SIZE; - src += CHACHA_BLOCK_SIZE; - dst += CHACHA_BLOCK_SIZE; - state[12]++; - } - if (bytes) { - memcpy(buf, src, bytes); - chacha20_block_xor_neon(state, buf, buf); - memcpy(dst, buf, bytes); - } -} - -static int chacha20_neon_stream_xor(struct skcipher_request *req, - struct chacha_ctx *ctx, u8 *iv) -{ - struct skcipher_walk walk; - u32 state[16]; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - crypto_chacha_init(state, ctx, iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - kernel_neon_begin(); - chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr, - nbytes); - kernel_neon_end(); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - - return err; -} - -static int chacha20_neon(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) - return crypto_chacha_crypt(req); - - return chacha20_neon_stream_xor(req, ctx, req->iv); -} - -static int xchacha20_neon(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - struct chacha_ctx subctx; - u32 state[16]; - u8 real_iv[16]; - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) - return crypto_xchacha_crypt(req); - - crypto_chacha_init(state, ctx, req->iv); - - kernel_neon_begin(); - hchacha20_block_neon(state, subctx.key); - kernel_neon_end(); - - memcpy(&real_iv[0], req->iv + 24, 8); - memcpy(&real_iv[8], req->iv + 16, 8); - return chacha20_neon_stream_xor(req, &subctx, real_iv); -} - -static struct skcipher_alg algs[] = { - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 4 * CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = chacha20_neon, - .decrypt = chacha20_neon, - }, { - .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 4 * CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = xchacha20_neon, - .decrypt = xchacha20_neon, - } -}; - -static int __init chacha20_simd_mod_init(void) -{ - if (!(elf_hwcap & HWCAP_NEON)) - return -ENODEV; - - return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); -} - -static void __exit chacha20_simd_mod_fini(void) -{ - crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); -} - -module_init(chacha20_simd_mod_init); -module_exit(chacha20_simd_mod_fini); - -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("chacha20"); -MODULE_ALIAS_CRYPTO("chacha20-neon"); -MODULE_ALIAS_CRYPTO("xchacha20"); -MODULE_ALIAS_CRYPTO("xchacha20-neon"); -- cgit v1.2.3-59-g8ed1b From bdb063a79f6da589af1de3f10a7c8f654fba9ae8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Nov 2018 17:26:26 -0800 Subject: crypto: arm/chacha - add XChaCha12 support Now that the 32-bit ARM NEON implementation of ChaCha20 and XChaCha20 has been refactored to support varying the number of rounds, add support for XChaCha12. This is identical to XChaCha20 except for the number of rounds, which is 12 instead of 20. XChaCha12 is faster than XChaCha20 but has a lower security margin, though still greater than AES-256's since the best known attacks make it through only 7 rounds. See the patch "crypto: chacha - add XChaCha12 support" for more details about why we need XChaCha12 support. Reviewed-by: Ard Biesheuvel Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm/crypto/Kconfig | 2 +- arch/arm/crypto/chacha-neon-glue.c | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index a08759c32cb9..59c674cf08ef 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -126,7 +126,7 @@ config CRYPTO_CRC32_ARM_CE select CRYPTO_HASH config CRYPTO_CHACHA20_NEON - tristate "NEON accelerated ChaCha20 stream cipher algorithms" + tristate "NEON accelerated ChaCha stream cipher algorithms" depends on KERNEL_MODE_NEON select CRYPTO_BLKCIPHER select CRYPTO_CHACHA20 diff --git a/arch/arm/crypto/chacha-neon-glue.c b/arch/arm/crypto/chacha-neon-glue.c index 385557d38634..9d6fda81986d 100644 --- a/arch/arm/crypto/chacha-neon-glue.c +++ b/arch/arm/crypto/chacha-neon-glue.c @@ -1,5 +1,6 @@ /* - * ChaCha20 (RFC7539) and XChaCha20 stream ciphers, NEON accelerated + * ARM NEON accelerated ChaCha and XChaCha stream ciphers, + * including ChaCha20 (RFC7539) * * Copyright (C) 2016 Linaro, Ltd. * @@ -154,6 +155,22 @@ static struct skcipher_alg algs[] = { .setkey = crypto_chacha20_setkey, .encrypt = xchacha_neon, .decrypt = xchacha_neon, + }, { + .base.cra_name = "xchacha12", + .base.cra_driver_name = "xchacha12-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha12_setkey, + .encrypt = xchacha_neon, + .decrypt = xchacha_neon, } }; @@ -180,3 +197,5 @@ MODULE_ALIAS_CRYPTO("chacha20"); MODULE_ALIAS_CRYPTO("chacha20-neon"); MODULE_ALIAS_CRYPTO("xchacha20"); MODULE_ALIAS_CRYPTO("xchacha20-neon"); +MODULE_ALIAS_CRYPTO("xchacha12"); +MODULE_ALIAS_CRYPTO("xchacha12-neon"); -- cgit v1.2.3-59-g8ed1b From 878afc35cd28bcd93cd3c5e1985ef39a104a4d45 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Nov 2018 17:26:27 -0800 Subject: crypto: poly1305 - use structures for key and accumulator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for exposing a low-level Poly1305 API which implements the ε-almost-∆-universal (εA∆U) hash function underlying the Poly1305 MAC and supports block-aligned inputs only, create structures poly1305_key and poly1305_state which hold the limbs of the Poly1305 "r" key and accumulator, respectively. These structures could actually have the same type (e.g. poly1305_val), but different types are preferable, to prevent misuse. Acked-by: Martin Willi Signed-off-by: Eric Biggers Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/poly1305_glue.c | 20 +++++++++------- crypto/poly1305_generic.c | 52 ++++++++++++++++++++--------------------- include/crypto/poly1305.h | 12 ++++++++-- 3 files changed, 47 insertions(+), 37 deletions(-) diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index f012b7e28ad1..88cc01506c84 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -83,35 +83,37 @@ static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) { if (unlikely(!sctx->wset)) { if (!sctx->uset) { - memcpy(sctx->u, dctx->r, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u, dctx->r); + memcpy(sctx->u, dctx->r.r, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u, dctx->r.r); sctx->uset = true; } memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u + 5, dctx->r); + poly1305_simd_mult(sctx->u + 5, dctx->r.r); memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u + 10, dctx->r); + poly1305_simd_mult(sctx->u + 10, dctx->r.r); sctx->wset = true; } blocks = srclen / (POLY1305_BLOCK_SIZE * 4); - poly1305_4block_avx2(dctx->h, src, dctx->r, blocks, sctx->u); + poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks, + sctx->u); src += POLY1305_BLOCK_SIZE * 4 * blocks; srclen -= POLY1305_BLOCK_SIZE * 4 * blocks; } #endif if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) { if (unlikely(!sctx->uset)) { - memcpy(sctx->u, dctx->r, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u, dctx->r); + memcpy(sctx->u, dctx->r.r, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u, dctx->r.r); sctx->uset = true; } blocks = srclen / (POLY1305_BLOCK_SIZE * 2); - poly1305_2block_sse2(dctx->h, src, dctx->r, blocks, sctx->u); + poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks, + sctx->u); src += POLY1305_BLOCK_SIZE * 2 * blocks; srclen -= POLY1305_BLOCK_SIZE * 2 * blocks; } if (srclen >= POLY1305_BLOCK_SIZE) { - poly1305_block_sse2(dctx->h, src, dctx->r, 1); + poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1); srclen -= POLY1305_BLOCK_SIZE; } return srclen; diff --git a/crypto/poly1305_generic.c b/crypto/poly1305_generic.c index 47d3a6b83931..a23173f351b7 100644 --- a/crypto/poly1305_generic.c +++ b/crypto/poly1305_generic.c @@ -38,7 +38,7 @@ int crypto_poly1305_init(struct shash_desc *desc) { struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - memset(dctx->h, 0, sizeof(dctx->h)); + memset(dctx->h.h, 0, sizeof(dctx->h.h)); dctx->buflen = 0; dctx->rset = false; dctx->sset = false; @@ -50,11 +50,11 @@ EXPORT_SYMBOL_GPL(crypto_poly1305_init); static void poly1305_setrkey(struct poly1305_desc_ctx *dctx, const u8 *key) { /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ - dctx->r[0] = (get_unaligned_le32(key + 0) >> 0) & 0x3ffffff; - dctx->r[1] = (get_unaligned_le32(key + 3) >> 2) & 0x3ffff03; - dctx->r[2] = (get_unaligned_le32(key + 6) >> 4) & 0x3ffc0ff; - dctx->r[3] = (get_unaligned_le32(key + 9) >> 6) & 0x3f03fff; - dctx->r[4] = (get_unaligned_le32(key + 12) >> 8) & 0x00fffff; + dctx->r.r[0] = (get_unaligned_le32(key + 0) >> 0) & 0x3ffffff; + dctx->r.r[1] = (get_unaligned_le32(key + 3) >> 2) & 0x3ffff03; + dctx->r.r[2] = (get_unaligned_le32(key + 6) >> 4) & 0x3ffc0ff; + dctx->r.r[3] = (get_unaligned_le32(key + 9) >> 6) & 0x3f03fff; + dctx->r.r[4] = (get_unaligned_le32(key + 12) >> 8) & 0x00fffff; } static void poly1305_setskey(struct poly1305_desc_ctx *dctx, const u8 *key) @@ -107,22 +107,22 @@ static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx, srclen = datalen; } - r0 = dctx->r[0]; - r1 = dctx->r[1]; - r2 = dctx->r[2]; - r3 = dctx->r[3]; - r4 = dctx->r[4]; + r0 = dctx->r.r[0]; + r1 = dctx->r.r[1]; + r2 = dctx->r.r[2]; + r3 = dctx->r.r[3]; + r4 = dctx->r.r[4]; s1 = r1 * 5; s2 = r2 * 5; s3 = r3 * 5; s4 = r4 * 5; - h0 = dctx->h[0]; - h1 = dctx->h[1]; - h2 = dctx->h[2]; - h3 = dctx->h[3]; - h4 = dctx->h[4]; + h0 = dctx->h.h[0]; + h1 = dctx->h.h[1]; + h2 = dctx->h.h[2]; + h3 = dctx->h.h[3]; + h4 = dctx->h.h[4]; while (likely(srclen >= POLY1305_BLOCK_SIZE)) { @@ -157,11 +157,11 @@ static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx, srclen -= POLY1305_BLOCK_SIZE; } - dctx->h[0] = h0; - dctx->h[1] = h1; - dctx->h[2] = h2; - dctx->h[3] = h3; - dctx->h[4] = h4; + dctx->h.h[0] = h0; + dctx->h.h[1] = h1; + dctx->h.h[2] = h2; + dctx->h.h[3] = h3; + dctx->h.h[4] = h4; return srclen; } @@ -220,11 +220,11 @@ int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) } /* fully carry h */ - h0 = dctx->h[0]; - h1 = dctx->h[1]; - h2 = dctx->h[2]; - h3 = dctx->h[3]; - h4 = dctx->h[4]; + h0 = dctx->h.h[0]; + h1 = dctx->h.h[1]; + h2 = dctx->h.h[2]; + h3 = dctx->h.h[3]; + h4 = dctx->h.h[4]; h2 += (h1 >> 26); h1 = h1 & 0x3ffffff; h3 += (h2 >> 26); h2 = h2 & 0x3ffffff; diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h index f718a19da82f..493244c46664 100644 --- a/include/crypto/poly1305.h +++ b/include/crypto/poly1305.h @@ -13,13 +13,21 @@ #define POLY1305_KEY_SIZE 32 #define POLY1305_DIGEST_SIZE 16 +struct poly1305_key { + u32 r[5]; /* key, base 2^26 */ +}; + +struct poly1305_state { + u32 h[5]; /* accumulator, base 2^26 */ +}; + struct poly1305_desc_ctx { /* key */ - u32 r[5]; + struct poly1305_key r; /* finalize key */ u32 s[4]; /* accumulator */ - u32 h[5]; + struct poly1305_state h; /* partial buffer */ u8 buf[POLY1305_BLOCK_SIZE]; /* bytes used in partial buffer */ -- cgit v1.2.3-59-g8ed1b From 1b6fd3d5d18bbc1b1abf3b0cbc4b95a9a63d407b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Nov 2018 17:26:28 -0800 Subject: crypto: poly1305 - add Poly1305 core API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose a low-level Poly1305 API which implements the ε-almost-∆-universal (εA∆U) hash function underlying the Poly1305 MAC and supports block-aligned inputs only. This is needed for Adiantum hashing, which builds an εA∆U hash function from NH and a polynomial evaluation in GF(2^{130}-5); this polynomial evaluation is identical to the one the Poly1305 MAC does. However, the crypto_shash Poly1305 API isn't very appropriate for this because its calling convention assumes it is used as a MAC, with a 32-byte "one-time key" provided for every digest. But by design, in Adiantum hashing the performance of the polynomial evaluation isn't nearly as critical as NH. So it suffices to just have some C helper functions. Thus, this patch adds such functions. Acked-by: Martin Willi Signed-off-by: Eric Biggers Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/poly1305_generic.c | 174 ++++++++++++++++++++++++++-------------------- include/crypto/poly1305.h | 16 +++++ 2 files changed, 115 insertions(+), 75 deletions(-) diff --git a/crypto/poly1305_generic.c b/crypto/poly1305_generic.c index a23173f351b7..2a06874204e8 100644 --- a/crypto/poly1305_generic.c +++ b/crypto/poly1305_generic.c @@ -38,7 +38,7 @@ int crypto_poly1305_init(struct shash_desc *desc) { struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - memset(dctx->h.h, 0, sizeof(dctx->h.h)); + poly1305_core_init(&dctx->h); dctx->buflen = 0; dctx->rset = false; dctx->sset = false; @@ -47,23 +47,16 @@ int crypto_poly1305_init(struct shash_desc *desc) } EXPORT_SYMBOL_GPL(crypto_poly1305_init); -static void poly1305_setrkey(struct poly1305_desc_ctx *dctx, const u8 *key) +void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key) { /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ - dctx->r.r[0] = (get_unaligned_le32(key + 0) >> 0) & 0x3ffffff; - dctx->r.r[1] = (get_unaligned_le32(key + 3) >> 2) & 0x3ffff03; - dctx->r.r[2] = (get_unaligned_le32(key + 6) >> 4) & 0x3ffc0ff; - dctx->r.r[3] = (get_unaligned_le32(key + 9) >> 6) & 0x3f03fff; - dctx->r.r[4] = (get_unaligned_le32(key + 12) >> 8) & 0x00fffff; -} - -static void poly1305_setskey(struct poly1305_desc_ctx *dctx, const u8 *key) -{ - dctx->s[0] = get_unaligned_le32(key + 0); - dctx->s[1] = get_unaligned_le32(key + 4); - dctx->s[2] = get_unaligned_le32(key + 8); - dctx->s[3] = get_unaligned_le32(key + 12); + key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff; + key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03; + key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff; + key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff; + key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff; } +EXPORT_SYMBOL_GPL(poly1305_core_setkey); /* * Poly1305 requires a unique key for each tag, which implies that we can't set @@ -75,13 +68,16 @@ unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx, { if (!dctx->sset) { if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) { - poly1305_setrkey(dctx, src); + poly1305_core_setkey(&dctx->r, src); src += POLY1305_BLOCK_SIZE; srclen -= POLY1305_BLOCK_SIZE; dctx->rset = true; } if (srclen >= POLY1305_BLOCK_SIZE) { - poly1305_setskey(dctx, src); + dctx->s[0] = get_unaligned_le32(src + 0); + dctx->s[1] = get_unaligned_le32(src + 4); + dctx->s[2] = get_unaligned_le32(src + 8); + dctx->s[3] = get_unaligned_le32(src + 12); src += POLY1305_BLOCK_SIZE; srclen -= POLY1305_BLOCK_SIZE; dctx->sset = true; @@ -91,41 +87,37 @@ unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx, } EXPORT_SYMBOL_GPL(crypto_poly1305_setdesckey); -static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx, - const u8 *src, unsigned int srclen, - u32 hibit) +static void poly1305_blocks_internal(struct poly1305_state *state, + const struct poly1305_key *key, + const void *src, unsigned int nblocks, + u32 hibit) { u32 r0, r1, r2, r3, r4; u32 s1, s2, s3, s4; u32 h0, h1, h2, h3, h4; u64 d0, d1, d2, d3, d4; - unsigned int datalen; - if (unlikely(!dctx->sset)) { - datalen = crypto_poly1305_setdesckey(dctx, src, srclen); - src += srclen - datalen; - srclen = datalen; - } + if (!nblocks) + return; - r0 = dctx->r.r[0]; - r1 = dctx->r.r[1]; - r2 = dctx->r.r[2]; - r3 = dctx->r.r[3]; - r4 = dctx->r.r[4]; + r0 = key->r[0]; + r1 = key->r[1]; + r2 = key->r[2]; + r3 = key->r[3]; + r4 = key->r[4]; s1 = r1 * 5; s2 = r2 * 5; s3 = r3 * 5; s4 = r4 * 5; - h0 = dctx->h.h[0]; - h1 = dctx->h.h[1]; - h2 = dctx->h.h[2]; - h3 = dctx->h.h[3]; - h4 = dctx->h.h[4]; - - while (likely(srclen >= POLY1305_BLOCK_SIZE)) { + h0 = state->h[0]; + h1 = state->h[1]; + h2 = state->h[2]; + h3 = state->h[3]; + h4 = state->h[4]; + do { /* h += m[i] */ h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff; h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff; @@ -154,16 +146,36 @@ static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx, h1 += h0 >> 26; h0 = h0 & 0x3ffffff; src += POLY1305_BLOCK_SIZE; - srclen -= POLY1305_BLOCK_SIZE; - } + } while (--nblocks); - dctx->h.h[0] = h0; - dctx->h.h[1] = h1; - dctx->h.h[2] = h2; - dctx->h.h[3] = h3; - dctx->h.h[4] = h4; + state->h[0] = h0; + state->h[1] = h1; + state->h[2] = h2; + state->h[3] = h3; + state->h[4] = h4; +} - return srclen; +void poly1305_core_blocks(struct poly1305_state *state, + const struct poly1305_key *key, + const void *src, unsigned int nblocks) +{ + poly1305_blocks_internal(state, key, src, nblocks, 1 << 24); +} +EXPORT_SYMBOL_GPL(poly1305_core_blocks); + +static void poly1305_blocks(struct poly1305_desc_ctx *dctx, + const u8 *src, unsigned int srclen, u32 hibit) +{ + unsigned int datalen; + + if (unlikely(!dctx->sset)) { + datalen = crypto_poly1305_setdesckey(dctx, src, srclen); + src += srclen - datalen; + srclen = datalen; + } + + poly1305_blocks_internal(&dctx->h, &dctx->r, + src, srclen / POLY1305_BLOCK_SIZE, hibit); } int crypto_poly1305_update(struct shash_desc *desc, @@ -187,9 +199,9 @@ int crypto_poly1305_update(struct shash_desc *desc, } if (likely(srclen >= POLY1305_BLOCK_SIZE)) { - bytes = poly1305_blocks(dctx, src, srclen, 1 << 24); - src += srclen - bytes; - srclen = bytes; + poly1305_blocks(dctx, src, srclen, 1 << 24); + src += srclen - (srclen % POLY1305_BLOCK_SIZE); + srclen %= POLY1305_BLOCK_SIZE; } if (unlikely(srclen)) { @@ -201,30 +213,18 @@ int crypto_poly1305_update(struct shash_desc *desc, } EXPORT_SYMBOL_GPL(crypto_poly1305_update); -int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) +void poly1305_core_emit(const struct poly1305_state *state, void *dst) { - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); u32 h0, h1, h2, h3, h4; u32 g0, g1, g2, g3, g4; u32 mask; - u64 f = 0; - - if (unlikely(!dctx->sset)) - return -ENOKEY; - - if (unlikely(dctx->buflen)) { - dctx->buf[dctx->buflen++] = 1; - memset(dctx->buf + dctx->buflen, 0, - POLY1305_BLOCK_SIZE - dctx->buflen); - poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 0); - } /* fully carry h */ - h0 = dctx->h.h[0]; - h1 = dctx->h.h[1]; - h2 = dctx->h.h[2]; - h3 = dctx->h.h[3]; - h4 = dctx->h.h[4]; + h0 = state->h[0]; + h1 = state->h[1]; + h2 = state->h[2]; + h3 = state->h[3]; + h4 = state->h[4]; h2 += (h1 >> 26); h1 = h1 & 0x3ffffff; h3 += (h2 >> 26); h2 = h2 & 0x3ffffff; @@ -254,16 +254,40 @@ int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) h4 = (h4 & mask) | g4; /* h = h % (2^128) */ - h0 = (h0 >> 0) | (h1 << 26); - h1 = (h1 >> 6) | (h2 << 20); - h2 = (h2 >> 12) | (h3 << 14); - h3 = (h3 >> 18) | (h4 << 8); + put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0); + put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4); + put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8); + put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12); +} +EXPORT_SYMBOL_GPL(poly1305_core_emit); + +int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + __le32 digest[4]; + u64 f = 0; + + if (unlikely(!dctx->sset)) + return -ENOKEY; + + if (unlikely(dctx->buflen)) { + dctx->buf[dctx->buflen++] = 1; + memset(dctx->buf + dctx->buflen, 0, + POLY1305_BLOCK_SIZE - dctx->buflen); + poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 0); + } + + poly1305_core_emit(&dctx->h, digest); /* mac = (h + s) % (2^128) */ - f = (f >> 32) + h0 + dctx->s[0]; put_unaligned_le32(f, dst + 0); - f = (f >> 32) + h1 + dctx->s[1]; put_unaligned_le32(f, dst + 4); - f = (f >> 32) + h2 + dctx->s[2]; put_unaligned_le32(f, dst + 8); - f = (f >> 32) + h3 + dctx->s[3]; put_unaligned_le32(f, dst + 12); + f = (f >> 32) + le32_to_cpu(digest[0]) + dctx->s[0]; + put_unaligned_le32(f, dst + 0); + f = (f >> 32) + le32_to_cpu(digest[1]) + dctx->s[1]; + put_unaligned_le32(f, dst + 4); + f = (f >> 32) + le32_to_cpu(digest[2]) + dctx->s[2]; + put_unaligned_le32(f, dst + 8); + f = (f >> 32) + le32_to_cpu(digest[3]) + dctx->s[3]; + put_unaligned_le32(f, dst + 12); return 0; } diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h index 493244c46664..34317ed2071e 100644 --- a/include/crypto/poly1305.h +++ b/include/crypto/poly1305.h @@ -38,6 +38,22 @@ struct poly1305_desc_ctx { bool sset; }; +/* + * Poly1305 core functions. These implement the ε-almost-∆-universal hash + * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce + * ("s key") at the end. They also only support block-aligned inputs. + */ +void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key); +static inline void poly1305_core_init(struct poly1305_state *state) +{ + memset(state->h, 0, sizeof(state->h)); +} +void poly1305_core_blocks(struct poly1305_state *state, + const struct poly1305_key *key, + const void *src, unsigned int nblocks); +void poly1305_core_emit(const struct poly1305_state *state, void *dst); + +/* Crypto API helper functions for the Poly1305 MAC */ int crypto_poly1305_init(struct shash_desc *desc); unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx, const u8 *src, unsigned int srclen); -- cgit v1.2.3-59-g8ed1b From 26609a21a9460145e37d90947ad957b358a05288 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Nov 2018 17:26:29 -0800 Subject: crypto: nhpoly1305 - add NHPoly1305 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a generic implementation of NHPoly1305, an ε-almost-∆-universal hash function used in the Adiantum encryption mode. CONFIG_NHPOLY1305 is not selectable by itself since there won't be any real reason to enable it without also enabling Adiantum support. Signed-off-by: Eric Biggers Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/Kconfig | 5 + crypto/Makefile | 1 + crypto/nhpoly1305.c | 254 +++++++++ crypto/testmgr.c | 6 + crypto/testmgr.h | 1240 ++++++++++++++++++++++++++++++++++++++++++- include/crypto/nhpoly1305.h | 74 +++ 6 files changed, 1576 insertions(+), 4 deletions(-) create mode 100644 crypto/nhpoly1305.c create mode 100644 include/crypto/nhpoly1305.h diff --git a/crypto/Kconfig b/crypto/Kconfig index 4431c0db56b7..eaeb8a986b7d 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -496,6 +496,11 @@ config CRYPTO_KEYWRAP Support for key wrapping (NIST SP800-38F / RFC3394) without padding. +config CRYPTO_NHPOLY1305 + tristate + select CRYPTO_HASH + select CRYPTO_POLY1305 + comment "Hash modes" config CRYPTO_CMAC diff --git a/crypto/Makefile b/crypto/Makefile index 102e8525814f..c3310c85f09f 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -85,6 +85,7 @@ obj-$(CONFIG_CRYPTO_LRW) += lrw.o obj-$(CONFIG_CRYPTO_XTS) += xts.o obj-$(CONFIG_CRYPTO_CTR) += ctr.o obj-$(CONFIG_CRYPTO_KEYWRAP) += keywrap.o +obj-$(CONFIG_CRYPTO_NHPOLY1305) += nhpoly1305.o obj-$(CONFIG_CRYPTO_GCM) += gcm.o obj-$(CONFIG_CRYPTO_CCM) += ccm.o obj-$(CONFIG_CRYPTO_CHACHA20POLY1305) += chacha20poly1305.o diff --git a/crypto/nhpoly1305.c b/crypto/nhpoly1305.c new file mode 100644 index 000000000000..c8385853f699 --- /dev/null +++ b/crypto/nhpoly1305.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum + * + * Copyright 2018 Google LLC + */ + +/* + * "NHPoly1305" is the main component of Adiantum hashing. + * Specifically, it is the calculation + * + * H_M ← Poly1305_{K_M}(NH_{K_N}(pad_{128}(M))) + * + * from the procedure in section A.5 of the Adiantum paper [1]. It is an + * ε-almost-∆-universal (εA∆U) hash function for equal-length inputs over + * Z/(2^{128}Z), where the "∆" operation is addition. It hashes 1024-byte + * chunks of the input with the NH hash function [2], reducing the input length + * by 32x. The resulting NH digests are evaluated as a polynomial in + * GF(2^{130}-5), like in the Poly1305 MAC [3]. Note that the polynomial + * evaluation by itself would suffice to achieve the εA∆U property; NH is used + * for performance since it's over twice as fast as Poly1305. + * + * This is *not* a cryptographic hash function; do not use it as such! + * + * [1] Adiantum: length-preserving encryption for entry-level processors + * (https://eprint.iacr.org/2018/720.pdf) + * [2] UMAC: Fast and Secure Message Authentication + * (https://fastcrypto.org/umac/umac_proc.pdf) + * [3] The Poly1305-AES message-authentication code + * (https://cr.yp.to/mac/poly1305-20050329.pdf) + */ + +#include +#include +#include +#include +#include +#include +#include + +static void nh_generic(const u32 *key, const u8 *message, size_t message_len, + __le64 hash[NH_NUM_PASSES]) +{ + u64 sums[4] = { 0, 0, 0, 0 }; + + BUILD_BUG_ON(NH_PAIR_STRIDE != 2); + BUILD_BUG_ON(NH_NUM_PASSES != 4); + + while (message_len) { + u32 m0 = get_unaligned_le32(message + 0); + u32 m1 = get_unaligned_le32(message + 4); + u32 m2 = get_unaligned_le32(message + 8); + u32 m3 = get_unaligned_le32(message + 12); + + sums[0] += (u64)(u32)(m0 + key[ 0]) * (u32)(m2 + key[ 2]); + sums[1] += (u64)(u32)(m0 + key[ 4]) * (u32)(m2 + key[ 6]); + sums[2] += (u64)(u32)(m0 + key[ 8]) * (u32)(m2 + key[10]); + sums[3] += (u64)(u32)(m0 + key[12]) * (u32)(m2 + key[14]); + sums[0] += (u64)(u32)(m1 + key[ 1]) * (u32)(m3 + key[ 3]); + sums[1] += (u64)(u32)(m1 + key[ 5]) * (u32)(m3 + key[ 7]); + sums[2] += (u64)(u32)(m1 + key[ 9]) * (u32)(m3 + key[11]); + sums[3] += (u64)(u32)(m1 + key[13]) * (u32)(m3 + key[15]); + key += NH_MESSAGE_UNIT / sizeof(key[0]); + message += NH_MESSAGE_UNIT; + message_len -= NH_MESSAGE_UNIT; + } + + hash[0] = cpu_to_le64(sums[0]); + hash[1] = cpu_to_le64(sums[1]); + hash[2] = cpu_to_le64(sums[2]); + hash[3] = cpu_to_le64(sums[3]); +} + +/* Pass the next NH hash value through Poly1305 */ +static void process_nh_hash_value(struct nhpoly1305_state *state, + const struct nhpoly1305_key *key) +{ + BUILD_BUG_ON(NH_HASH_BYTES % POLY1305_BLOCK_SIZE != 0); + + poly1305_core_blocks(&state->poly_state, &key->poly_key, state->nh_hash, + NH_HASH_BYTES / POLY1305_BLOCK_SIZE); +} + +/* + * Feed the next portion of the source data, as a whole number of 16-byte + * "NH message units", through NH and Poly1305. Each NH hash is taken over + * 1024 bytes, except possibly the final one which is taken over a multiple of + * 16 bytes up to 1024. Also, in the case where data is passed in misaligned + * chunks, we combine partial hashes; the end result is the same either way. + */ +static void nhpoly1305_units(struct nhpoly1305_state *state, + const struct nhpoly1305_key *key, + const u8 *src, unsigned int srclen, nh_t nh_fn) +{ + do { + unsigned int bytes; + + if (state->nh_remaining == 0) { + /* Starting a new NH message */ + bytes = min_t(unsigned int, srclen, NH_MESSAGE_BYTES); + nh_fn(key->nh_key, src, bytes, state->nh_hash); + state->nh_remaining = NH_MESSAGE_BYTES - bytes; + } else { + /* Continuing a previous NH message */ + __le64 tmp_hash[NH_NUM_PASSES]; + unsigned int pos; + int i; + + pos = NH_MESSAGE_BYTES - state->nh_remaining; + bytes = min(srclen, state->nh_remaining); + nh_fn(&key->nh_key[pos / 4], src, bytes, tmp_hash); + for (i = 0; i < NH_NUM_PASSES; i++) + le64_add_cpu(&state->nh_hash[i], + le64_to_cpu(tmp_hash[i])); + state->nh_remaining -= bytes; + } + if (state->nh_remaining == 0) + process_nh_hash_value(state, key); + src += bytes; + srclen -= bytes; + } while (srclen); +} + +int crypto_nhpoly1305_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct nhpoly1305_key *ctx = crypto_shash_ctx(tfm); + int i; + + if (keylen != NHPOLY1305_KEY_SIZE) + return -EINVAL; + + poly1305_core_setkey(&ctx->poly_key, key); + key += POLY1305_BLOCK_SIZE; + + for (i = 0; i < NH_KEY_WORDS; i++) + ctx->nh_key[i] = get_unaligned_le32(key + i * sizeof(u32)); + + return 0; +} +EXPORT_SYMBOL(crypto_nhpoly1305_setkey); + +int crypto_nhpoly1305_init(struct shash_desc *desc) +{ + struct nhpoly1305_state *state = shash_desc_ctx(desc); + + poly1305_core_init(&state->poly_state); + state->buflen = 0; + state->nh_remaining = 0; + return 0; +} +EXPORT_SYMBOL(crypto_nhpoly1305_init); + +int crypto_nhpoly1305_update_helper(struct shash_desc *desc, + const u8 *src, unsigned int srclen, + nh_t nh_fn) +{ + struct nhpoly1305_state *state = shash_desc_ctx(desc); + const struct nhpoly1305_key *key = crypto_shash_ctx(desc->tfm); + unsigned int bytes; + + if (state->buflen) { + bytes = min(srclen, (int)NH_MESSAGE_UNIT - state->buflen); + memcpy(&state->buffer[state->buflen], src, bytes); + state->buflen += bytes; + if (state->buflen < NH_MESSAGE_UNIT) + return 0; + nhpoly1305_units(state, key, state->buffer, NH_MESSAGE_UNIT, + nh_fn); + state->buflen = 0; + src += bytes; + srclen -= bytes; + } + + if (srclen >= NH_MESSAGE_UNIT) { + bytes = round_down(srclen, NH_MESSAGE_UNIT); + nhpoly1305_units(state, key, src, bytes, nh_fn); + src += bytes; + srclen -= bytes; + } + + if (srclen) { + memcpy(state->buffer, src, srclen); + state->buflen = srclen; + } + return 0; +} +EXPORT_SYMBOL(crypto_nhpoly1305_update_helper); + +int crypto_nhpoly1305_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + return crypto_nhpoly1305_update_helper(desc, src, srclen, nh_generic); +} +EXPORT_SYMBOL(crypto_nhpoly1305_update); + +int crypto_nhpoly1305_final_helper(struct shash_desc *desc, u8 *dst, nh_t nh_fn) +{ + struct nhpoly1305_state *state = shash_desc_ctx(desc); + const struct nhpoly1305_key *key = crypto_shash_ctx(desc->tfm); + + if (state->buflen) { + memset(&state->buffer[state->buflen], 0, + NH_MESSAGE_UNIT - state->buflen); + nhpoly1305_units(state, key, state->buffer, NH_MESSAGE_UNIT, + nh_fn); + } + + if (state->nh_remaining) + process_nh_hash_value(state, key); + + poly1305_core_emit(&state->poly_state, dst); + return 0; +} +EXPORT_SYMBOL(crypto_nhpoly1305_final_helper); + +int crypto_nhpoly1305_final(struct shash_desc *desc, u8 *dst) +{ + return crypto_nhpoly1305_final_helper(desc, dst, nh_generic); +} +EXPORT_SYMBOL(crypto_nhpoly1305_final); + +static struct shash_alg nhpoly1305_alg = { + .base.cra_name = "nhpoly1305", + .base.cra_driver_name = "nhpoly1305-generic", + .base.cra_priority = 100, + .base.cra_ctxsize = sizeof(struct nhpoly1305_key), + .base.cra_module = THIS_MODULE, + .digestsize = POLY1305_DIGEST_SIZE, + .init = crypto_nhpoly1305_init, + .update = crypto_nhpoly1305_update, + .final = crypto_nhpoly1305_final, + .setkey = crypto_nhpoly1305_setkey, + .descsize = sizeof(struct nhpoly1305_state), +}; + +static int __init nhpoly1305_mod_init(void) +{ + return crypto_register_shash(&nhpoly1305_alg); +} + +static void __exit nhpoly1305_mod_exit(void) +{ + crypto_unregister_shash(&nhpoly1305_alg); +} + +module_init(nhpoly1305_mod_init); +module_exit(nhpoly1305_mod_exit); + +MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Eric Biggers "); +MODULE_ALIAS_CRYPTO("nhpoly1305"); +MODULE_ALIAS_CRYPTO("nhpoly1305-generic"); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 6ff60c3745f1..665911c24786 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -3311,6 +3311,12 @@ static const struct alg_test_desc alg_test_descs[] = { .dec = __VECS(morus640_dec_tv_template), } } + }, { + .alg = "nhpoly1305", + .test = alg_test_hash, + .suite = { + .hash = __VECS(nhpoly1305_tv_template) + } }, { .alg = "ofb(aes)", .test = alg_test_skcipher, diff --git a/crypto/testmgr.h b/crypto/testmgr.h index a23dca2b11d0..50ea1f2705c7 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -27,7 +27,7 @@ #define MAX_DIGEST_SIZE 64 #define MAX_TAP 8 -#define MAX_KEYLEN 160 +#define MAX_KEYLEN 1088 #define MAX_IVLEN 32 struct hash_testvec { @@ -35,10 +35,10 @@ struct hash_testvec { const char *key; const char *plaintext; const char *digest; - unsigned char tap[MAX_TAP]; + unsigned short tap[MAX_TAP]; + unsigned short np; unsigned short psize; - unsigned char np; - unsigned char ksize; + unsigned short ksize; }; /* @@ -5709,6 +5709,1238 @@ static const struct hash_testvec poly1305_tv_template[] = { }, }; +/* NHPoly1305 test vectors from https://github.com/google/adiantum */ +static const struct hash_testvec nhpoly1305_tv_template[] = { + { + .key = "\xd2\x5d\x4c\xdd\x8d\x2b\x7f\x7a" + "\xd9\xbe\x71\xec\xd1\x83\x52\xe3" + "\xe1\xad\xd7\x5c\x0a\x75\x9d\xec" + "\x1d\x13\x7e\x5d\x71\x07\xc9\xe4" + "\x57\x2d\x44\x68\xcf\xd8\xd6\xc5" + "\x39\x69\x7d\x32\x75\x51\x4f\x7e" + "\xb2\x4c\xc6\x90\x51\x6e\xd9\xd6" + "\xa5\x8b\x2d\xf1\x94\xf9\xf7\x5e" + "\x2c\x84\x7b\x41\x0f\x88\x50\x89" + "\x30\xd9\xa1\x38\x46\x6c\xc0\x4f" + "\xe8\xdf\xdc\x66\xab\x24\x43\x41" + "\x91\x55\x29\x65\x86\x28\x5e\x45" + "\xd5\x2d\xb7\x80\x08\x9a\xc3\xd4" + "\x9a\x77\x0a\xd4\xef\x3e\xe6\x3f" + "\x6f\x2f\x9b\x3a\x7d\x12\x1e\x80" + "\x6c\x44\xa2\x25\xe1\xf6\x60\xe9" + "\x0d\xaf\xc5\x3c\xa5\x79\xae\x64" + "\xbc\xa0\x39\xa3\x4d\x10\xe5\x4d" + "\xd5\xe7\x89\x7a\x13\xee\x06\x78" + "\xdc\xa4\xdc\x14\x27\xe6\x49\x38" + "\xd0\xe0\x45\x25\x36\xc5\xf4\x79" + "\x2e\x9a\x98\x04\xe4\x2b\x46\x52" + "\x7c\x33\xca\xe2\x56\x51\x50\xe2" + "\xa5\x9a\xae\x18\x6a\x13\xf8\xd2" + "\x21\x31\x66\x02\xe2\xda\x8d\x7e" + "\x41\x19\xb2\x61\xee\x48\x8f\xf1" + "\x65\x24\x2e\x1e\x68\xce\x05\xd9" + "\x2a\xcf\xa5\x3a\x57\xdd\x35\x91" + "\x93\x01\xca\x95\xfc\x2b\x36\x04" + "\xe6\x96\x97\x28\xf6\x31\xfe\xa3" + "\x9d\xf6\x6a\x1e\x80\x8d\xdc\xec" + "\xaf\x66\x11\x13\x02\x88\xd5\x27" + "\x33\xb4\x1a\xcd\xa3\xf6\xde\x31" + "\x8e\xc0\x0e\x6c\xd8\x5a\x97\x5e" + "\xdd\xfd\x60\x69\x38\x46\x3f\x90" + "\x5e\x97\xd3\x32\x76\xc7\x82\x49" + "\xfe\xba\x06\x5f\x2f\xa2\xfd\xff" + "\x80\x05\x40\xe4\x33\x03\xfb\x10" + "\xc0\xde\x65\x8c\xc9\x8d\x3a\x9d" + "\xb5\x7b\x36\x4b\xb5\x0c\xcf\x00" + "\x9c\x87\xe4\x49\xad\x90\xda\x4a" + "\xdd\xbd\xff\xe2\x32\x57\xd6\x78" + "\x36\x39\x6c\xd3\x5b\x9b\x88\x59" + "\x2d\xf0\x46\xe4\x13\x0e\x2b\x35" + "\x0d\x0f\x73\x8a\x4f\x26\x84\x75" + "\x88\x3c\xc5\x58\x66\x18\x1a\xb4" + "\x64\x51\x34\x27\x1b\xa4\x11\xc9" + "\x6d\x91\x8a\xfa\x32\x60\x9d\xd7" + "\x87\xe5\xaa\x43\x72\xf8\xda\xd1" + "\x48\x44\x13\x61\xdc\x8c\x76\x17" + "\x0c\x85\x4e\xf3\xdd\xa2\x42\xd2" + "\x74\xc1\x30\x1b\xeb\x35\x31\x29" + "\x5b\xd7\x4c\x94\x46\x35\xa1\x23" + "\x50\xf2\xa2\x8e\x7e\x4f\x23\x4f" + "\x51\xff\xe2\xc9\xa3\x7d\x56\x8b" + "\x41\xf2\xd0\xc5\x57\x7e\x59\xac" + "\xbb\x65\xf3\xfe\xf7\x17\xef\x63" + "\x7c\x6f\x23\xdd\x22\x8e\xed\x84" + "\x0e\x3b\x09\xb3\xf3\xf4\x8f\xcd" + "\x37\xa8\xe1\xa7\x30\xdb\xb1\xa2" + "\x9c\xa2\xdf\x34\x17\x3e\x68\x44" + "\xd0\xde\x03\x50\xd1\x48\x6b\x20" + "\xe2\x63\x45\xa5\xea\x87\xc2\x42" + "\x95\x03\x49\x05\xed\xe0\x90\x29" + "\x1a\xb8\xcf\x9b\x43\xcf\x29\x7a" + "\x63\x17\x41\x9f\xe0\xc9\x10\xfd" + "\x2c\x56\x8c\x08\x55\xb4\xa9\x27" + "\x0f\x23\xb1\x05\x6a\x12\x46\xc7" + "\xe1\xfe\x28\x93\x93\xd7\x2f\xdc" + "\x98\x30\xdb\x75\x8a\xbe\x97\x7a" + "\x02\xfb\x8c\xba\xbe\x25\x09\xbe" + "\xce\xcb\xa2\xef\x79\x4d\x0e\x9d" + "\x1b\x9d\xb6\x39\x34\x38\xfa\x07" + "\xec\xe8\xfc\x32\x85\x1d\xf7\x85" + "\x63\xc3\x3c\xc0\x02\x75\xd7\x3f" + "\xb2\x68\x60\x66\x65\x81\xc6\xb1" + "\x42\x65\x4b\x4b\x28\xd7\xc7\xaa" + "\x9b\xd2\xdc\x1b\x01\xe0\x26\x39" + "\x01\xc1\x52\x14\xd1\x3f\xb7\xe6" + "\x61\x41\xc7\x93\xd2\xa2\x67\xc6" + "\xf7\x11\xb5\xf5\xea\xdd\x19\xfb" + "\x4d\x21\x12\xd6\x7d\xf1\x10\xb0" + "\x89\x07\xc7\x5a\x52\x73\x70\x2f" + "\x32\xef\x65\x2b\x12\xb2\xf0\xf5" + "\x20\xe0\x90\x59\x7e\x64\xf1\x4c" + "\x41\xb3\xa5\x91\x08\xe6\x5e\x5f" + "\x05\x56\x76\xb4\xb0\xcd\x70\x53" + "\x10\x48\x9c\xff\xc2\x69\x55\x24" + "\x87\xef\x84\xea\xfb\xa7\xbf\xa0" + "\x91\x04\xad\x4f\x8b\x57\x54\x4b" + "\xb6\xe9\xd1\xac\x37\x2f\x1d\x2e" + "\xab\xa5\xa4\xe8\xff\xfb\xd9\x39" + "\x2f\xb7\xac\xd1\xfe\x0b\x9a\x80" + "\x0f\xb6\xf4\x36\x39\x90\x51\xe3" + "\x0a\x2f\xb6\x45\x76\x89\xcd\x61" + "\xfe\x48\x5f\x75\x1d\x13\x00\x62" + "\x80\x24\x47\xe7\xbc\x37\xd7\xe3" + "\x15\xe8\x68\x22\xaf\x80\x6f\x4b" + "\xa8\x9f\x01\x10\x48\x14\xc3\x02" + "\x52\xd2\xc7\x75\x9b\x52\x6d\x30" + "\xac\x13\x85\xc8\xf7\xa3\x58\x4b" + "\x49\xf7\x1c\x45\x55\x8c\x39\x9a" + "\x99\x6d\x97\x27\x27\xe6\xab\xdd" + "\x2c\x42\x1b\x35\xdd\x9d\x73\xbb" + "\x6c\xf3\x64\xf1\xfb\xb9\xf7\xe6" + "\x4a\x3c\xc0\x92\xc0\x2e\xb7\x1a" + "\xbe\xab\xb3\x5a\xe5\xea\xb1\x48" + "\x58\x13\x53\x90\xfd\xc3\x8e\x54" + "\xf9\x18\x16\x73\xe8\xcb\x6d\x39" + "\x0e\xd7\xe0\xfe\xb6\x9f\x43\x97" + "\xe8\xd0\x85\x56\x83\x3e\x98\x68" + "\x7f\xbd\x95\xa8\x9a\x61\x21\x8f" + "\x06\x98\x34\xa6\xc8\xd6\x1d\xf3" + "\x3d\x43\xa4\x9a\x8c\xe5\xd3\x5a" + "\x32\xa2\x04\x22\xa4\x19\x1a\x46" + "\x42\x7e\x4d\xe5\xe0\xe6\x0e\xca" + "\xd5\x58\x9d\x2c\xaf\xda\x33\x5c" + "\xb0\x79\x9e\xc9\xfc\xca\xf0\x2f" + "\xa8\xb2\x77\xeb\x7a\xa2\xdd\x37" + "\x35\x83\x07\xd6\x02\x1a\xb6\x6c" + "\x24\xe2\x59\x08\x0e\xfd\x3e\x46" + "\xec\x40\x93\xf4\x00\x26\x4f\x2a" + "\xff\x47\x2f\xeb\x02\x92\x26\x5b" + "\x53\x17\xc2\x8d\x2a\xc7\xa3\x1b" + "\xcd\xbc\xa7\xe8\xd1\x76\xe3\x80" + "\x21\xca\x5d\x3b\xe4\x9c\x8f\xa9" + "\x5b\x7f\x29\x7f\x7c\xd8\xed\x6d" + "\x8c\xb2\x86\x85\xe7\x77\xf2\x85" + "\xab\x38\xa9\x9d\xc1\x4e\xc5\x64" + "\x33\x73\x8b\x59\x03\xad\x05\xdf" + "\x25\x98\x31\xde\xef\x13\xf1\x9b" + "\x3c\x91\x9d\x7b\xb1\xfa\xe6\xbf" + "\x5b\xed\xa5\x55\xe6\xea\x6c\x74" + "\xf4\xb9\xe4\x45\x64\x72\x81\xc2" + "\x4c\x28\xd4\xcd\xac\xe2\xde\xf9" + "\xeb\x5c\xeb\x61\x60\x5a\xe5\x28", + .ksize = 1088, + .plaintext = "", + .psize = 0, + .digest = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + }, { + .key = "\x29\x21\x43\xcb\xcb\x13\x07\xde" + "\xbf\x48\xdf\x8a\x7f\xa2\x84\xde" + "\x72\x23\x9d\xf5\xf0\x07\xf2\x4c" + "\x20\x3a\x93\xb9\xcd\x5d\xfe\xcb" + "\x99\x2c\x2b\x58\xc6\x50\x5f\x94" + "\x56\xc3\x7c\x0d\x02\x3f\xb8\x5e" + "\x7b\xc0\x6c\x51\x34\x76\xc0\x0e" + "\xc6\x22\xc8\x9e\x92\xa0\x21\xc9" + "\x85\x5c\x7c\xf8\xe2\x64\x47\xc9" + "\xe4\xa2\x57\x93\xf8\xa2\x69\xcd" + "\x62\x98\x99\xf4\xd7\x7b\x14\xb1" + "\xd8\x05\xff\x04\x15\xc9\xe1\x6e" + "\x9b\xe6\x50\x6b\x0b\x3f\x22\x1f" + "\x08\xde\x0c\x5b\x08\x7e\xc6\x2f" + "\x6c\xed\xd6\xb2\x15\xa4\xb3\xf9" + "\xa7\x46\x38\x2a\xea\x69\xa5\xde" + "\x02\xc3\x96\x89\x4d\x55\x3b\xed" + "\x3d\x3a\x85\x77\xbf\x97\x45\x5c" + "\x9e\x02\x69\xe2\x1b\x68\xbe\x96" + "\xfb\x64\x6f\x0f\xf6\x06\x40\x67" + "\xfa\x04\xe3\x55\xfa\xbe\xa4\x60" + "\xef\x21\x66\x97\xe6\x9d\x5c\x1f" + "\x62\x37\xaa\x31\xde\xe4\x9c\x28" + "\x95\xe0\x22\x86\xf4\x4d\xf3\x07" + "\xfd\x5f\x3a\x54\x2c\x51\x80\x71" + "\xba\x78\x69\x5b\x65\xab\x1f\x81" + "\xed\x3b\xff\x34\xa3\xfb\xbc\x73" + "\x66\x7d\x13\x7f\xdf\x6e\xe2\xe2" + "\xeb\x4f\x6c\xda\x7d\x33\x57\xd0" + "\xd3\x7c\x95\x4f\x33\x58\x21\xc7" + "\xc0\xe5\x6f\x42\x26\xc6\x1f\x5e" + "\x85\x1b\x98\x9a\xa2\x1e\x55\x77" + "\x23\xdf\x81\x5e\x79\x55\x05\xfc" + "\xfb\xda\xee\xba\x5a\xba\xf7\x77" + "\x7f\x0e\xd3\xe1\x37\xfe\x8d\x2b" + "\xd5\x3f\xfb\xd0\xc0\x3c\x0b\x3f" + "\xcf\x3c\x14\xcf\xfb\x46\x72\x4c" + "\x1f\x39\xe2\xda\x03\x71\x6d\x23" + "\xef\x93\xcd\x39\xd9\x37\x80\x4d" + "\x65\x61\xd1\x2c\x03\xa9\x47\x72" + "\x4d\x1e\x0e\x16\x33\x0f\x21\x17" + "\xec\x92\xea\x6f\x37\x22\xa4\xd8" + "\x03\x33\x9e\xd8\x03\x69\x9a\xe8" + "\xb2\x57\xaf\x78\x99\x05\x12\xab" + "\x48\x90\x80\xf0\x12\x9b\x20\x64" + "\x7a\x1d\x47\x5f\xba\x3c\xf9\xc3" + "\x0a\x0d\x8d\xa1\xf9\x1b\x82\x13" + "\x3e\x0d\xec\x0a\x83\xc0\x65\xe1" + "\xe9\x95\xff\x97\xd6\xf2\xe4\xd5" + "\x86\xc0\x1f\x29\x27\x63\xd7\xde" + "\xb7\x0a\x07\x99\x04\x2d\xa3\x89" + "\xa2\x43\xcf\xf3\xe1\x43\xac\x4a" + "\x06\x97\xd0\x05\x4f\x87\xfa\xf9" + "\x9b\xbf\x52\x70\xbd\xbc\x6c\xf3" + "\x03\x13\x60\x41\x28\x09\xec\xcc" + "\xb1\x1a\xec\xd6\xfb\x6f\x2a\x89" + "\x5d\x0b\x53\x9c\x59\xc1\x84\x21" + "\x33\x51\x47\x19\x31\x9c\xd4\x0a" + "\x4d\x04\xec\x50\x90\x61\xbd\xbc" + "\x7e\xc8\xd9\x6c\x98\x1d\x45\x41" + "\x17\x5e\x97\x1c\xc5\xa8\xe8\xea" + "\x46\x58\x53\xf7\x17\xd5\xad\x11" + "\xc8\x54\xf5\x7a\x33\x90\xf5\x19" + "\xba\x36\xb4\xfc\x52\xa5\x72\x3d" + "\x14\xbb\x55\xa7\xe9\xe3\x12\xf7" + "\x1c\x30\xa2\x82\x03\xbf\x53\x91" + "\x2e\x60\x41\x9f\x5b\x69\x39\xf6" + "\x4d\xc8\xf8\x46\x7a\x7f\xa4\x98" + "\x36\xff\x06\xcb\xca\xe7\x33\xf2" + "\xc0\x4a\xf4\x3c\x14\x44\x5f\x6b" + "\x75\xef\x02\x36\x75\x08\x14\xfd" + "\x10\x8e\xa5\x58\xd0\x30\x46\x49" + "\xaf\x3a\xf8\x40\x3d\x35\xdb\x84" + "\x11\x2e\x97\x6a\xb7\x87\x7f\xad" + "\xf1\xfa\xa5\x63\x60\xd8\x5e\xbf" + "\x41\x78\x49\xcf\x77\xbb\x56\xbb" + "\x7d\x01\x67\x05\x22\xc8\x8f\x41" + "\xba\x81\xd2\xca\x2c\x38\xac\x76" + "\x06\xc1\x1a\xc2\xce\xac\x90\x67" + "\x57\x3e\x20\x12\x5b\xd9\x97\x58" + "\x65\x05\xb7\x04\x61\x7e\xd8\x3a" + "\xbf\x55\x3b\x13\xe9\x34\x5a\x37" + "\x36\xcb\x94\x45\xc5\x32\xb3\xa0" + "\x0c\x3e\x49\xc5\xd3\xed\xa7\xf0" + "\x1c\x69\xcc\xea\xcc\x83\xc9\x16" + "\x95\x72\x4b\xf4\x89\xd5\xb9\x10" + "\xf6\x2d\x60\x15\xea\x3c\x06\x66" + "\x9f\x82\xad\x17\xce\xd2\xa4\x48" + "\x7c\x65\xd9\xf8\x02\x4d\x9b\x4c" + "\x89\x06\x3a\x34\x85\x48\x89\x86" + "\xf9\x24\xa9\x54\x72\xdb\x44\x95" + "\xc7\x44\x1c\x19\x11\x4c\x04\xdc" + "\x13\xb9\x67\xc8\xc3\x3a\x6a\x50" + "\xfa\xd1\xfb\xe1\x88\xb6\xf1\xa3" + "\xc5\x3b\xdc\x38\x45\x16\x26\x02" + "\x3b\xb8\x8f\x8b\x58\x7d\x23\x04" + "\x50\x6b\x81\x9f\xae\x66\xac\x6f" + "\xcf\x2a\x9d\xf1\xfd\x1d\x57\x07" + "\xbe\x58\xeb\x77\x0c\xe3\xc2\x19" + "\x14\x74\x1b\x51\x1c\x4f\x41\xf3" + "\x32\x89\xb3\xe7\xde\x62\xf6\x5f" + "\xc7\x6a\x4a\x2a\x5b\x0f\x5f\x87" + "\x9c\x08\xb9\x02\x88\xc8\x29\xb7" + "\x94\x52\xfa\x52\xfe\xaa\x50\x10" + "\xba\x48\x75\x5e\x11\x1b\xe6\x39" + "\xd7\x82\x2c\x87\xf1\x1e\xa4\x38" + "\x72\x3e\x51\xe7\xd8\x3e\x5b\x7b" + "\x31\x16\x89\xba\xd6\xad\x18\x5e" + "\xba\xf8\x12\xb3\xf4\x6c\x47\x30" + "\xc0\x38\x58\xb3\x10\x8d\x58\x5d" + "\xb4\xfb\x19\x7e\x41\xc3\x66\xb8" + "\xd6\x72\x84\xe1\x1a\xc2\x71\x4c" + "\x0d\x4a\x21\x7a\xab\xa2\xc0\x36" + "\x15\xc5\xe9\x46\xd7\x29\x17\x76" + "\x5e\x47\x36\x7f\x72\x05\xa7\xcc" + "\x36\x63\xf9\x47\x7d\xe6\x07\x3c" + "\x8b\x79\x1d\x96\x61\x8d\x90\x65" + "\x7c\xf5\xeb\x4e\x6e\x09\x59\x6d" + "\x62\x50\x1b\x0f\xe0\xdc\x78\xf2" + "\x5b\x83\x1a\xa1\x11\x75\xfd\x18" + "\xd7\xe2\x8d\x65\x14\x21\xce\xbe" + "\xb5\x87\xe3\x0a\xda\x24\x0a\x64" + "\xa9\x9f\x03\x8d\x46\x5d\x24\x1a" + "\x8a\x0c\x42\x01\xca\xb1\x5f\x7c" + "\xa5\xac\x32\x4a\xb8\x07\x91\x18" + "\x6f\xb0\x71\x3c\xc9\xb1\xa8\xf8" + "\x5f\x69\xa5\xa1\xca\x9e\x7a\xaa" + "\xac\xe9\xc7\x47\x41\x75\x25\xc3" + "\x73\xe2\x0b\xdd\x6d\x52\x71\xbe" + "\xc5\xdc\xb4\xe7\x01\x26\x53\x77" + "\x86\x90\x85\x68\x6b\x7b\x03\x53" + "\xda\x52\x52\x51\x68\xc8\xf3\xec" + "\x6c\xd5\x03\x7a\xa3\x0e\xb4\x02" + "\x5f\x1a\xab\xee\xca\x67\x29\x7b" + "\xbd\x96\x59\xb3\x8b\x32\x7a\x92" + "\x9f\xd8\x25\x2b\xdf\xc0\x4c\xda", + .ksize = 1088, + .plaintext = "\xbc\xda\x81\xa8\x78\x79\x1c\xbf" + "\x77\x53\xba\x4c\x30\x5b\xb8\x33", + .psize = 16, + .digest = "\x04\xbf\x7f\x6a\xce\x72\xea\x6a" + "\x79\xdb\xb0\xc9\x60\xf6\x12\xcc", + .np = 6, + .tap = { 4, 4, 1, 1, 1, 5 }, + }, { + .key = "\x65\x4d\xe3\xf8\xd2\x4c\xac\x28" + "\x68\xf5\xb3\x81\x71\x4b\xa1\xfa" + "\x04\x0e\xd3\x81\x36\xbe\x0c\x81" + "\x5e\xaf\xbc\x3a\xa4\xc0\x8e\x8b" + "\x55\x63\xd3\x52\x97\x88\xd6\x19" + "\xbc\x96\xdf\x49\xff\x04\x63\xf5" + "\x0c\x11\x13\xaa\x9e\x1f\x5a\xf7" + "\xdd\xbd\x37\x80\xc3\xd0\xbe\xa7" + "\x05\xc8\x3c\x98\x1e\x05\x3c\x84" + "\x39\x61\xc4\xed\xed\x71\x1b\xc4" + "\x74\x45\x2c\xa1\x56\x70\x97\xfd" + "\x44\x18\x07\x7d\xca\x60\x1f\x73" + "\x3b\x6d\x21\xcb\x61\x87\x70\x25" + "\x46\x21\xf1\x1f\x21\x91\x31\x2d" + "\x5d\xcc\xb7\xd1\x84\x3e\x3d\xdb" + "\x03\x53\x2a\x82\xa6\x9a\x95\xbc" + "\x1a\x1e\x0a\x5e\x07\x43\xab\x43" + "\xaf\x92\x82\x06\x91\x04\x09\xf4" + "\x17\x0a\x9a\x2c\x54\xdb\xb8\xf4" + "\xd0\xf0\x10\x66\x24\x8d\xcd\xda" + "\xfe\x0e\x45\x9d\x6f\xc4\x4e\xf4" + "\x96\xaf\x13\xdc\xa9\xd4\x8c\xc4" + "\xc8\x57\x39\x3c\xc2\xd3\x0a\x76" + "\x4a\x1f\x75\x83\x44\xc7\xd1\x39" + "\xd8\xb5\x41\xba\x73\x87\xfa\x96" + "\xc7\x18\x53\xfb\x9b\xda\xa0\x97" + "\x1d\xee\x60\x85\x9e\x14\xc3\xce" + "\xc4\x05\x29\x3b\x95\x30\xa3\xd1" + "\x9f\x82\x6a\x04\xf5\xa7\x75\x57" + "\x82\x04\xfe\x71\x51\x71\xb1\x49" + "\x50\xf8\xe0\x96\xf1\xfa\xa8\x88" + "\x3f\xa0\x86\x20\xd4\x60\x79\x59" + "\x17\x2d\xd1\x09\xf4\xec\x05\x57" + "\xcf\x62\x7e\x0e\x7e\x60\x78\xe6" + "\x08\x60\x29\xd8\xd5\x08\x1a\x24" + "\xc4\x6c\x24\xe7\x92\x08\x3d\x8a" + "\x98\x7a\xcf\x99\x0a\x65\x0e\xdc" + "\x8c\x8a\xbe\x92\x82\x91\xcc\x62" + "\x30\xb6\xf4\x3f\xc6\x8a\x7f\x12" + "\x4a\x8a\x49\xfa\x3f\x5c\xd4\x5a" + "\xa6\x82\xa3\xe6\xaa\x34\x76\xb2" + "\xab\x0a\x30\xef\x6c\x77\x58\x3f" + "\x05\x6b\xcc\x5c\xae\xdc\xd7\xb9" + "\x51\x7e\x8d\x32\x5b\x24\x25\xbe" + "\x2b\x24\x01\xcf\x80\xda\x16\xd8" + "\x90\x72\x2c\xad\x34\x8d\x0c\x74" + "\x02\xcb\xfd\xcf\x6e\xef\x97\xb5" + "\x4c\xf2\x68\xca\xde\x43\x9e\x8a" + "\xc5\x5f\x31\x7f\x14\x71\x38\xec" + "\xbd\x98\xe5\x71\xc4\xb5\xdb\xef" + "\x59\xd2\xca\xc0\xc1\x86\x75\x01" + "\xd4\x15\x0d\x6f\xa4\xf7\x7b\x37" + "\x47\xda\x18\x93\x63\xda\xbe\x9e" + "\x07\xfb\xb2\x83\xd5\xc4\x34\x55" + "\xee\x73\xa1\x42\x96\xf9\x66\x41" + "\xa4\xcc\xd2\x93\x6e\xe1\x0a\xbb" + "\xd2\xdd\x18\x23\xe6\x6b\x98\x0b" + "\x8a\x83\x59\x2c\xc3\xa6\x59\x5b" + "\x01\x22\x59\xf7\xdc\xb0\x87\x7e" + "\xdb\x7d\xf4\x71\x41\xab\xbd\xee" + "\x79\xbe\x3c\x01\x76\x0b\x2d\x0a" + "\x42\xc9\x77\x8c\xbb\x54\x95\x60" + "\x43\x2e\xe0\x17\x52\xbd\x90\xc9" + "\xc2\x2c\xdd\x90\x24\x22\x76\x40" + "\x5c\xb9\x41\xc9\xa1\xd5\xbd\xe3" + "\x44\xe0\xa4\xab\xcc\xb8\xe2\x32" + "\x02\x15\x04\x1f\x8c\xec\x5d\x14" + "\xac\x18\xaa\xef\x6e\x33\x19\x6e" + "\xde\xfe\x19\xdb\xeb\x61\xca\x18" + "\xad\xd8\x3d\xbf\x09\x11\xc7\xa5" + "\x86\x0b\x0f\xe5\x3e\xde\xe8\xd9" + "\x0a\x69\x9e\x4c\x20\xff\xf9\xc5" + "\xfa\xf8\xf3\x7f\xa5\x01\x4b\x5e" + "\x0f\xf0\x3b\x68\xf0\x46\x8c\x2a" + "\x7a\xc1\x8f\xa0\xfe\x6a\x5b\x44" + "\x70\x5c\xcc\x92\x2c\x6f\x0f\xbd" + "\x25\x3e\xb7\x8e\x73\x58\xda\xc9" + "\xa5\xaa\x9e\xf3\x9b\xfd\x37\x3e" + "\xe2\x88\xa4\x7b\xc8\x5c\xa8\x93" + "\x0e\xe7\x9a\x9c\x2e\x95\x18\x9f" + "\xc8\x45\x0c\x88\x9e\x53\x4f\x3a" + "\x76\xc1\x35\xfa\x17\xd8\xac\xa0" + "\x0c\x2d\x47\x2e\x4f\x69\x9b\xf7" + "\xd0\xb6\x96\x0c\x19\xb3\x08\x01" + "\x65\x7a\x1f\xc7\x31\x86\xdb\xc8" + "\xc1\x99\x8f\xf8\x08\x4a\x9d\x23" + "\x22\xa8\xcf\x27\x01\x01\x88\x93" + "\x9c\x86\x45\xbd\xe0\x51\xca\x52" + "\x84\xba\xfe\x03\xf7\xda\xc5\xce" + "\x3e\x77\x75\x86\xaf\x84\xc8\x05" + "\x44\x01\x0f\x02\xf3\x58\xb0\x06" + "\x5a\xd7\x12\x30\x8d\xdf\x1f\x1f" + "\x0a\xe6\xd2\xea\xf6\x3a\x7a\x99" + "\x63\xe8\xd2\xc1\x4a\x45\x8b\x40" + "\x4d\x0a\xa9\x76\x92\xb3\xda\x87" + "\x36\x33\xf0\x78\xc3\x2f\x5f\x02" + "\x1a\x6a\x2c\x32\xcd\x76\xbf\xbd" + "\x5a\x26\x20\x28\x8c\x8c\xbc\x52" + "\x3d\x0a\xc9\xcb\xab\xa4\x21\xb0" + "\x54\x40\x81\x44\xc7\xd6\x1c\x11" + "\x44\xc6\x02\x92\x14\x5a\xbf\x1a" + "\x09\x8a\x18\xad\xcd\x64\x3d\x53" + "\x4a\xb6\xa5\x1b\x57\x0e\xef\xe0" + "\x8c\x44\x5f\x7d\xbd\x6c\xfd\x60" + "\xae\x02\x24\xb6\x99\xdd\x8c\xaf" + "\x59\x39\x75\x3c\xd1\x54\x7b\x86" + "\xcc\x99\xd9\x28\x0c\xb0\x94\x62" + "\xf9\x51\xd1\x19\x96\x2d\x66\xf5" + "\x55\xcf\x9e\x59\xe2\x6b\x2c\x08" + "\xc0\x54\x48\x24\x45\xc3\x8c\x73" + "\xea\x27\x6e\x66\x7d\x1d\x0e\x6e" + "\x13\xe8\x56\x65\x3a\xb0\x81\x5c" + "\xf0\xe8\xd8\x00\x6b\xcd\x8f\xad" + "\xdd\x53\xf3\xa4\x6c\x43\xd6\x31" + "\xaf\xd2\x76\x1e\x91\x12\xdb\x3c" + "\x8c\xc2\x81\xf0\x49\xdb\xe2\x6b" + "\x76\x62\x0a\x04\xe4\xaa\x8a\x7c" + "\x08\x0b\x5d\xd0\xee\x1d\xfb\xc4" + "\x02\x75\x42\xd6\xba\xa7\x22\xa8" + "\x47\x29\xb7\x85\x6d\x93\x3a\xdb" + "\x00\x53\x0b\xa2\xeb\xf8\xfe\x01" + "\x6f\x8a\x31\xd6\x17\x05\x6f\x67" + "\x88\x95\x32\xfe\x4f\xa6\x4b\xf8" + "\x03\xe4\xcd\x9a\x18\xe8\x4e\x2d" + "\xf7\x97\x9a\x0c\x7d\x9f\x7e\x44" + "\x69\x51\xe0\x32\x6b\x62\x86\x8f" + "\xa6\x8e\x0b\x21\x96\xe5\xaf\x77" + "\xc0\x83\xdf\xa5\x0e\xd0\xa1\x04" + "\xaf\xc1\x10\xcb\x5a\x40\xe4\xe3" + "\x38\x7e\x07\xe8\x4d\xfa\xed\xc5" + "\xf0\x37\xdf\xbb\x8a\xcf\x3d\xdc" + "\x61\xd2\xc6\x2b\xff\x07\xc9\x2f" + "\x0c\x2d\x5c\x07\xa8\x35\x6a\xfc" + "\xae\x09\x03\x45\x74\x51\x4d\xc4" + "\xb8\x23\x87\x4a\x99\x27\x20\x87" + "\x62\x44\x0a\x4a\xce\x78\x47\x22", + .ksize = 1088, + .plaintext = "\x8e\xb0\x4c\xde\x9c\x4a\x04\x5a" + "\xf6\xa9\x7f\x45\x25\xa5\x7b\x3a" + "\xbc\x4d\x73\x39\x81\xb5\xbd\x3d" + "\x21\x6f\xd7\x37\x50\x3c\x7b\x28" + "\xd1\x03\x3a\x17\xed\x7b\x7c\x2a" + "\x16\xbc\xdf\x19\x89\x52\x71\x31" + "\xb6\xc0\xfd\xb5\xd3\xba\x96\x99" + "\xb6\x34\x0b\xd0\x99\x93\xfc\x1a" + "\x01\x3c\x85\xc6\x9b\x78\x5c\x8b" + "\xfe\xae\xd2\xbf\xb2\x6f\xf9\xed" + "\xc8\x25\x17\xfe\x10\x3b\x7d\xda" + "\xf4\x8d\x35\x4b\x7c\x7b\x82\xe7" + "\xc2\xb3\xee\x60\x4a\x03\x86\xc9" + "\x4e\xb5\xc4\xbe\xd2\xbd\x66\xf1" + "\x13\xf1\x09\xab\x5d\xca\x63\x1f" + "\xfc\xfb\x57\x2a\xfc\xca\x66\xd8" + "\x77\x84\x38\x23\x1d\xac\xd3\xb3" + "\x7a\xad\x4c\x70\xfa\x9c\xc9\x61" + "\xa6\x1b\xba\x33\x4b\x4e\x33\xec" + "\xa0\xa1\x64\x39\x40\x05\x1c\xc2" + "\x3f\x49\x9d\xae\xf2\xc5\xf2\xc5" + "\xfe\xe8\xf4\xc2\xf9\x96\x2d\x28" + "\x92\x30\x44\xbc\xd2\x7f\xe1\x6e" + "\x62\x02\x8f\x3d\x1c\x80\xda\x0e" + "\x6a\x90\x7e\x75\xff\xec\x3e\xc4" + "\xcd\x16\x34\x3b\x05\x6d\x4d\x20" + "\x1c\x7b\xf5\x57\x4f\xfa\x3d\xac" + "\xd0\x13\x55\xe8\xb3\xe1\x1b\x78" + "\x30\xe6\x9f\x84\xd4\x69\xd1\x08" + "\x12\x77\xa7\x4a\xbd\xc0\xf2\xd2" + "\x78\xdd\xa3\x81\x12\xcb\x6c\x14" + "\x90\x61\xe2\x84\xc6\x2b\x16\xcc" + "\x40\x99\x50\x88\x01\x09\x64\x4f" + "\x0a\x80\xbe\x61\xae\x46\xc9\x0a" + "\x5d\xe0\xfb\x72\x7a\x1a\xdd\x61" + "\x63\x20\x05\xa0\x4a\xf0\x60\x69" + "\x7f\x92\xbc\xbf\x4e\x39\x4d\xdd" + "\x74\xd1\xb7\xc0\x5a\x34\xb7\xae" + "\x76\x65\x2e\xbc\x36\xb9\x04\x95" + "\x42\xe9\x6f\xca\x78\xb3\x72\x07" + "\xa3\xba\x02\x94\x67\x4c\xb1\xd7" + "\xe9\x30\x0d\xf0\x3b\xb8\x10\x6d" + "\xea\x2b\x21\xbf\x74\x59\x82\x97" + "\x85\xaa\xf1\xd7\x54\x39\xeb\x05" + "\xbd\xf3\x40\xa0\x97\xe6\x74\xfe" + "\xb4\x82\x5b\xb1\x36\xcb\xe8\x0d" + "\xce\x14\xd9\xdf\xf1\x94\x22\xcd" + "\xd6\x00\xba\x04\x4c\x05\x0c\xc0" + "\xd1\x5a\xeb\x52\xd5\xa8\x8e\xc8" + "\x97\xa1\xaa\xc1\xea\xc1\xbe\x7c" + "\x36\xb3\x36\xa0\xc6\x76\x66\xc5" + "\xe2\xaf\xd6\x5c\xe2\xdb\x2c\xb3" + "\x6c\xb9\x99\x7f\xff\x9f\x03\x24" + "\xe1\x51\x44\x66\xd8\x0c\x5d\x7f" + "\x5c\x85\x22\x2a\xcf\x6d\x79\x28" + "\xab\x98\x01\x72\xfe\x80\x87\x5f" + "\x46\xba\xef\x81\x24\xee\xbf\xb0" + "\x24\x74\xa3\x65\x97\x12\xc4\xaf" + "\x8b\xa0\x39\xda\x8a\x7e\x74\x6e" + "\x1b\x42\xb4\x44\x37\xfc\x59\xfd" + "\x86\xed\xfb\x8c\x66\x33\xda\x63" + "\x75\xeb\xe1\xa4\x85\x4f\x50\x8f" + "\x83\x66\x0d\xd3\x37\xfa\xe6\x9c" + "\x4f\x30\x87\x35\x18\xe3\x0b\xb7" + "\x6e\x64\x54\xcd\x70\xb3\xde\x54" + "\xb7\x1d\xe6\x4c\x4d\x55\x12\x12" + "\xaf\x5f\x7f\x5e\xee\x9d\xe8\x8e" + "\x32\x9d\x4e\x75\xeb\xc6\xdd\xaa" + "\x48\x82\xa4\x3f\x3c\xd7\xd3\xa8" + "\x63\x9e\x64\xfe\xe3\x97\x00\x62" + "\xe5\x40\x5d\xc3\xad\x72\xe1\x28" + "\x18\x50\xb7\x75\xef\xcd\x23\xbf" + "\x3f\xc0\x51\x36\xf8\x41\xc3\x08" + "\xcb\xf1\x8d\x38\x34\xbd\x48\x45" + "\x75\xed\xbc\x65\x7b\xb5\x0c\x9b" + "\xd7\x67\x7d\x27\xb4\xc4\x80\xd7" + "\xa9\xb9\xc7\x4a\x97\xaa\xda\xc8" + "\x3c\x74\xcf\x36\x8f\xe4\x41\xe3" + "\xd4\xd3\x26\xa7\xf3\x23\x9d\x8f" + "\x6c\x20\x05\x32\x3e\xe0\xc3\xc8" + "\x56\x3f\xa7\x09\xb7\xfb\xc7\xf7" + "\xbe\x2a\xdd\x0f\x06\x7b\x0d\xdd" + "\xb0\xb4\x86\x17\xfd\xb9\x04\xe5" + "\xc0\x64\x5d\xad\x2a\x36\x38\xdb" + "\x24\xaf\x5b\xff\xca\xf9\x41\xe8" + "\xf9\x2f\x1e\x5e\xf9\xf5\xd5\xf2" + "\xb2\x88\xca\xc9\xa1\x31\xe2\xe8" + "\x10\x95\x65\xbf\xf1\x11\x61\x7a" + "\x30\x1a\x54\x90\xea\xd2\x30\xf6" + "\xa5\xad\x60\xf9\x4d\x84\x21\x1b" + "\xe4\x42\x22\xc8\x12\x4b\xb0\x58" + "\x3e\x9c\x2d\x32\x95\x0a\x8e\xb0" + "\x0a\x7e\x77\x2f\xe8\x97\x31\x6a" + "\xf5\x59\xb4\x26\xe6\x37\x12\xc9" + "\xcb\xa0\x58\x33\x6f\xd5\x55\x55" + "\x3c\xa1\x33\xb1\x0b\x7e\x2e\xb4" + "\x43\x2a\x84\x39\xf0\x9c\xf4\x69" + "\x4f\x1e\x79\xa6\x15\x1b\x87\xbb" + "\xdb\x9b\xe0\xf1\x0b\xba\xe3\x6e" + "\xcc\x2f\x49\x19\x22\x29\xfc\x71" + "\xbb\x77\x38\x18\x61\xaf\x85\x76" + "\xeb\xd1\x09\xcc\x86\x04\x20\x9a" + "\x66\x53\x2f\x44\x8b\xc6\xa3\xd2" + "\x5f\xc7\x79\x82\x66\xa8\x6e\x75" + "\x7d\x94\xd1\x86\x75\x0f\xa5\x4f" + "\x3c\x7a\x33\xce\xd1\x6e\x9d\x7b" + "\x1f\x91\x37\xb8\x37\x80\xfb\xe0" + "\x52\x26\xd0\x9a\xd4\x48\x02\x41" + "\x05\xe3\x5a\x94\xf1\x65\x61\x19" + "\xb8\x88\x4e\x2b\xea\xba\x8b\x58" + "\x8b\x42\x01\x00\xa8\xfe\x00\x5c" + "\xfe\x1c\xee\x31\x15\x69\xfa\xb3" + "\x9b\x5f\x22\x8e\x0d\x2c\xe3\xa5" + "\x21\xb9\x99\x8a\x8e\x94\x5a\xef" + "\x13\x3e\x99\x96\x79\x6e\xd5\x42" + "\x36\x03\xa9\xe2\xca\x65\x4e\x8a" + "\x8a\x30\xd2\x7d\x74\xe7\xf0\xaa" + "\x23\x26\xdd\xcb\x82\x39\xfc\x9d" + "\x51\x76\x21\x80\xa2\xbe\x93\x03" + "\x47\xb0\xc1\xb6\xdc\x63\xfd\x9f" + "\xca\x9d\xa5\xca\x27\x85\xe2\xd8" + "\x15\x5b\x7e\x14\x7a\xc4\x89\xcc" + "\x74\x14\x4b\x46\xd2\xce\xac\x39" + "\x6b\x6a\x5a\xa4\x0e\xe3\x7b\x15" + "\x94\x4b\x0f\x74\xcb\x0c\x7f\xa9" + "\xbe\x09\x39\xa3\xdd\x56\x5c\xc7" + "\x99\x56\x65\x39\xf4\x0b\x7d\x87" + "\xec\xaa\xe3\x4d\x22\x65\x39\x4e", + .psize = 1024, + .digest = "\x64\x3a\xbc\xc3\x3f\x74\x40\x51" + "\x6e\x56\x01\x1a\x51\xec\x36\xde", + .np = 8, + .tap = { 64, 203, 267, 28, 263, 62, 54, 83 }, + }, { + .key = "\x1b\x82\x2e\x1b\x17\x23\xb9\x6d" + "\xdc\x9c\xda\x99\x07\xe3\x5f\xd8" + "\xd2\xf8\x43\x80\x8d\x86\x7d\x80" + "\x1a\xd0\xcc\x13\xb9\x11\x05\x3f" + "\x7e\xcf\x7e\x80\x0e\xd8\x25\x48" + "\x8b\xaa\x63\x83\x92\xd0\x72\xf5" + "\x4f\x67\x7e\x50\x18\x25\xa4\xd1" + "\xe0\x7e\x1e\xba\xd8\xa7\x6e\xdb" + "\x1a\xcc\x0d\xfe\x9f\x6d\x22\x35" + "\xe1\xe6\xe0\xa8\x7b\x9c\xb1\x66" + "\xa3\xf8\xff\x4d\x90\x84\x28\xbc" + "\xdc\x19\xc7\x91\x49\xfc\xf6\x33" + "\xc9\x6e\x65\x7f\x28\x6f\x68\x2e" + "\xdf\x1a\x75\xe9\xc2\x0c\x96\xb9" + "\x31\x22\xc4\x07\xc6\x0a\x2f\xfd" + "\x36\x06\x5f\x5c\xc5\xb1\x3a\xf4" + "\x5e\x48\xa4\x45\x2b\x88\xa7\xee" + "\xa9\x8b\x52\xcc\x99\xd9\x2f\xb8" + "\xa4\x58\x0a\x13\xeb\x71\x5a\xfa" + "\xe5\x5e\xbe\xf2\x64\xad\x75\xbc" + "\x0b\x5b\x34\x13\x3b\x23\x13\x9a" + "\x69\x30\x1e\x9a\xb8\x03\xb8\x8b" + "\x3e\x46\x18\x6d\x38\xd9\xb3\xd8" + "\xbf\xf1\xd0\x28\xe6\x51\x57\x80" + "\x5e\x99\xfb\xd0\xce\x1e\x83\xf7" + "\xe9\x07\x5a\x63\xa9\xef\xce\xa5" + "\xfb\x3f\x37\x17\xfc\x0b\x37\x0e" + "\xbb\x4b\x21\x62\xb7\x83\x0e\xa9" + "\x9e\xb0\xc4\xad\x47\xbe\x35\xe7" + "\x51\xb2\xf2\xac\x2b\x65\x7b\x48" + "\xe3\x3f\x5f\xb6\x09\x04\x0c\x58" + "\xce\x99\xa9\x15\x2f\x4e\xc1\xf2" + "\x24\x48\xc0\xd8\x6c\xd3\x76\x17" + "\x83\x5d\xe6\xe3\xfd\x01\x8e\xf7" + "\x42\xa5\x04\x29\x30\xdf\xf9\x00" + "\x4a\xdc\x71\x22\x1a\x33\x15\xb6" + "\xd7\x72\xfb\x9a\xb8\xeb\x2b\x38" + "\xea\xa8\x61\xa8\x90\x11\x9d\x73" + "\x2e\x6c\xce\x81\x54\x5a\x9f\xcd" + "\xcf\xd5\xbd\x26\x5d\x66\xdb\xfb" + "\xdc\x1e\x7c\x10\xfe\x58\x82\x10" + "\x16\x24\x01\xce\x67\x55\x51\xd1" + "\xdd\x6b\x44\xa3\x20\x8e\xa9\xa6" + "\x06\xa8\x29\x77\x6e\x00\x38\x5b" + "\xde\x4d\x58\xd8\x1f\x34\xdf\xf9" + "\x2c\xac\x3e\xad\xfb\x92\x0d\x72" + "\x39\xa4\xac\x44\x10\xc0\x43\xc4" + "\xa4\x77\x3b\xfc\xc4\x0d\x37\xd3" + "\x05\x84\xda\x53\x71\xf8\x80\xd3" + "\x34\x44\xdb\x09\xb4\x2b\x8e\xe3" + "\x00\x75\x50\x9e\x43\x22\x00\x0b" + "\x7c\x70\xab\xd4\x41\xf1\x93\xcd" + "\x25\x2d\x84\x74\xb5\xf2\x92\xcd" + "\x0a\x28\xea\x9a\x49\x02\x96\xcb" + "\x85\x9e\x2f\x33\x03\x86\x1d\xdc" + "\x1d\x31\xd5\xfc\x9d\xaa\xc5\xe9" + "\x9a\xc4\x57\xf5\x35\xed\xf4\x4b" + "\x3d\x34\xc2\x29\x13\x86\x36\x42" + "\x5d\xbf\x90\x86\x13\x77\xe5\xc3" + "\x62\xb4\xfe\x0b\x70\x39\x35\x65" + "\x02\xea\xf6\xce\x57\x0c\xbb\x74" + "\x29\xe3\xfd\x60\x90\xfd\x10\x38" + "\xd5\x4e\x86\xbd\x37\x70\xf0\x97" + "\xa6\xab\x3b\x83\x64\x52\xca\x66" + "\x2f\xf9\xa4\xca\x3a\x55\x6b\xb0" + "\xe8\x3a\x34\xdb\x9e\x48\x50\x2f" + "\x3b\xef\xfd\x08\x2d\x5f\xc1\x37" + "\x5d\xbe\x73\xe4\xd8\xe9\xac\xca" + "\x8a\xaa\x48\x7c\x5c\xf4\xa6\x96" + "\x5f\xfa\x70\xa6\xb7\x8b\x50\xcb" + "\xa6\xf5\xa9\xbd\x7b\x75\x4c\x22" + "\x0b\x19\x40\x2e\xc9\x39\x39\x32" + "\x83\x03\xa8\xa4\x98\xe6\x8e\x16" + "\xb9\xde\x08\xc5\xfc\xbf\xad\x39" + "\xa8\xc7\x93\x6c\x6f\x23\xaf\xc1" + "\xab\xe1\xdf\xbb\x39\xae\x93\x29" + "\x0e\x7d\x80\x8d\x3e\x65\xf3\xfd" + "\x96\x06\x65\x90\xa1\x28\x64\x4b" + "\x69\xf9\xa8\x84\x27\x50\xfc\x87" + "\xf7\xbf\x55\x8e\x56\x13\x58\x7b" + "\x85\xb4\x6a\x72\x0f\x40\xf1\x4f" + "\x83\x81\x1f\x76\xde\x15\x64\x7a" + "\x7a\x80\xe4\xc7\x5e\x63\x01\x91" + "\xd7\x6b\xea\x0b\x9b\xa2\x99\x3b" + "\x6c\x88\xd8\xfd\x59\x3c\x8d\x22" + "\x86\x56\xbe\xab\xa1\x37\x08\x01" + "\x50\x85\x69\x29\xee\x9f\xdf\x21" + "\x3e\x20\x20\xf5\xb0\xbb\x6b\xd0" + "\x9c\x41\x38\xec\x54\x6f\x2d\xbd" + "\x0f\xe1\xbd\xf1\x2b\x6e\x60\x56" + "\x29\xe5\x7a\x70\x1c\xe2\xfc\x97" + "\x82\x68\x67\xd9\x3d\x1f\xfb\xd8" + "\x07\x9f\xbf\x96\x74\xba\x6a\x0e" + "\x10\x48\x20\xd8\x13\x1e\xb5\x44" + "\xf2\xcc\xb1\x8b\xfb\xbb\xec\xd7" + "\x37\x70\x1f\x7c\x55\xd2\x4b\xb9" + "\xfd\x70\x5e\xa3\x91\x73\x63\x52" + "\x13\x47\x5a\x06\xfb\x01\x67\xa5" + "\xc0\xd0\x49\x19\x56\x66\x9a\x77" + "\x64\xaf\x8c\x25\x91\x52\x87\x0e" + "\x18\xf3\x5f\x97\xfd\x71\x13\xf8" + "\x05\xa5\x39\xcc\x65\xd3\xcc\x63" + "\x5b\xdb\x5f\x7e\x5f\x6e\xad\xc4" + "\xf4\xa0\xc5\xc2\x2b\x4d\x97\x38" + "\x4f\xbc\xfa\x33\x17\xb4\x47\xb9" + "\x43\x24\x15\x8d\xd2\xed\x80\x68" + "\x84\xdb\x04\x80\xca\x5e\x6a\x35" + "\x2c\x2c\xe7\xc5\x03\x5f\x54\xb0" + "\x5e\x4f\x1d\x40\x54\x3d\x78\x9a" + "\xac\xda\x80\x27\x4d\x15\x4c\x1a" + "\x6e\x80\xc9\xc4\x3b\x84\x0e\xd9" + "\x2e\x93\x01\x8c\xc3\xc8\x91\x4b" + "\xb3\xaa\x07\x04\x68\x5b\x93\xa5" + "\xe7\xc4\x9d\xe7\x07\xee\xf5\x3b" + "\x40\x89\xcc\x60\x34\x9d\xb4\x06" + "\x1b\xef\x92\xe6\xc1\x2a\x7d\x0f" + "\x81\xaa\x56\xe3\xd7\xed\xa7\xd4" + "\xa7\x3a\x49\xc4\xad\x81\x5c\x83" + "\x55\x8e\x91\x54\xb7\x7d\x65\xa5" + "\x06\x16\xd5\x9a\x16\xc1\xb0\xa2" + "\x06\xd8\x98\x47\x73\x7e\x73\xa0" + "\xb8\x23\xb1\x52\xbf\x68\x74\x5d" + "\x0b\xcb\xfa\x8c\x46\xe3\x24\xe6" + "\xab\xd4\x69\x8d\x8c\xf2\x8a\x59" + "\xbe\x48\x46\x50\x8c\x9a\xe8\xe3" + "\x31\x55\x0a\x06\xed\x4f\xf8\xb7" + "\x4f\xe3\x85\x17\x30\xbd\xd5\x20" + "\xe7\x5b\xb2\x32\xcf\x6b\x16\x44" + "\xd2\xf5\x7e\xd7\xd1\x2f\xee\x64" + "\x3e\x9d\x10\xef\x27\x35\x43\x64" + "\x67\xfb\x7a\x7b\xe0\x62\x31\x9a" + "\x4d\xdf\xa5\xab\xc0\x20\xbb\x01" + "\xe9\x7b\x54\xf1\xde\xb2\x79\x50" + "\x6c\x4b\x91\xdb\x7f\xbb\x50\xc1" + "\x55\x44\x38\x9a\xe0\x9f\xe8\x29" + "\x6f\x15\xf8\x4e\xa6\xec\xa0\x60", + .ksize = 1088, + .plaintext = "\x15\x68\x9e\x2f\xad\x15\x52\xdf" + "\xf0\x42\x62\x24\x2a\x2d\xea\xbf" + "\xc7\xf3\xb4\x1a\xf5\xed\xb2\x08" + "\x15\x60\x1c\x00\x77\xbf\x0b\x0e" + "\xb7\x2c\xcf\x32\x3a\xc7\x01\x77" + "\xef\xa6\x75\xd0\x29\xc7\x68\x20" + "\xb2\x92\x25\xbf\x12\x34\xe9\xa4" + "\xfd\x32\x7b\x3f\x7c\xbd\xa5\x02" + "\x38\x41\xde\xc9\xc1\x09\xd9\xfc" + "\x6e\x78\x22\x83\x18\xf7\x50\x8d" + "\x8f\x9c\x2d\x02\xa5\x30\xac\xff" + "\xea\x63\x2e\x80\x37\x83\xb0\x58" + "\xda\x2f\xef\x21\x55\xba\x7b\xb1" + "\xb6\xed\xf5\xd2\x4d\xaa\x8c\xa9" + "\xdd\xdb\x0f\xb4\xce\xc1\x9a\xb1" + "\xc1\xdc\xbd\xab\x86\xc2\xdf\x0b" + "\xe1\x2c\xf9\xbe\xf6\xd8\xda\x62" + "\x72\xdd\x98\x09\x52\xc0\xc4\xb6" + "\x7b\x17\x5c\xf5\xd8\x4b\x88\xd6" + "\x6b\xbf\x84\x4a\x3f\xf5\x4d\xd2" + "\x94\xe2\x9c\xff\xc7\x3c\xd9\xc8" + "\x37\x38\xbc\x8c\xf3\xe7\xb7\xd0" + "\x1d\x78\xc4\x39\x07\xc8\x5e\x79" + "\xb6\x5a\x90\x5b\x6e\x97\xc9\xd4" + "\x82\x9c\xf3\x83\x7a\xe7\x97\xfc" + "\x1d\xbb\xef\xdb\xce\xe0\x82\xad" + "\xca\x07\x6c\x54\x62\x6f\x81\xe6" + "\x7a\x5a\x96\x6e\x80\x3a\xa2\x37" + "\x6f\xc6\xa4\x29\xc3\x9e\x19\x94" + "\x9f\xb0\x3e\x38\xfb\x3c\x2b\x7d" + "\xaa\xb8\x74\xda\x54\x23\x51\x12" + "\x4b\x96\x36\x8f\x91\x4f\x19\x37" + "\x83\xc9\xdd\xc7\x1a\x32\x2d\xab" + "\xc7\x89\xe2\x07\x47\x6c\xe8\xa6" + "\x70\x6b\x8e\x0c\xda\x5c\x6a\x59" + "\x27\x33\x0e\xe1\xe1\x20\xe8\xc8" + "\xae\xdc\xd0\xe3\x6d\xa8\xa6\x06" + "\x41\xb4\xd4\xd4\xcf\x91\x3e\x06" + "\xb0\x9a\xf7\xf1\xaa\xa6\x23\x92" + "\x10\x86\xf0\x94\xd1\x7c\x2e\x07" + "\x30\xfb\xc5\xd8\xf3\x12\xa9\xe8" + "\x22\x1c\x97\x1a\xad\x96\xb0\xa1" + "\x72\x6a\x6b\xb4\xfd\xf7\xe8\xfa" + "\xe2\x74\xd8\x65\x8d\x35\x17\x4b" + "\x00\x23\x5c\x8c\x70\xad\x71\xa2" + "\xca\xc5\x6c\x59\xbf\xb4\xc0\x6d" + "\x86\x98\x3e\x19\x5a\x90\x92\xb1" + "\x66\x57\x6a\x91\x68\x7c\xbc\xf3" + "\xf1\xdb\x94\xf8\x48\xf1\x36\xd8" + "\x78\xac\x1c\xa9\xcc\xd6\x27\xba" + "\x91\x54\x22\xf5\xe6\x05\x3f\xcc" + "\xc2\x8f\x2c\x3b\x2b\xc3\x2b\x2b" + "\x3b\xb8\xb6\x29\xb7\x2f\x94\xb6" + "\x7b\xfc\x94\x3e\xd0\x7a\x41\x59" + "\x7b\x1f\x9a\x09\xa6\xed\x4a\x82" + "\x9d\x34\x1c\xbd\x4e\x1c\x3a\x66" + "\x80\x74\x0e\x9a\x4f\x55\x54\x47" + "\x16\xba\x2a\x0a\x03\x35\x99\xa3" + "\x5c\x63\x8d\xa2\x72\x8b\x17\x15" + "\x68\x39\x73\xeb\xec\xf2\xe8\xf5" + "\x95\x32\x27\xd6\xc4\xfe\xb0\x51" + "\xd5\x0c\x50\xc5\xcd\x6d\x16\xb3" + "\xa3\x1e\x95\x69\xad\x78\x95\x06" + "\xb9\x46\xf2\x6d\x24\x5a\x99\x76" + "\x73\x6a\x91\xa6\xac\x12\xe1\x28" + "\x79\xbc\x08\x4e\x97\x00\x98\x63" + "\x07\x1c\x4e\xd1\x68\xf3\xb3\x81" + "\xa8\xa6\x5f\xf1\x01\xc9\xc1\xaf" + "\x3a\x96\xf9\x9d\xb5\x5a\x5f\x8f" + "\x7e\xc1\x7e\x77\x0a\x40\xc8\x8e" + "\xfc\x0e\xed\xe1\x0d\xb0\xe5\x5e" + "\x5e\x6f\xf5\x7f\xab\x33\x7d\xcd" + "\xf0\x09\x4b\xb2\x11\x37\xdc\x65" + "\x97\x32\x62\x71\x3a\x29\x54\xb9" + "\xc7\xa4\xbf\x75\x0f\xf9\x40\xa9" + "\x8d\xd7\x8b\xa7\xe0\x9a\xbe\x15" + "\xc6\xda\xd8\x00\x14\x69\x1a\xaf" + "\x5f\x79\xc3\xf5\xbb\x6c\x2a\x9d" + "\xdd\x3c\x5f\x97\x21\xe1\x3a\x03" + "\x84\x6a\xe9\x76\x11\x1f\xd3\xd5" + "\xf0\x54\x20\x4d\xc2\x91\xc3\xa4" + "\x36\x25\xbe\x1b\x2a\x06\xb7\xf3" + "\xd1\xd0\x55\x29\x81\x4c\x83\xa3" + "\xa6\x84\x1e\x5c\xd1\xd0\x6c\x90" + "\xa4\x11\xf0\xd7\x63\x6a\x48\x05" + "\xbc\x48\x18\x53\xcd\xb0\x8d\xdb" + "\xdc\xfe\x55\x11\x5c\x51\xb3\xab" + "\xab\x63\x3e\x31\x5a\x8b\x93\x63" + "\x34\xa9\xba\x2b\x69\x1a\xc0\xe3" + "\xcb\x41\xbc\xd7\xf5\x7f\x82\x3e" + "\x01\xa3\x3c\x72\xf4\xfe\xdf\xbe" + "\xb1\x67\x17\x2b\x37\x60\x0d\xca" + "\x6f\xc3\x94\x2c\xd2\x92\x6d\x9d" + "\x75\x18\x77\xaa\x29\x38\x96\xed" + "\x0e\x20\x70\x92\xd5\xd0\xb4\x00" + "\xc0\x31\xf2\xc9\x43\x0e\x75\x1d" + "\x4b\x64\xf2\x1f\xf2\x29\x6c\x7b" + "\x7f\xec\x59\x7d\x8c\x0d\xd4\xd3" + "\xac\x53\x4c\xa3\xde\x42\x92\x95" + "\x6d\xa3\x4f\xd0\xe6\x3d\xe7\xec" + "\x7a\x4d\x68\xf1\xfe\x67\x66\x09" + "\x83\x22\xb1\x98\x43\x8c\xab\xb8" + "\x45\xe6\x6d\xdf\x5e\x50\x71\xce" + "\xf5\x4e\x40\x93\x2b\xfa\x86\x0e" + "\xe8\x30\xbd\x82\xcc\x1c\x9c\x5f" + "\xad\xfd\x08\x31\xbe\x52\xe7\xe6" + "\xf2\x06\x01\x62\x25\x15\x99\x74" + "\x33\x51\x52\x57\x3f\x57\x87\x61" + "\xb9\x7f\x29\x3d\xcd\x92\x5e\xa6" + "\x5c\x3b\xf1\xed\x5f\xeb\x82\xed" + "\x56\x7b\x61\xe7\xfd\x02\x47\x0e" + "\x2a\x15\xa4\xce\x43\x86\x9b\xe1" + "\x2b\x4c\x2a\xd9\x42\x97\xf7\x9a" + "\xe5\x47\x46\x48\xd3\x55\x6f\x4d" + "\xd9\xeb\x4b\xdd\x7b\x21\x2f\xb3" + "\xa8\x36\x28\xdf\xca\xf1\xf6\xd9" + "\x10\xf6\x1c\xfd\x2e\x0c\x27\xe0" + "\x01\xb3\xff\x6d\x47\x08\x4d\xd4" + "\x00\x25\xee\x55\x4a\xe9\xe8\x5b" + "\xd8\xf7\x56\x12\xd4\x50\xb2\xe5" + "\x51\x6f\x34\x63\x69\xd2\x4e\x96" + "\x4e\xbc\x79\xbf\x18\xae\xc6\x13" + "\x80\x92\x77\xb0\xb4\x0f\x29\x94" + "\x6f\x4c\xbb\x53\x11\x36\xc3\x9f" + "\x42\x8e\x96\x8a\x91\xc8\xe9\xfc" + "\xfe\xbf\x7c\x2d\x6f\xf9\xb8\x44" + "\x89\x1b\x09\x53\x0a\x2a\x92\xc3" + "\x54\x7a\x3a\xf9\xe2\xe4\x75\x87" + "\xa0\x5e\x4b\x03\x7a\x0d\x8a\xf4" + "\x55\x59\x94\x2b\x63\x96\x0e\xf5", + .psize = 1040, + .digest = "\xb5\xb9\x08\xb3\x24\x3e\x03\xf0" + "\xd6\x0b\x57\xbc\x0a\x6d\x89\x59", + }, { + .key = "\xf6\x34\x42\x71\x35\x52\x8b\x58" + "\x02\x3a\x8e\x4a\x8d\x41\x13\xe9" + "\x7f\xba\xb9\x55\x9d\x73\x4d\xf8" + "\x3f\x5d\x73\x15\xff\xd3\x9e\x7f" + "\x20\x2a\x6a\xa8\xd1\xf0\x8f\x12" + "\x6b\x02\xd8\x6c\xde\xba\x80\x22" + "\x19\x37\xc8\xd0\x4e\x89\x17\x7c" + "\x7c\xdd\x88\xfd\x41\xc0\x04\xb7" + "\x1d\xac\x19\xe3\x20\xc7\x16\xcf" + "\x58\xee\x1d\x7a\x61\x69\xa9\x12" + "\x4b\xef\x4f\xb6\x38\xdd\x78\xf8" + "\x28\xee\x70\x08\xc7\x7c\xcc\xc8" + "\x1e\x41\xf5\x80\x86\x70\xd0\xf0" + "\xa3\x87\x6b\x0a\x00\xd2\x41\x28" + "\x74\x26\xf1\x24\xf3\xd0\x28\x77" + "\xd7\xcd\xf6\x2d\x61\xf4\xa2\x13" + "\x77\xb4\x6f\xa0\xf4\xfb\xd6\xb5" + "\x38\x9d\x5a\x0c\x51\xaf\xad\x63" + "\x27\x67\x8c\x01\xea\x42\x1a\x66" + "\xda\x16\x7c\x3c\x30\x0c\x66\x53" + "\x1c\x88\xa4\x5c\xb2\xe3\x78\x0a" + "\x13\x05\x6d\xe2\xaf\xb3\xe4\x75" + "\x00\x99\x58\xee\x76\x09\x64\xaa" + "\xbb\x2e\xb1\x81\xec\xd8\x0e\xd3" + "\x0c\x33\x5d\xb7\x98\xef\x36\xb6" + "\xd2\x65\x69\x41\x70\x12\xdc\x25" + "\x41\x03\x99\x81\x41\x19\x62\x13" + "\xd1\x0a\x29\xc5\x8c\xe0\x4c\xf3" + "\xd6\xef\x4c\xf4\x1d\x83\x2e\x6d" + "\x8e\x14\x87\xed\x80\xe0\xaa\xd3" + "\x08\x04\x73\x1a\x84\x40\xf5\x64" + "\xbd\x61\x32\x65\x40\x42\xfb\xb0" + "\x40\xf6\x40\x8d\xc7\x7f\x14\xd0" + "\x83\x99\xaa\x36\x7e\x60\xc6\xbf" + "\x13\x8a\xf9\x21\xe4\x7e\x68\x87" + "\xf3\x33\x86\xb4\xe0\x23\x7e\x0a" + "\x21\xb1\xf5\xad\x67\x3c\x9c\x9d" + "\x09\xab\xaf\x5f\xba\xe0\xd0\x82" + "\x48\x22\x70\xb5\x6d\x53\xd6\x0e" + "\xde\x64\x92\x41\xb0\xd3\xfb\xda" + "\x21\xfe\xab\xea\x20\xc4\x03\x58" + "\x18\x2e\x7d\x2f\x03\xa9\x47\x66" + "\xdf\x7b\xa4\x6b\x34\x6b\x55\x9c" + "\x4f\xd7\x9c\x47\xfb\xa9\x42\xec" + "\x5a\x12\xfd\xfe\x76\xa0\x92\x9d" + "\xfe\x1e\x16\xdd\x24\x2a\xe4\x27" + "\xd5\xa9\xf2\x05\x4f\x83\xa2\xaf" + "\xfe\xee\x83\x7a\xad\xde\xdf\x9a" + "\x80\xd5\x81\x14\x93\x16\x7e\x46" + "\x47\xc2\x14\xef\x49\x6e\xb9\xdb" + "\x40\xe8\x06\x6f\x9c\x2a\xfd\x62" + "\x06\x46\xfd\x15\x1d\x36\x61\x6f" + "\x77\x77\x5e\x64\xce\x78\x1b\x85" + "\xbf\x50\x9a\xfd\x67\xa6\x1a\x65" + "\xad\x5b\x33\x30\xf1\x71\xaa\xd9" + "\x23\x0d\x92\x24\x5f\xae\x57\xb0" + "\x24\x37\x0a\x94\x12\xfb\xb5\xb1" + "\xd3\xb8\x1d\x12\x29\xb0\x80\x24" + "\x2d\x47\x9f\x96\x1f\x95\xf1\xb1" + "\xda\x35\xf6\x29\xe0\xe1\x23\x96" + "\xc7\xe8\x22\x9b\x7c\xac\xf9\x41" + "\x39\x01\xe5\x73\x15\x5e\x99\xec" + "\xb4\xc1\xf4\xe7\xa7\x97\x6a\xd5" + "\x90\x9a\xa0\x1d\xf3\x5a\x8b\x5f" + "\xdf\x01\x52\xa4\x93\x31\x97\xb0" + "\x93\x24\xb5\xbc\xb2\x14\x24\x98" + "\x4a\x8f\x19\x85\xc3\x2d\x0f\x74" + "\x9d\x16\x13\x80\x5e\x59\x62\x62" + "\x25\xe0\xd1\x2f\x64\xef\xba\xac" + "\xcd\x09\x07\x15\x8a\xcf\x73\xb5" + "\x8b\xc9\xd8\x24\xb0\x53\xd5\x6f" + "\xe1\x2b\x77\xb1\xc5\xe4\xa7\x0e" + "\x18\x45\xab\x36\x03\x59\xa8\xbd" + "\x43\xf0\xd8\x2c\x1a\x69\x96\xbb" + "\x13\xdf\x6c\x33\x77\xdf\x25\x34" + "\x5b\xa5\x5b\x8c\xf9\x51\x05\xd4" + "\x8b\x8b\x44\x87\x49\xfc\xa0\x8f" + "\x45\x15\x5b\x40\x42\xc4\x09\x92" + "\x98\x0c\x4d\xf4\x26\x37\x1b\x13" + "\x76\x01\x93\x8d\x4f\xe6\xed\x18" + "\xd0\x79\x7b\x3f\x44\x50\xcb\xee" + "\xf7\x4a\xc9\x9e\xe0\x96\x74\xa7" + "\xe6\x93\xb2\x53\xca\x55\xa8\xdc" + "\x1e\x68\x07\x87\xb7\x2e\xc1\x08" + "\xb2\xa4\x5b\xaf\xc6\xdb\x5c\x66" + "\x41\x1c\x51\xd9\xb0\x07\x00\x0d" + "\xf0\x4c\xdc\x93\xde\xa9\x1e\x8e" + "\xd3\x22\x62\xd8\x8b\x88\x2c\xea" + "\x5e\xf1\x6e\x14\x40\xc7\xbe\xaa" + "\x42\x28\xd0\x26\x30\x78\x01\x9b" + "\x83\x07\xbc\x94\xc7\x57\xa2\x9f" + "\x03\x07\xff\x16\xff\x3c\x6e\x48" + "\x0a\xd0\xdd\x4c\xf6\x64\x9a\xf1" + "\xcd\x30\x12\x82\x2c\x38\xd3\x26" + "\x83\xdb\xab\x3e\xc6\xf8\xe6\xfa" + "\x77\x0a\x78\x82\x75\xf8\x63\x51" + "\x59\xd0\x8d\x24\x9f\x25\xe6\xa3" + "\x4c\xbc\x34\xfc\xe3\x10\xc7\x62" + "\xd4\x23\xc8\x3d\xa7\xc6\xa6\x0a" + "\x4f\x7e\x29\x9d\x6d\xbe\xb5\xf1" + "\xdf\xa4\x53\xfa\xc0\x23\x0f\x37" + "\x84\x68\xd0\xb5\xc8\xc6\xae\xf8" + "\xb7\x8d\xb3\x16\xfe\x8f\x87\xad" + "\xd0\xc1\x08\xee\x12\x1c\x9b\x1d" + "\x90\xf8\xd1\x63\xa4\x92\x3c\xf0" + "\xc7\x34\xd8\xf1\x14\xed\xa3\xbc" + "\x17\x7e\xd4\x62\x42\x54\x57\x2c" + "\x3e\x7a\x35\x35\x17\x0f\x0b\x7f" + "\x81\xa1\x3f\xd0\xcd\xc8\x3b\x96" + "\xe9\xe0\x4a\x04\xe1\xb6\x3c\xa1" + "\xd6\xca\xc4\xbd\xb6\xb5\x95\x34" + "\x12\x9d\xc5\x96\xf2\xdf\xba\x54" + "\x76\xd1\xb2\x6b\x3b\x39\xe0\xb9" + "\x18\x62\xfb\xf7\xfc\x12\xf1\x5f" + "\x7e\xc7\xe3\x59\x4c\xa6\xc2\x3d" + "\x40\x15\xf9\xa3\x95\x64\x4c\x74" + "\x8b\x73\x77\x33\x07\xa7\x04\x1d" + "\x33\x5a\x7e\x8f\xbd\x86\x01\x4f" + "\x3e\xb9\x27\x6f\xe2\x41\xf7\x09" + "\x67\xfd\x29\x28\xc5\xe4\xf6\x18" + "\x4c\x1b\x49\xb2\x9c\x5b\xf6\x81" + "\x4f\xbb\x5c\xcc\x0b\xdf\x84\x23" + "\x58\xd6\x28\x34\x93\x3a\x25\x97" + "\xdf\xb2\xc3\x9e\x97\x38\x0b\x7d" + "\x10\xb3\x54\x35\x23\x8c\x64\xee" + "\xf0\xd8\x66\xff\x8b\x22\xd2\x5b" + "\x05\x16\x3c\x89\xf7\xb1\x75\xaf" + "\xc0\xae\x6a\x4f\x3f\xaf\x9a\xf4" + "\xf4\x9a\x24\xd9\x80\x82\xc0\x12" + "\xde\x96\xd1\xbe\x15\x0b\x8d\x6a" + "\xd7\x12\xe4\x85\x9f\x83\xc9\xc3" + "\xff\x0b\xb5\xaf\x3b\xd8\x6d\x67" + "\x81\x45\xe6\xac\xec\xc1\x7b\x16" + "\x18\x0a\xce\x4b\xc0\x2e\x76\xbc" + "\x1b\xfa\xb4\x34\xb8\xfc\x3e\xc8" + "\x5d\x90\x71\x6d\x7a\x79\xef\x06", + .ksize = 1088, + .plaintext = "\xaa\x5d\x54\xcb\xea\x1e\x46\x0f" + "\x45\x87\x70\x51\x8a\x66\x7a\x33" + "\xb4\x18\xff\xa9\x82\xf9\x45\x4b" + "\x93\xae\x2e\x7f\xab\x98\xfe\xbf" + "\x01\xee\xe5\xa0\x37\x8f\x57\xa6" + "\xb0\x76\x0d\xa4\xd6\x28\x2b\x5d" + "\xe1\x03\xd6\x1c\x6f\x34\x0d\xe7" + "\x61\x2d\x2e\xe5\xae\x5d\x47\xc7" + "\x80\x4b\x18\x8f\xa8\x99\xbc\x28" + "\xed\x1d\x9d\x86\x7d\xd7\x41\xd1" + "\xe0\x2b\xe1\x8c\x93\x2a\xa7\x80" + "\xe1\x07\xa0\xa9\x9f\x8c\x8d\x1a" + "\x55\xfc\x6b\x24\x7a\xbd\x3e\x51" + "\x68\x4b\x26\x59\xc8\xa7\x16\xd9" + "\xb9\x61\x13\xde\x8b\x63\x1c\xf6" + "\x60\x01\xfb\x08\xb3\x5b\x0a\xbf" + "\x34\x73\xda\x87\x87\x3d\x6f\x97" + "\x4a\x0c\xa3\x58\x20\xa2\xc0\x81" + "\x5b\x8c\xef\xa9\xc2\x01\x1e\x64" + "\x83\x8c\xbc\x03\xb6\xd0\x29\x9f" + "\x54\xe2\xce\x8b\xc2\x07\x85\x78" + "\x25\x38\x96\x4c\xb4\xbe\x17\x4a" + "\x65\xa6\xfa\x52\x9d\x66\x9d\x65" + "\x4a\xd1\x01\x01\xf0\xcb\x13\xcc" + "\xa5\x82\xf3\xf2\x66\xcd\x3f\x9d" + "\xd1\xaa\xe4\x67\xea\xf2\xad\x88" + "\x56\x76\xa7\x9b\x59\x3c\xb1\x5d" + "\x78\xfd\x69\x79\x74\x78\x43\x26" + "\x7b\xde\x3f\xf1\xf5\x4e\x14\xd9" + "\x15\xf5\x75\xb5\x2e\x19\xf3\x0c" + "\x48\x72\xd6\x71\x6d\x03\x6e\xaa" + "\xa7\x08\xf9\xaa\x70\xa3\x0f\x4d" + "\x12\x8a\xdd\xe3\x39\x73\x7e\xa7" + "\xea\x1f\x6d\x06\x26\x2a\xf2\xc5" + "\x52\xb4\xbf\xfd\x52\x0c\x06\x60" + "\x90\xd1\xb2\x7b\x56\xae\xac\x58" + "\x5a\x6b\x50\x2a\xf5\xe0\x30\x3c" + "\x2a\x98\x0f\x1b\x5b\x0a\x84\x6c" + "\x31\xae\x92\xe2\xd4\xbb\x7f\x59" + "\x26\x10\xb9\x89\x37\x68\x26\xbf" + "\x41\xc8\x49\xc4\x70\x35\x7d\xff" + "\x2d\x7f\xf6\x8a\x93\x68\x8c\x78" + "\x0d\x53\xce\x7d\xff\x7d\xfb\xae" + "\x13\x1b\x75\xc4\x78\xd7\x71\xd8" + "\xea\xd3\xf4\x9d\x95\x64\x8e\xb4" + "\xde\xb8\xe4\xa6\x68\xc8\xae\x73" + "\x58\xaf\xa8\xb0\x5a\x20\xde\x87" + "\x43\xb9\x0f\xe3\xad\x41\x4b\xd5" + "\xb7\xad\x16\x00\xa6\xff\xf6\x74" + "\xbf\x8c\x9f\xb3\x58\x1b\xb6\x55" + "\xa9\x90\x56\x28\xf0\xb5\x13\x4e" + "\x9e\xf7\x25\x86\xe0\x07\x7b\x98" + "\xd8\x60\x5d\x38\x95\x3c\xe4\x22" + "\x16\x2f\xb2\xa2\xaf\xe8\x90\x17" + "\xec\x11\x83\x1a\xf4\xa9\x26\xda" + "\x39\x72\xf5\x94\x61\x05\x51\xec" + "\xa8\x30\x8b\x2c\x13\xd0\x72\xac" + "\xb9\xd2\xa0\x4c\x4b\x78\xe8\x6e" + "\x04\x85\xe9\x04\x49\x82\x91\xff" + "\x89\xe5\xab\x4c\xaa\x37\x03\x12" + "\xca\x8b\x74\x10\xfd\x9e\xd9\x7b" + "\xcb\xdb\x82\x6e\xce\x2e\x33\x39" + "\xce\xd2\x84\x6e\x34\x71\x51\x6e" + "\x0d\xd6\x01\x87\xc7\xfa\x0a\xd3" + "\xad\x36\xf3\x4c\x9f\x96\x5e\x62" + "\x62\x54\xc3\x03\x78\xd6\xab\xdd" + "\x89\x73\x55\x25\x30\xf8\xa7\xe6" + "\x4f\x11\x0c\x7c\x0a\xa1\x2b\x7b" + "\x3d\x0d\xde\x81\xd4\x9d\x0b\xae" + "\xdf\x00\xf9\x4c\xb6\x90\x8e\x16" + "\xcb\x11\xc8\xd1\x2e\x73\x13\x75" + "\x75\x3e\xaa\xf5\xee\x02\xb3\x18" + "\xa6\x2d\xf5\x3b\x51\xd1\x1f\x47" + "\x6b\x2c\xdb\xc4\x10\xe0\xc8\xba" + "\x9d\xac\xb1\x9d\x75\xd5\x41\x0e" + "\x7e\xbe\x18\x5b\xa4\x1f\xf8\x22" + "\x4c\xc1\x68\xda\x6d\x51\x34\x6c" + "\x19\x59\xec\xb5\xb1\xec\xa7\x03" + "\xca\x54\x99\x63\x05\x6c\xb1\xac" + "\x9c\x31\xd6\xdb\xba\x7b\x14\x12" + "\x7a\xc3\x2f\xbf\x8d\xdc\x37\x46" + "\xdb\xd2\xbc\xd4\x2f\xab\x30\xd5" + "\xed\x34\x99\x8e\x83\x3e\xbe\x4c" + "\x86\x79\x58\xe0\x33\x8d\x9a\xb8" + "\xa9\xa6\x90\x46\xa2\x02\xb8\xdd" + "\xf5\xf9\x1a\x5c\x8c\x01\xaa\x6e" + "\xb4\x22\x12\xf5\x0c\x1b\x9b\x7a" + "\xc3\x80\xf3\x06\x00\x5f\x30\xd5" + "\x06\xdb\x7d\x82\xc2\xd4\x0b\x4c" + "\x5f\xe9\xc5\xf5\xdf\x97\x12\xbf" + "\x56\xaf\x9b\x69\xcd\xee\x30\xb4" + "\xa8\x71\xff\x3e\x7d\x73\x7a\xb4" + "\x0d\xa5\x46\x7a\xf3\xf4\x15\x87" + "\x5d\x93\x2b\x8c\x37\x64\xb5\xdd" + "\x48\xd1\xe5\x8c\xae\xd4\xf1\x76" + "\xda\xf4\xba\x9e\x25\x0e\xad\xa3" + "\x0d\x08\x7c\xa8\x82\x16\x8d\x90" + "\x56\x40\x16\x84\xe7\x22\x53\x3a" + "\x58\xbc\xb9\x8f\x33\xc8\xc2\x84" + "\x22\xe6\x0d\xe7\xb3\xdc\x5d\xdf" + "\xd7\x2a\x36\xe4\x16\x06\x07\xd2" + "\x97\x60\xb2\xf5\x5e\x14\xc9\xfd" + "\x8b\x05\xd1\xce\xee\x9a\x65\x99" + "\xb7\xae\x19\xb7\xc8\xbc\xd5\xa2" + "\x7b\x95\xe1\xcc\xba\x0d\xdc\x8a" + "\x1d\x59\x52\x50\xaa\x16\x02\x82" + "\xdf\x61\x33\x2e\x44\xce\x49\xc7" + "\xe5\xc6\x2e\x76\xcf\x80\x52\xf0" + "\x3d\x17\x34\x47\x3f\xd3\x80\x48" + "\xa2\xba\xd5\xc7\x7b\x02\x28\xdb" + "\xac\x44\xc7\x6e\x05\x5c\xc2\x79" + "\xb3\x7d\x6a\x47\x77\x66\xf1\x38" + "\xf0\xf5\x4f\x27\x1a\x31\xca\x6c" + "\x72\x95\x92\x8e\x3f\xb0\xec\x1d" + "\xc7\x2a\xff\x73\xee\xdf\x55\x80" + "\x93\xd2\xbd\x34\xd3\x9f\x00\x51" + "\xfb\x2e\x41\xba\x6c\x5a\x7c\x17" + "\x7f\xe6\x70\xac\x8d\x39\x3f\x77" + "\xe2\x23\xac\x8f\x72\x4e\xe4\x53" + "\xcc\xf1\x1b\xf1\x35\xfe\x52\xa4" + "\xd6\xb8\x40\x6b\xc1\xfd\xa0\xa1" + "\xf5\x46\x65\xc2\x50\xbb\x43\xe2" + "\xd1\x43\x28\x34\x74\xf5\x87\xa0" + "\xf2\x5e\x27\x3b\x59\x2b\x3e\x49" + "\xdf\x46\xee\xaf\x71\xd7\x32\x36" + "\xc7\x14\x0b\x58\x6e\x3e\x2d\x41" + "\xfa\x75\x66\x3a\x54\xe0\xb2\xb9" + "\xaf\xdd\x04\x80\x15\x19\x3f\x6f" + "\xce\x12\xb4\xd8\xe8\x89\x3c\x05" + "\x30\xeb\xf3\x3d\xcd\x27\xec\xdc" + "\x56\x70\x12\xcf\x78\x2b\x77\xbf" + "\x22\xf0\x1b\x17\x9c\xcc\xd6\x1b" + "\x2d\x3d\xa0\x3b\xd8\xc9\x70\xa4" + "\x7a\x3e\x07\xb9\x06\xc3\xfa\xb0" + "\x33\xee\xc1\xd8\xf6\xe0\xf0\xb2" + "\x61\x12\x69\xb0\x5f\x28\x99\xda" + "\xc3\x61\x48\xfa\x07\x16\x03\xc4" + "\xa8\xe1\x3c\xe8\x0e\x64\x15\x30" + "\xc1\x9d\x84\x2f\x73\x98\x0e\x3a" + "\xf2\x86\x21\xa4\x9e\x1d\xb5\x86" + "\x16\xdb\x2b\x9a\x06\x64\x8e\x79" + "\x8d\x76\x3e\xc3\xc2\x64\x44\xe3" + "\xda\xbc\x1a\x52\xd7\x61\x03\x65" + "\x54\x32\x77\x01\xed\x9d\x8a\x43" + "\x25\x24\xe3\xc1\xbe\xb8\x2f\xcb" + "\x89\x14\x64\xab\xf6\xa0\x6e\x02" + "\x57\xe4\x7d\xa9\x4e\x9a\x03\x36" + "\xad\xf1\xb1\xfc\x0b\xe6\x79\x51" + "\x9f\x81\x77\xc4\x14\x78\x9d\xbf" + "\xb6\xd6\xa3\x8c\xba\x0b\x26\xe7" + "\xc8\xb9\x5c\xcc\xe1\x5f\xd5\xc6" + "\xc4\xca\xc2\xa3\x45\xba\x94\x13" + "\xb2\x8f\xc3\x54\x01\x09\xe7\x8b" + "\xda\x2a\x0a\x11\x02\x43\xcb\x57" + "\xc9\xcc\xb5\x5c\xab\xc4\xec\x54" + "\x00\x06\x34\xe1\x6e\x03\x89\x7c" + "\xc6\xfb\x6a\xc7\x60\x43\xd6\xc5" + "\xb5\x68\x72\x89\x8f\x42\xc3\x74" + "\xbd\x25\xaa\x9f\x67\xb5\xdf\x26" + "\x20\xe8\xb7\x01\x3c\xe4\x77\xce" + "\xc4\x65\xa7\x23\x79\xea\x33\xc7" + "\x82\x14\x5c\x82\xf2\x4e\x3d\xf6" + "\xc6\x4a\x0e\x29\xbb\xec\x44\xcd" + "\x2f\xd1\x4f\x21\x71\xa9\xce\x0f" + "\x5c\xf2\x72\x5c\x08\x2e\x21\xd2" + "\xc3\x29\x13\xd8\xac\xc3\xda\x13" + "\x1a\x9d\xa7\x71\x1d\x27\x1d\x27" + "\x1d\xea\xab\x44\x79\xad\xe5\xeb" + "\xef\x1f\x22\x0a\x44\x4f\xcb\x87" + "\xa7\x58\x71\x0e\x66\xf8\x60\xbf" + "\x60\x74\x4a\xb4\xec\x2e\xfe\xd3" + "\xf5\xb8\xfe\x46\x08\x50\x99\x6c" + "\x66\xa5\xa8\x34\x44\xb5\xe5\xf0" + "\xdd\x2c\x67\x4e\x35\x96\x8e\x67" + "\x48\x3f\x5f\x37\x44\x60\x51\x2e" + "\x14\x91\x5e\x57\xc3\x0e\x79\x77" + "\x2f\x03\xf4\xe2\x1c\x72\xbf\x85" + "\x5d\xd3\x17\xdf\x6c\xc5\x70\x24" + "\x42\xdf\x51\x4e\x2a\xb2\xd2\x5b" + "\x9e\x69\x83\x41\x11\xfe\x73\x22" + "\xde\x8a\x9e\xd8\x8a\xfb\x20\x38" + "\xd8\x47\x6f\xd5\xed\x8f\x41\xfd" + "\x13\x7a\x18\x03\x7d\x0f\xcd\x7d" + "\xa6\x7d\x31\x9e\xf1\x8f\x30\xa3" + "\x8b\x4c\x24\xb7\xf5\x48\xd7\xd9" + "\x12\xe7\x84\x97\x5c\x31\x6d\xfb" + "\xdf\xf3\xd3\xd1\xd5\x0c\x30\x06" + "\x01\x6a\xbc\x6c\x78\x7b\xa6\x50" + "\xfa\x0f\x3c\x42\x2d\xa5\xa3\x3b" + "\xcf\x62\x50\xff\x71\x6d\xe7\xda" + "\x27\xab\xc6\x67\x16\x65\x68\x64" + "\xc7\xd5\x5f\x81\xa9\xf6\x65\xb3" + "\x5e\x43\x91\x16\xcd\x3d\x55\x37" + "\x55\xb3\xf0\x28\xc5\x54\x19\xc0" + "\xe0\xd6\x2a\x61\xd4\xc8\x72\x51" + "\xe9\xa1\x7b\x48\x21\xad\x44\x09" + "\xe4\x01\x61\x3c\x8a\x5b\xf9\xa1" + "\x6e\x1b\xdf\xc0\x04\xa8\x8b\xf2" + "\x21\xbe\x34\x7b\xfc\xa1\xcd\xc9" + "\xa9\x96\xf4\xa4\x4c\xf7\x4e\x8f" + "\x84\xcc\xd3\xa8\x92\x77\x8f\x36" + "\xe2\x2e\x8c\x33\xe8\x84\xa6\x0c" + "\x6c\x8a\xda\x14\x32\xc2\x96\xff" + "\xc6\x4a\xc2\x9b\x30\x7f\xd1\x29" + "\xc0\xd5\x78\x41\x00\x80\x80\x03" + "\x2a\xb1\xde\x26\x03\x48\x49\xee" + "\x57\x14\x76\x51\x3c\x36\x5d\x0a" + "\x5c\x9f\xe8\xd8\x53\xdb\x4f\xd4" + "\x38\xbf\x66\xc9\x75\x12\x18\x75" + "\x34\x2d\x93\x22\x96\x51\x24\x6e" + "\x4e\xd9\x30\xea\x67\xff\x92\x1c" + "\x16\x26\xe9\xb5\x33\xab\x8c\x22" + "\x47\xdb\xa0\x2c\x08\xf0\x12\x69" + "\x7e\x93\x52\xda\xa5\xe5\xca\xc1" + "\x0f\x55\x2a\xbd\x09\x30\x88\x1b" + "\x9c\xc6\x9f\xe6\xdb\xa6\x92\xeb" + "\xf4\xbd\x5c\xc4\xdb\xc6\x71\x09" + "\xab\x5e\x48\x0c\xed\x6f\xda\x8e" + "\x8d\x0c\x98\x71\x7d\x10\xd0\x9c" + "\x20\x9b\x79\x53\x26\x5d\xb9\x85" + "\x8a\x31\xb8\xc5\x1c\x97\xde\x88" + "\x61\x55\x7f\x7c\x21\x06\xea\xc4" + "\x5f\xaf\xf2\xf0\xd5\x5e\x7d\xb4" + "\x6e\xcf\xe9\xae\x1b\x0e\x11\x80" + "\xc1\x9a\x74\x7e\x52\x6f\xa0\xb7" + "\x24\xcd\x8d\x0a\x11\x40\x63\x72" + "\xfa\xe2\xc5\xb3\x94\xef\x29\xa2" + "\x1a\x23\x43\x04\x37\x55\x0d\xe9" + "\x83\xb2\x29\x51\x49\x64\xa0\xbd" + "\xde\x73\xfd\xa5\x7c\x95\x70\x62" + "\x58\xdc\xe2\xd0\xbf\x98\xf5\x8a" + "\x6a\xfd\xce\xa8\x0e\x42\x2a\xeb" + "\xd2\xff\x83\x27\x53\x5c\xa0\x6e" + "\x93\xef\xe2\xb9\x5d\x35\xd6\x98" + "\xf6\x71\x19\x7a\x54\xa1\xa7\xe8" + "\x09\xfe\xf6\x9e\xc7\xbd\x3e\x29" + "\xbd\x6b\x17\xf4\xe7\x3e\x10\x5c" + "\xc1\xd2\x59\x4f\x4b\x12\x1a\x5b" + "\x50\x80\x59\xb9\xec\x13\x66\xa8" + "\xd2\x31\x7b\x6a\x61\x22\xdd\x7d" + "\x61\xee\x87\x16\x46\x9f\xf9\xc7" + "\x41\xee\x74\xf8\xd0\x96\x2c\x76" + "\x2a\xac\x7d\x6e\x9f\x0e\x7f\x95" + "\xfe\x50\x16\xb2\x23\xca\x62\xd5" + "\x68\xcf\x07\x3f\x3f\x97\x85\x2a" + "\x0c\x25\x45\xba\xdb\x32\xcb\x83" + "\x8c\x4f\xe0\x6d\x9a\x99\xf9\xc9" + "\xda\xd4\x19\x31\xc1\x7c\x6d\xd9" + "\x9c\x56\xd3\xec\xc1\x81\x4c\xed" + "\x28\x9d\x87\xeb\x19\xd7\x1a\x4f" + "\x04\x6a\xcb\x1f\xcf\x1f\xa2\x16" + "\xfc\x2a\x0d\xa1\x14\x2d\xfa\xc5" + "\x5a\xd2\xc5\xf9\x19\x7c\x20\x1f" + "\x2d\x10\xc0\x66\x7c\xd9\x2d\xe5" + "\x88\x70\x59\xa7\x85\xd5\x2e\x7c" + "\x5c\xe3\xb7\x12\xd6\x97\x3f\x29", + .psize = 2048, + .digest = "\x37\x90\x92\xc2\xeb\x01\x87\xd9" + "\x95\xc7\x91\xc3\x17\x8b\x38\x52", + } +}; + + /* * DES test vectors. */ diff --git a/include/crypto/nhpoly1305.h b/include/crypto/nhpoly1305.h new file mode 100644 index 000000000000..53c04423c582 --- /dev/null +++ b/include/crypto/nhpoly1305.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common values and helper functions for the NHPoly1305 hash function. + */ + +#ifndef _NHPOLY1305_H +#define _NHPOLY1305_H + +#include +#include + +/* NH parameterization: */ + +/* Endianness: little */ +/* Word size: 32 bits (works well on NEON, SSE2, AVX2) */ + +/* Stride: 2 words (optimal on ARM32 NEON; works okay on other CPUs too) */ +#define NH_PAIR_STRIDE 2 +#define NH_MESSAGE_UNIT (NH_PAIR_STRIDE * 2 * sizeof(u32)) + +/* Num passes (Toeplitz iteration count): 4, to give ε = 2^{-128} */ +#define NH_NUM_PASSES 4 +#define NH_HASH_BYTES (NH_NUM_PASSES * sizeof(u64)) + +/* Max message size: 1024 bytes (32x compression factor) */ +#define NH_NUM_STRIDES 64 +#define NH_MESSAGE_WORDS (NH_PAIR_STRIDE * 2 * NH_NUM_STRIDES) +#define NH_MESSAGE_BYTES (NH_MESSAGE_WORDS * sizeof(u32)) +#define NH_KEY_WORDS (NH_MESSAGE_WORDS + \ + NH_PAIR_STRIDE * 2 * (NH_NUM_PASSES - 1)) +#define NH_KEY_BYTES (NH_KEY_WORDS * sizeof(u32)) + +#define NHPOLY1305_KEY_SIZE (POLY1305_BLOCK_SIZE + NH_KEY_BYTES) + +struct nhpoly1305_key { + struct poly1305_key poly_key; + u32 nh_key[NH_KEY_WORDS]; +}; + +struct nhpoly1305_state { + + /* Running total of polynomial evaluation */ + struct poly1305_state poly_state; + + /* Partial block buffer */ + u8 buffer[NH_MESSAGE_UNIT]; + unsigned int buflen; + + /* + * Number of bytes remaining until the current NH message reaches + * NH_MESSAGE_BYTES. When nonzero, 'nh_hash' holds the partial NH hash. + */ + unsigned int nh_remaining; + + __le64 nh_hash[NH_NUM_PASSES]; +}; + +typedef void (*nh_t)(const u32 *key, const u8 *message, size_t message_len, + __le64 hash[NH_NUM_PASSES]); + +int crypto_nhpoly1305_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen); + +int crypto_nhpoly1305_init(struct shash_desc *desc); +int crypto_nhpoly1305_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen); +int crypto_nhpoly1305_update_helper(struct shash_desc *desc, + const u8 *src, unsigned int srclen, + nh_t nh_fn); +int crypto_nhpoly1305_final(struct shash_desc *desc, u8 *dst); +int crypto_nhpoly1305_final_helper(struct shash_desc *desc, u8 *dst, + nh_t nh_fn); + +#endif /* _NHPOLY1305_H */ -- cgit v1.2.3-59-g8ed1b From 16aae3595a9d41c97d983889b341c455779c2ecf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Nov 2018 17:26:30 -0800 Subject: crypto: arm/nhpoly1305 - add NEON-accelerated NHPoly1305 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an ARM NEON implementation of NHPoly1305, an ε-almost-∆-universal hash function used in the Adiantum encryption mode. For now, only the NH portion is actually NEON-accelerated; the Poly1305 part is less performance-critical so is just implemented in C. Signed-off-by: Eric Biggers Reviewed-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/Kconfig | 5 ++ arch/arm/crypto/Makefile | 2 + arch/arm/crypto/nh-neon-core.S | 116 +++++++++++++++++++++++++++++++++ arch/arm/crypto/nhpoly1305-neon-glue.c | 77 ++++++++++++++++++++++ 4 files changed, 200 insertions(+) create mode 100644 arch/arm/crypto/nh-neon-core.S create mode 100644 arch/arm/crypto/nhpoly1305-neon-glue.c diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index 59c674cf08ef..a95322b59799 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -131,4 +131,9 @@ config CRYPTO_CHACHA20_NEON select CRYPTO_BLKCIPHER select CRYPTO_CHACHA20 +config CRYPTO_NHPOLY1305_NEON + tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)" + depends on KERNEL_MODE_NEON + select CRYPTO_NHPOLY1305 + endif diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index 005482ff9504..b65d6bfab8e6 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o +obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o @@ -53,6 +54,7 @@ ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o +nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o ifdef REGENERATE_ARM_CRYPTO quiet_cmd_perl = PERL $@ diff --git a/arch/arm/crypto/nh-neon-core.S b/arch/arm/crypto/nh-neon-core.S new file mode 100644 index 000000000000..434d80ab531c --- /dev/null +++ b/arch/arm/crypto/nh-neon-core.S @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * NH - ε-almost-universal hash function, NEON accelerated version + * + * Copyright 2018 Google LLC + * + * Author: Eric Biggers + */ + +#include + + .text + .fpu neon + + KEY .req r0 + MESSAGE .req r1 + MESSAGE_LEN .req r2 + HASH .req r3 + + PASS0_SUMS .req q0 + PASS0_SUM_A .req d0 + PASS0_SUM_B .req d1 + PASS1_SUMS .req q1 + PASS1_SUM_A .req d2 + PASS1_SUM_B .req d3 + PASS2_SUMS .req q2 + PASS2_SUM_A .req d4 + PASS2_SUM_B .req d5 + PASS3_SUMS .req q3 + PASS3_SUM_A .req d6 + PASS3_SUM_B .req d7 + K0 .req q4 + K1 .req q5 + K2 .req q6 + K3 .req q7 + T0 .req q8 + T0_L .req d16 + T0_H .req d17 + T1 .req q9 + T1_L .req d18 + T1_H .req d19 + T2 .req q10 + T2_L .req d20 + T2_H .req d21 + T3 .req q11 + T3_L .req d22 + T3_H .req d23 + +.macro _nh_stride k0, k1, k2, k3 + + // Load next message stride + vld1.8 {T3}, [MESSAGE]! + + // Load next key stride + vld1.32 {\k3}, [KEY]! + + // Add message words to key words + vadd.u32 T0, T3, \k0 + vadd.u32 T1, T3, \k1 + vadd.u32 T2, T3, \k2 + vadd.u32 T3, T3, \k3 + + // Multiply 32x32 => 64 and accumulate + vmlal.u32 PASS0_SUMS, T0_L, T0_H + vmlal.u32 PASS1_SUMS, T1_L, T1_H + vmlal.u32 PASS2_SUMS, T2_L, T2_H + vmlal.u32 PASS3_SUMS, T3_L, T3_H +.endm + +/* + * void nh_neon(const u32 *key, const u8 *message, size_t message_len, + * u8 hash[NH_HASH_BYTES]) + * + * It's guaranteed that message_len % 16 == 0. + */ +ENTRY(nh_neon) + + vld1.32 {K0,K1}, [KEY]! + vmov.u64 PASS0_SUMS, #0 + vmov.u64 PASS1_SUMS, #0 + vld1.32 {K2}, [KEY]! + vmov.u64 PASS2_SUMS, #0 + vmov.u64 PASS3_SUMS, #0 + + subs MESSAGE_LEN, MESSAGE_LEN, #64 + blt .Lloop4_done +.Lloop4: + _nh_stride K0, K1, K2, K3 + _nh_stride K1, K2, K3, K0 + _nh_stride K2, K3, K0, K1 + _nh_stride K3, K0, K1, K2 + subs MESSAGE_LEN, MESSAGE_LEN, #64 + bge .Lloop4 + +.Lloop4_done: + ands MESSAGE_LEN, MESSAGE_LEN, #63 + beq .Ldone + _nh_stride K0, K1, K2, K3 + + subs MESSAGE_LEN, MESSAGE_LEN, #16 + beq .Ldone + _nh_stride K1, K2, K3, K0 + + subs MESSAGE_LEN, MESSAGE_LEN, #16 + beq .Ldone + _nh_stride K2, K3, K0, K1 + +.Ldone: + // Sum the accumulators for each pass, then store the sums to 'hash' + vadd.u64 T0_L, PASS0_SUM_A, PASS0_SUM_B + vadd.u64 T0_H, PASS1_SUM_A, PASS1_SUM_B + vadd.u64 T1_L, PASS2_SUM_A, PASS2_SUM_B + vadd.u64 T1_H, PASS3_SUM_A, PASS3_SUM_B + vst1.8 {T0-T1}, [HASH] + bx lr +ENDPROC(nh_neon) diff --git a/arch/arm/crypto/nhpoly1305-neon-glue.c b/arch/arm/crypto/nhpoly1305-neon-glue.c new file mode 100644 index 000000000000..49aae87cb2bc --- /dev/null +++ b/arch/arm/crypto/nhpoly1305-neon-glue.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum + * (NEON accelerated version) + * + * Copyright 2018 Google LLC + */ + +#include +#include +#include +#include +#include + +asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len, + u8 hash[NH_HASH_BYTES]); + +/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */ +static void _nh_neon(const u32 *key, const u8 *message, size_t message_len, + __le64 hash[NH_NUM_PASSES]) +{ + nh_neon(key, message, message_len, (u8 *)hash); +} + +static int nhpoly1305_neon_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + if (srclen < 64 || !may_use_simd()) + return crypto_nhpoly1305_update(desc, src, srclen); + + do { + unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE); + + kernel_neon_begin(); + crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon); + kernel_neon_end(); + src += n; + srclen -= n; + } while (srclen); + return 0; +} + +static struct shash_alg nhpoly1305_alg = { + .base.cra_name = "nhpoly1305", + .base.cra_driver_name = "nhpoly1305-neon", + .base.cra_priority = 200, + .base.cra_ctxsize = sizeof(struct nhpoly1305_key), + .base.cra_module = THIS_MODULE, + .digestsize = POLY1305_DIGEST_SIZE, + .init = crypto_nhpoly1305_init, + .update = nhpoly1305_neon_update, + .final = crypto_nhpoly1305_final, + .setkey = crypto_nhpoly1305_setkey, + .descsize = sizeof(struct nhpoly1305_state), +}; + +static int __init nhpoly1305_mod_init(void) +{ + if (!(elf_hwcap & HWCAP_NEON)) + return -ENODEV; + + return crypto_register_shash(&nhpoly1305_alg); +} + +static void __exit nhpoly1305_mod_exit(void) +{ + crypto_unregister_shash(&nhpoly1305_alg); +} + +module_init(nhpoly1305_mod_init); +module_exit(nhpoly1305_mod_exit); + +MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (NEON-accelerated)"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Eric Biggers "); +MODULE_ALIAS_CRYPTO("nhpoly1305"); +MODULE_ALIAS_CRYPTO("nhpoly1305-neon"); -- cgit v1.2.3-59-g8ed1b From 059c2a4d8e164dccc3078e49e7f286023b019a98 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Nov 2018 17:26:31 -0800 Subject: crypto: adiantum - add Adiantum support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for the Adiantum encryption mode. Adiantum was designed by Paul Crowley and is specified by our paper: Adiantum: length-preserving encryption for entry-level processors (https://eprint.iacr.org/2018/720.pdf) See our paper for full details; this patch only provides an overview. Adiantum is a tweakable, length-preserving encryption mode designed for fast and secure disk encryption, especially on CPUs without dedicated crypto instructions. Adiantum encrypts each sector using the XChaCha12 stream cipher, two passes of an ε-almost-∆-universal (εA∆U) hash function, and an invocation of the AES-256 block cipher on a single 16-byte block. On CPUs without AES instructions, Adiantum is much faster than AES-XTS; for example, on ARM Cortex-A7, on 4096-byte sectors Adiantum encryption is about 4 times faster than AES-256-XTS encryption, and decryption about 5 times faster. Adiantum is a specialization of the more general HBSH construction. Our earlier proposal, HPolyC, was also a HBSH specialization, but it used a different εA∆U hash function, one based on Poly1305 only. Adiantum's εA∆U hash function, which is based primarily on the "NH" hash function like that used in UMAC (RFC4418), is about twice as fast as HPolyC's; consequently, Adiantum is about 20% faster than HPolyC. This speed comes with no loss of security: Adiantum is provably just as secure as HPolyC, in fact slightly *more* secure. Like HPolyC, Adiantum's security is reducible to that of XChaCha12 and AES-256, subject to a security bound. XChaCha12 itself has a security reduction to ChaCha12. Therefore, one need not "trust" Adiantum; one need only trust ChaCha12 and AES-256. Note that the εA∆U hash function is only used for its proven combinatorical properties so cannot be "broken". Adiantum is also a true wide-block encryption mode, so flipping any plaintext bit in the sector scrambles the entire ciphertext, and vice versa. No other such mode is available in the kernel currently; doing the same with XTS scrambles only 16 bytes. Adiantum also supports arbitrary-length tweaks and naturally supports any length input >= 16 bytes without needing "ciphertext stealing". For the stream cipher, Adiantum uses XChaCha12 rather than XChaCha20 in order to make encryption feasible on the widest range of devices. Although the 20-round variant is quite popular, the best known attacks on ChaCha are on only 7 rounds, so ChaCha12 still has a substantial security margin; in fact, larger than AES-256's. 12-round Salsa20 is also the eSTREAM recommendation. For the block cipher, Adiantum uses AES-256, despite it having a lower security margin than XChaCha12 and needing table lookups, due to AES's extensive adoption and analysis making it the obvious first choice. Nevertheless, for flexibility this patch also permits the "adiantum" template to be instantiated with XChaCha20 and/or with an alternate block cipher. We need Adiantum support in the kernel for use in dm-crypt and fscrypt, where currently the only other suitable options are block cipher modes such as AES-XTS. A big problem with this is that many low-end mobile devices (e.g. Android Go phones sold primarily in developing countries, as well as some smartwatches) still have CPUs that lack AES instructions, e.g. ARM Cortex-A7. Sadly, AES-XTS encryption is much too slow to be viable on these devices. We did find that some "lightweight" block ciphers are fast enough, but these suffer from problems such as not having much cryptanalysis or being too controversial. The ChaCha stream cipher has excellent performance but is insecure to use directly for disk encryption, since each sector's IV is reused each time it is overwritten. Even restricting the threat model to offline attacks only isn't enough, since modern flash storage devices don't guarantee that "overwrites" are really overwrites, due to wear-leveling. Adiantum avoids this problem by constructing a "tweakable super-pseudorandom permutation"; this is the strongest possible security model for length-preserving encryption. Of course, storing random nonces along with the ciphertext would be the ideal solution. But doing that with existing hardware and filesystems runs into major practical problems; in most cases it would require data journaling (like dm-integrity) which severely degrades performance. Thus, for now length-preserving encryption is still needed. Signed-off-by: Eric Biggers Reviewed-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/Kconfig | 23 ++ crypto/Makefile | 1 + crypto/adiantum.c | 658 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ crypto/tcrypt.c | 12 + crypto/testmgr.c | 12 + crypto/testmgr.h | 461 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 1167 insertions(+) create mode 100644 crypto/adiantum.c diff --git a/crypto/Kconfig b/crypto/Kconfig index eaeb8a986b7d..b6376d5d973e 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -501,6 +501,29 @@ config CRYPTO_NHPOLY1305 select CRYPTO_HASH select CRYPTO_POLY1305 +config CRYPTO_ADIANTUM + tristate "Adiantum support" + select CRYPTO_CHACHA20 + select CRYPTO_POLY1305 + select CRYPTO_NHPOLY1305 + help + Adiantum is a tweakable, length-preserving encryption mode + designed for fast and secure disk encryption, especially on + CPUs without dedicated crypto instructions. It encrypts + each sector using the XChaCha12 stream cipher, two passes of + an ε-almost-∆-universal hash function, and an invocation of + the AES-256 block cipher on a single 16-byte block. On CPUs + without AES instructions, Adiantum is much faster than + AES-XTS. + + Adiantum's security is provably reducible to that of its + underlying stream and block ciphers, subject to a security + bound. Unlike XTS, Adiantum is a true wide-block encryption + mode, so it actually provides an even stronger notion of + security than XTS, subject to the security bound. + + If unsure, say N. + comment "Hash modes" config CRYPTO_CMAC diff --git a/crypto/Makefile b/crypto/Makefile index c3310c85f09f..5e789dc2d4fd 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -85,6 +85,7 @@ obj-$(CONFIG_CRYPTO_LRW) += lrw.o obj-$(CONFIG_CRYPTO_XTS) += xts.o obj-$(CONFIG_CRYPTO_CTR) += ctr.o obj-$(CONFIG_CRYPTO_KEYWRAP) += keywrap.o +obj-$(CONFIG_CRYPTO_ADIANTUM) += adiantum.o obj-$(CONFIG_CRYPTO_NHPOLY1305) += nhpoly1305.o obj-$(CONFIG_CRYPTO_GCM) += gcm.o obj-$(CONFIG_CRYPTO_CCM) += ccm.o diff --git a/crypto/adiantum.c b/crypto/adiantum.c new file mode 100644 index 000000000000..2dfcf12fd452 --- /dev/null +++ b/crypto/adiantum.c @@ -0,0 +1,658 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Adiantum length-preserving encryption mode + * + * Copyright 2018 Google LLC + */ + +/* + * Adiantum is a tweakable, length-preserving encryption mode designed for fast + * and secure disk encryption, especially on CPUs without dedicated crypto + * instructions. Adiantum encrypts each sector using the XChaCha12 stream + * cipher, two passes of an ε-almost-∆-universal (εA∆U) hash function based on + * NH and Poly1305, and an invocation of the AES-256 block cipher on a single + * 16-byte block. See the paper for details: + * + * Adiantum: length-preserving encryption for entry-level processors + * (https://eprint.iacr.org/2018/720.pdf) + * + * For flexibility, this implementation also allows other ciphers: + * + * - Stream cipher: XChaCha12 or XChaCha20 + * - Block cipher: any with a 128-bit block size and 256-bit key + * + * This implementation doesn't currently allow other εA∆U hash functions, i.e. + * HPolyC is not supported. This is because Adiantum is ~20% faster than HPolyC + * but still provably as secure, and also the εA∆U hash function of HBSH is + * formally defined to take two inputs (tweak, message) which makes it difficult + * to wrap with the crypto_shash API. Rather, some details need to be handled + * here. Nevertheless, if needed in the future, support for other εA∆U hash + * functions could be added here. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * Size of right-hand block of input data, in bytes; also the size of the block + * cipher's block size and the hash function's output. + */ +#define BLOCKCIPHER_BLOCK_SIZE 16 + +/* Size of the block cipher key (K_E) in bytes */ +#define BLOCKCIPHER_KEY_SIZE 32 + +/* Size of the hash key (K_H) in bytes */ +#define HASH_KEY_SIZE (POLY1305_BLOCK_SIZE + NHPOLY1305_KEY_SIZE) + +/* + * The specification allows variable-length tweaks, but Linux's crypto API + * currently only allows algorithms to support a single length. The "natural" + * tweak length for Adiantum is 16, since that fits into one Poly1305 block for + * the best performance. But longer tweaks are useful for fscrypt, to avoid + * needing to derive per-file keys. So instead we use two blocks, or 32 bytes. + */ +#define TWEAK_SIZE 32 + +struct adiantum_instance_ctx { + struct crypto_skcipher_spawn streamcipher_spawn; + struct crypto_spawn blockcipher_spawn; + struct crypto_shash_spawn hash_spawn; +}; + +struct adiantum_tfm_ctx { + struct crypto_skcipher *streamcipher; + struct crypto_cipher *blockcipher; + struct crypto_shash *hash; + struct poly1305_key header_hash_key; +}; + +struct adiantum_request_ctx { + + /* + * Buffer for right-hand block of data, i.e. + * + * P_L => P_M => C_M => C_R when encrypting, or + * C_R => C_M => P_M => P_L when decrypting. + * + * Also used to build the IV for the stream cipher. + */ + union { + u8 bytes[XCHACHA_IV_SIZE]; + __le32 words[XCHACHA_IV_SIZE / sizeof(__le32)]; + le128 bignum; /* interpret as element of Z/(2^{128}Z) */ + } rbuf; + + bool enc; /* true if encrypting, false if decrypting */ + + /* + * The result of the Poly1305 εA∆U hash function applied to + * (message length, tweak). + */ + le128 header_hash; + + /* Sub-requests, must be last */ + union { + struct shash_desc hash_desc; + struct skcipher_request streamcipher_req; + } u; +}; + +/* + * Given the XChaCha stream key K_S, derive the block cipher key K_E and the + * hash key K_H as follows: + * + * K_E || K_H || ... = XChaCha(key=K_S, nonce=1||0^191) + * + * Note that this denotes using bits from the XChaCha keystream, which here we + * get indirectly by encrypting a buffer containing all 0's. + */ +static int adiantum_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) +{ + struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); + struct { + u8 iv[XCHACHA_IV_SIZE]; + u8 derived_keys[BLOCKCIPHER_KEY_SIZE + HASH_KEY_SIZE]; + struct scatterlist sg; + struct crypto_wait wait; + struct skcipher_request req; /* must be last */ + } *data; + u8 *keyp; + int err; + + /* Set the stream cipher key (K_S) */ + crypto_skcipher_clear_flags(tctx->streamcipher, CRYPTO_TFM_REQ_MASK); + crypto_skcipher_set_flags(tctx->streamcipher, + crypto_skcipher_get_flags(tfm) & + CRYPTO_TFM_REQ_MASK); + err = crypto_skcipher_setkey(tctx->streamcipher, key, keylen); + crypto_skcipher_set_flags(tfm, + crypto_skcipher_get_flags(tctx->streamcipher) & + CRYPTO_TFM_RES_MASK); + if (err) + return err; + + /* Derive the subkeys */ + data = kzalloc(sizeof(*data) + + crypto_skcipher_reqsize(tctx->streamcipher), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->iv[0] = 1; + sg_init_one(&data->sg, data->derived_keys, sizeof(data->derived_keys)); + crypto_init_wait(&data->wait); + skcipher_request_set_tfm(&data->req, tctx->streamcipher); + skcipher_request_set_callback(&data->req, CRYPTO_TFM_REQ_MAY_SLEEP | + CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &data->wait); + skcipher_request_set_crypt(&data->req, &data->sg, &data->sg, + sizeof(data->derived_keys), data->iv); + err = crypto_wait_req(crypto_skcipher_encrypt(&data->req), &data->wait); + if (err) + goto out; + keyp = data->derived_keys; + + /* Set the block cipher key (K_E) */ + crypto_cipher_clear_flags(tctx->blockcipher, CRYPTO_TFM_REQ_MASK); + crypto_cipher_set_flags(tctx->blockcipher, + crypto_skcipher_get_flags(tfm) & + CRYPTO_TFM_REQ_MASK); + err = crypto_cipher_setkey(tctx->blockcipher, keyp, + BLOCKCIPHER_KEY_SIZE); + crypto_skcipher_set_flags(tfm, + crypto_cipher_get_flags(tctx->blockcipher) & + CRYPTO_TFM_RES_MASK); + if (err) + goto out; + keyp += BLOCKCIPHER_KEY_SIZE; + + /* Set the hash key (K_H) */ + poly1305_core_setkey(&tctx->header_hash_key, keyp); + keyp += POLY1305_BLOCK_SIZE; + + crypto_shash_clear_flags(tctx->hash, CRYPTO_TFM_REQ_MASK); + crypto_shash_set_flags(tctx->hash, crypto_skcipher_get_flags(tfm) & + CRYPTO_TFM_REQ_MASK); + err = crypto_shash_setkey(tctx->hash, keyp, NHPOLY1305_KEY_SIZE); + crypto_skcipher_set_flags(tfm, crypto_shash_get_flags(tctx->hash) & + CRYPTO_TFM_RES_MASK); + keyp += NHPOLY1305_KEY_SIZE; + WARN_ON(keyp != &data->derived_keys[ARRAY_SIZE(data->derived_keys)]); +out: + kzfree(data); + return err; +} + +/* Addition in Z/(2^{128}Z) */ +static inline void le128_add(le128 *r, const le128 *v1, const le128 *v2) +{ + u64 x = le64_to_cpu(v1->b); + u64 y = le64_to_cpu(v2->b); + + r->b = cpu_to_le64(x + y); + r->a = cpu_to_le64(le64_to_cpu(v1->a) + le64_to_cpu(v2->a) + + (x + y < x)); +} + +/* Subtraction in Z/(2^{128}Z) */ +static inline void le128_sub(le128 *r, const le128 *v1, const le128 *v2) +{ + u64 x = le64_to_cpu(v1->b); + u64 y = le64_to_cpu(v2->b); + + r->b = cpu_to_le64(x - y); + r->a = cpu_to_le64(le64_to_cpu(v1->a) - le64_to_cpu(v2->a) - + (x - y > x)); +} + +/* + * Apply the Poly1305 εA∆U hash function to (message length, tweak) and save the + * result to rctx->header_hash. + * + * This value is reused in both the first and second hash steps. Specifically, + * it's added to the result of an independently keyed εA∆U hash function (for + * equal length inputs only) taken over the message. This gives the overall + * Adiantum hash of the (tweak, message) pair. + */ +static void adiantum_hash_header(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); + struct adiantum_request_ctx *rctx = skcipher_request_ctx(req); + const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE; + struct { + __le64 message_bits; + __le64 padding; + } header = { + .message_bits = cpu_to_le64((u64)bulk_len * 8) + }; + struct poly1305_state state; + + poly1305_core_init(&state); + + BUILD_BUG_ON(sizeof(header) % POLY1305_BLOCK_SIZE != 0); + poly1305_core_blocks(&state, &tctx->header_hash_key, + &header, sizeof(header) / POLY1305_BLOCK_SIZE); + + BUILD_BUG_ON(TWEAK_SIZE % POLY1305_BLOCK_SIZE != 0); + poly1305_core_blocks(&state, &tctx->header_hash_key, req->iv, + TWEAK_SIZE / POLY1305_BLOCK_SIZE); + + poly1305_core_emit(&state, &rctx->header_hash); +} + +/* Hash the left-hand block (the "bulk") of the message using NHPoly1305 */ +static int adiantum_hash_message(struct skcipher_request *req, + struct scatterlist *sgl, le128 *digest) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); + struct adiantum_request_ctx *rctx = skcipher_request_ctx(req); + const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE; + struct shash_desc *hash_desc = &rctx->u.hash_desc; + struct sg_mapping_iter miter; + unsigned int i, n; + int err; + + hash_desc->tfm = tctx->hash; + hash_desc->flags = 0; + + err = crypto_shash_init(hash_desc); + if (err) + return err; + + sg_miter_start(&miter, sgl, sg_nents(sgl), + SG_MITER_FROM_SG | SG_MITER_ATOMIC); + for (i = 0; i < bulk_len; i += n) { + sg_miter_next(&miter); + n = min_t(unsigned int, miter.length, bulk_len - i); + err = crypto_shash_update(hash_desc, miter.addr, n); + if (err) + break; + } + sg_miter_stop(&miter); + if (err) + return err; + + return crypto_shash_final(hash_desc, (u8 *)digest); +} + +/* Continue Adiantum encryption/decryption after the stream cipher step */ +static int adiantum_finish(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); + struct adiantum_request_ctx *rctx = skcipher_request_ctx(req); + const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE; + le128 digest; + int err; + + /* If decrypting, decrypt C_M with the block cipher to get P_M */ + if (!rctx->enc) + crypto_cipher_decrypt_one(tctx->blockcipher, rctx->rbuf.bytes, + rctx->rbuf.bytes); + + /* + * Second hash step + * enc: C_R = C_M - H_{K_H}(T, C_L) + * dec: P_R = P_M - H_{K_H}(T, P_L) + */ + err = adiantum_hash_message(req, req->dst, &digest); + if (err) + return err; + le128_add(&digest, &digest, &rctx->header_hash); + le128_sub(&rctx->rbuf.bignum, &rctx->rbuf.bignum, &digest); + scatterwalk_map_and_copy(&rctx->rbuf.bignum, req->dst, + bulk_len, BLOCKCIPHER_BLOCK_SIZE, 1); + return 0; +} + +static void adiantum_streamcipher_done(struct crypto_async_request *areq, + int err) +{ + struct skcipher_request *req = areq->data; + + if (!err) + err = adiantum_finish(req); + + skcipher_request_complete(req, err); +} + +static int adiantum_crypt(struct skcipher_request *req, bool enc) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); + struct adiantum_request_ctx *rctx = skcipher_request_ctx(req); + const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE; + unsigned int stream_len; + le128 digest; + int err; + + if (req->cryptlen < BLOCKCIPHER_BLOCK_SIZE) + return -EINVAL; + + rctx->enc = enc; + + /* + * First hash step + * enc: P_M = P_R + H_{K_H}(T, P_L) + * dec: C_M = C_R + H_{K_H}(T, C_L) + */ + adiantum_hash_header(req); + err = adiantum_hash_message(req, req->src, &digest); + if (err) + return err; + le128_add(&digest, &digest, &rctx->header_hash); + scatterwalk_map_and_copy(&rctx->rbuf.bignum, req->src, + bulk_len, BLOCKCIPHER_BLOCK_SIZE, 0); + le128_add(&rctx->rbuf.bignum, &rctx->rbuf.bignum, &digest); + + /* If encrypting, encrypt P_M with the block cipher to get C_M */ + if (enc) + crypto_cipher_encrypt_one(tctx->blockcipher, rctx->rbuf.bytes, + rctx->rbuf.bytes); + + /* Initialize the rest of the XChaCha IV (first part is C_M) */ + BUILD_BUG_ON(BLOCKCIPHER_BLOCK_SIZE != 16); + BUILD_BUG_ON(XCHACHA_IV_SIZE != 32); /* nonce || stream position */ + rctx->rbuf.words[4] = cpu_to_le32(1); + rctx->rbuf.words[5] = 0; + rctx->rbuf.words[6] = 0; + rctx->rbuf.words[7] = 0; + + /* + * XChaCha needs to be done on all the data except the last 16 bytes; + * for disk encryption that usually means 4080 or 496 bytes. But ChaCha + * implementations tend to be most efficient when passed a whole number + * of 64-byte ChaCha blocks, or sometimes even a multiple of 256 bytes. + * And here it doesn't matter whether the last 16 bytes are written to, + * as the second hash step will overwrite them. Thus, round the XChaCha + * length up to the next 64-byte boundary if possible. + */ + stream_len = bulk_len; + if (round_up(stream_len, CHACHA_BLOCK_SIZE) <= req->cryptlen) + stream_len = round_up(stream_len, CHACHA_BLOCK_SIZE); + + skcipher_request_set_tfm(&rctx->u.streamcipher_req, tctx->streamcipher); + skcipher_request_set_crypt(&rctx->u.streamcipher_req, req->src, + req->dst, stream_len, &rctx->rbuf); + skcipher_request_set_callback(&rctx->u.streamcipher_req, + req->base.flags, + adiantum_streamcipher_done, req); + return crypto_skcipher_encrypt(&rctx->u.streamcipher_req) ?: + adiantum_finish(req); +} + +static int adiantum_encrypt(struct skcipher_request *req) +{ + return adiantum_crypt(req, true); +} + +static int adiantum_decrypt(struct skcipher_request *req) +{ + return adiantum_crypt(req, false); +} + +static int adiantum_init_tfm(struct crypto_skcipher *tfm) +{ + struct skcipher_instance *inst = skcipher_alg_instance(tfm); + struct adiantum_instance_ctx *ictx = skcipher_instance_ctx(inst); + struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); + struct crypto_skcipher *streamcipher; + struct crypto_cipher *blockcipher; + struct crypto_shash *hash; + unsigned int subreq_size; + int err; + + streamcipher = crypto_spawn_skcipher(&ictx->streamcipher_spawn); + if (IS_ERR(streamcipher)) + return PTR_ERR(streamcipher); + + blockcipher = crypto_spawn_cipher(&ictx->blockcipher_spawn); + if (IS_ERR(blockcipher)) { + err = PTR_ERR(blockcipher); + goto err_free_streamcipher; + } + + hash = crypto_spawn_shash(&ictx->hash_spawn); + if (IS_ERR(hash)) { + err = PTR_ERR(hash); + goto err_free_blockcipher; + } + + tctx->streamcipher = streamcipher; + tctx->blockcipher = blockcipher; + tctx->hash = hash; + + BUILD_BUG_ON(offsetofend(struct adiantum_request_ctx, u) != + sizeof(struct adiantum_request_ctx)); + subreq_size = max(FIELD_SIZEOF(struct adiantum_request_ctx, + u.hash_desc) + + crypto_shash_descsize(hash), + FIELD_SIZEOF(struct adiantum_request_ctx, + u.streamcipher_req) + + crypto_skcipher_reqsize(streamcipher)); + + crypto_skcipher_set_reqsize(tfm, + offsetof(struct adiantum_request_ctx, u) + + subreq_size); + return 0; + +err_free_blockcipher: + crypto_free_cipher(blockcipher); +err_free_streamcipher: + crypto_free_skcipher(streamcipher); + return err; +} + +static void adiantum_exit_tfm(struct crypto_skcipher *tfm) +{ + struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); + + crypto_free_skcipher(tctx->streamcipher); + crypto_free_cipher(tctx->blockcipher); + crypto_free_shash(tctx->hash); +} + +static void adiantum_free_instance(struct skcipher_instance *inst) +{ + struct adiantum_instance_ctx *ictx = skcipher_instance_ctx(inst); + + crypto_drop_skcipher(&ictx->streamcipher_spawn); + crypto_drop_spawn(&ictx->blockcipher_spawn); + crypto_drop_shash(&ictx->hash_spawn); + kfree(inst); +} + +/* + * Check for a supported set of inner algorithms. + * See the comment at the beginning of this file. + */ +static bool adiantum_supported_algorithms(struct skcipher_alg *streamcipher_alg, + struct crypto_alg *blockcipher_alg, + struct shash_alg *hash_alg) +{ + if (strcmp(streamcipher_alg->base.cra_name, "xchacha12") != 0 && + strcmp(streamcipher_alg->base.cra_name, "xchacha20") != 0) + return false; + + if (blockcipher_alg->cra_cipher.cia_min_keysize > BLOCKCIPHER_KEY_SIZE || + blockcipher_alg->cra_cipher.cia_max_keysize < BLOCKCIPHER_KEY_SIZE) + return false; + if (blockcipher_alg->cra_blocksize != BLOCKCIPHER_BLOCK_SIZE) + return false; + + if (strcmp(hash_alg->base.cra_name, "nhpoly1305") != 0) + return false; + + return true; +} + +static int adiantum_create(struct crypto_template *tmpl, struct rtattr **tb) +{ + struct crypto_attr_type *algt; + const char *streamcipher_name; + const char *blockcipher_name; + const char *nhpoly1305_name; + struct skcipher_instance *inst; + struct adiantum_instance_ctx *ictx; + struct skcipher_alg *streamcipher_alg; + struct crypto_alg *blockcipher_alg; + struct crypto_alg *_hash_alg; + struct shash_alg *hash_alg; + int err; + + algt = crypto_get_attr_type(tb); + if (IS_ERR(algt)) + return PTR_ERR(algt); + + if ((algt->type ^ CRYPTO_ALG_TYPE_SKCIPHER) & algt->mask) + return -EINVAL; + + streamcipher_name = crypto_attr_alg_name(tb[1]); + if (IS_ERR(streamcipher_name)) + return PTR_ERR(streamcipher_name); + + blockcipher_name = crypto_attr_alg_name(tb[2]); + if (IS_ERR(blockcipher_name)) + return PTR_ERR(blockcipher_name); + + nhpoly1305_name = crypto_attr_alg_name(tb[3]); + if (nhpoly1305_name == ERR_PTR(-ENOENT)) + nhpoly1305_name = "nhpoly1305"; + if (IS_ERR(nhpoly1305_name)) + return PTR_ERR(nhpoly1305_name); + + inst = kzalloc(sizeof(*inst) + sizeof(*ictx), GFP_KERNEL); + if (!inst) + return -ENOMEM; + ictx = skcipher_instance_ctx(inst); + + /* Stream cipher, e.g. "xchacha12" */ + err = crypto_grab_skcipher(&ictx->streamcipher_spawn, streamcipher_name, + 0, crypto_requires_sync(algt->type, + algt->mask)); + if (err) + goto out_free_inst; + streamcipher_alg = crypto_spawn_skcipher_alg(&ictx->streamcipher_spawn); + + /* Block cipher, e.g. "aes" */ + err = crypto_grab_spawn(&ictx->blockcipher_spawn, blockcipher_name, + CRYPTO_ALG_TYPE_CIPHER, CRYPTO_ALG_TYPE_MASK); + if (err) + goto out_drop_streamcipher; + blockcipher_alg = ictx->blockcipher_spawn.alg; + + /* NHPoly1305 εA∆U hash function */ + _hash_alg = crypto_alg_mod_lookup(nhpoly1305_name, + CRYPTO_ALG_TYPE_SHASH, + CRYPTO_ALG_TYPE_MASK); + if (IS_ERR(_hash_alg)) { + err = PTR_ERR(_hash_alg); + goto out_drop_blockcipher; + } + hash_alg = __crypto_shash_alg(_hash_alg); + err = crypto_init_shash_spawn(&ictx->hash_spawn, hash_alg, + skcipher_crypto_instance(inst)); + if (err) { + crypto_mod_put(_hash_alg); + goto out_drop_blockcipher; + } + + /* Check the set of algorithms */ + if (!adiantum_supported_algorithms(streamcipher_alg, blockcipher_alg, + hash_alg)) { + pr_warn("Unsupported Adiantum instantiation: (%s,%s,%s)\n", + streamcipher_alg->base.cra_name, + blockcipher_alg->cra_name, hash_alg->base.cra_name); + err = -EINVAL; + goto out_drop_hash; + } + + /* Instance fields */ + + err = -ENAMETOOLONG; + if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, + "adiantum(%s,%s)", streamcipher_alg->base.cra_name, + blockcipher_alg->cra_name) >= CRYPTO_MAX_ALG_NAME) + goto out_drop_hash; + if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, + "adiantum(%s,%s,%s)", + streamcipher_alg->base.cra_driver_name, + blockcipher_alg->cra_driver_name, + hash_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) + goto out_drop_hash; + + inst->alg.base.cra_blocksize = BLOCKCIPHER_BLOCK_SIZE; + inst->alg.base.cra_ctxsize = sizeof(struct adiantum_tfm_ctx); + inst->alg.base.cra_alignmask = streamcipher_alg->base.cra_alignmask | + hash_alg->base.cra_alignmask; + /* + * The block cipher is only invoked once per message, so for long + * messages (e.g. sectors for disk encryption) its performance doesn't + * matter as much as that of the stream cipher and hash function. Thus, + * weigh the block cipher's ->cra_priority less. + */ + inst->alg.base.cra_priority = (4 * streamcipher_alg->base.cra_priority + + 2 * hash_alg->base.cra_priority + + blockcipher_alg->cra_priority) / 7; + + inst->alg.setkey = adiantum_setkey; + inst->alg.encrypt = adiantum_encrypt; + inst->alg.decrypt = adiantum_decrypt; + inst->alg.init = adiantum_init_tfm; + inst->alg.exit = adiantum_exit_tfm; + inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(streamcipher_alg); + inst->alg.max_keysize = crypto_skcipher_alg_max_keysize(streamcipher_alg); + inst->alg.ivsize = TWEAK_SIZE; + + inst->free = adiantum_free_instance; + + err = skcipher_register_instance(tmpl, inst); + if (err) + goto out_drop_hash; + + return 0; + +out_drop_hash: + crypto_drop_shash(&ictx->hash_spawn); +out_drop_blockcipher: + crypto_drop_spawn(&ictx->blockcipher_spawn); +out_drop_streamcipher: + crypto_drop_skcipher(&ictx->streamcipher_spawn); +out_free_inst: + kfree(inst); + return err; +} + +/* adiantum(streamcipher_name, blockcipher_name [, nhpoly1305_name]) */ +static struct crypto_template adiantum_tmpl = { + .name = "adiantum", + .create = adiantum_create, + .module = THIS_MODULE, +}; + +static int __init adiantum_module_init(void) +{ + return crypto_register_template(&adiantum_tmpl); +} + +static void __exit adiantum_module_exit(void) +{ + crypto_unregister_template(&adiantum_tmpl); +} + +module_init(adiantum_module_init); +module_exit(adiantum_module_exit); + +MODULE_DESCRIPTION("Adiantum length-preserving encryption mode"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Eric Biggers "); +MODULE_ALIAS_CRYPTO("adiantum"); diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 5fb120474902..0590a9204562 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -2320,6 +2320,18 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) test_cipher_speed("ctr(sm4)", DECRYPT, sec, NULL, 0, speed_template_16); break; + + case 219: + test_cipher_speed("adiantum(xchacha12,aes)", ENCRYPT, sec, NULL, + 0, speed_template_32); + test_cipher_speed("adiantum(xchacha12,aes)", DECRYPT, sec, NULL, + 0, speed_template_32); + test_cipher_speed("adiantum(xchacha20,aes)", ENCRYPT, sec, NULL, + 0, speed_template_32); + test_cipher_speed("adiantum(xchacha20,aes)", DECRYPT, sec, NULL, + 0, speed_template_32); + break; + case 300: if (alg) { test_hash_speed(alg, sec, generic_hash_speed_template); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 665911c24786..0f684a414acb 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -2404,6 +2404,18 @@ static int alg_test_null(const struct alg_test_desc *desc, /* Please keep this list sorted by algorithm name. */ static const struct alg_test_desc alg_test_descs[] = { { + .alg = "adiantum(xchacha12,aes)", + .test = alg_test_skcipher, + .suite = { + .cipher = __VECS(adiantum_xchacha12_aes_tv_template) + }, + }, { + .alg = "adiantum(xchacha20,aes)", + .test = alg_test_skcipher, + .suite = { + .cipher = __VECS(adiantum_xchacha20_aes_tv_template) + }, + }, { .alg = "aegis128", .test = alg_test_aead, .suite = { diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 50ea1f2705c7..e7e56a8febbc 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -33381,6 +33381,467 @@ static const struct cipher_testvec xchacha12_tv_template[] = { }, }; +/* Adiantum test vectors from https://github.com/google/adiantum */ +static const struct cipher_testvec adiantum_xchacha12_aes_tv_template[] = { + { + .key = "\x9e\xeb\xb2\x49\x3c\x1c\xf5\xf4" + "\x6a\x99\xc2\xc4\xdf\xb1\xf4\xdd" + "\x75\x20\x57\xea\x2c\x4f\xcd\xb2" + "\xa5\x3d\x7b\x49\x1e\xab\xfd\x0f", + .klen = 32, + .iv = "\xdf\x63\xd4\xab\xd2\x49\xf3\xd8" + "\x33\x81\x37\x60\x7d\xfa\x73\x08" + "\xd8\x49\x6d\x80\xe8\x2f\x62\x54" + "\xeb\x0e\xa9\x39\x5b\x45\x7f\x8a", + .ptext = "\x67\xc9\xf2\x30\x84\x41\x8e\x43" + "\xfb\xf3\xb3\x3e\x79\x36\x7f\xe8", + .ctext = "\x6d\x32\x86\x18\x67\x86\x0f\x3f" + "\x96\x7c\x9d\x28\x0d\x53\xec\x9f", + .len = 16, + .also_non_np = 1, + .np = 2, + .tap = { 14, 2 }, + }, { + .key = "\x36\x2b\x57\x97\xf8\x5d\xcd\x99" + "\x5f\x1a\x5a\x44\x1d\x92\x0f\x27" + "\xcc\x16\xd7\x2b\x85\x63\x99\xd3" + "\xba\x96\xa1\xdb\xd2\x60\x68\xda", + .klen = 32, + .iv = "\xef\x58\x69\xb1\x2c\x5e\x9a\x47" + "\x24\xc1\xb1\x69\xe1\x12\x93\x8f" + "\x43\x3d\x6d\x00\xdb\x5e\xd8\xd9" + "\x12\x9a\xfe\xd9\xff\x2d\xaa\xc4", + .ptext = "\x5e\xa8\x68\x19\x85\x98\x12\x23" + "\x26\x0a\xcc\xdb\x0a\x04\xb9\xdf" + "\x4d\xb3\x48\x7b\xb0\xe3\xc8\x19" + "\x43\x5a\x46\x06\x94\x2d\xf2", + .ctext = "\xc7\xc6\xf1\x73\x8f\xc4\xff\x4a" + "\x39\xbe\x78\xbe\x8d\x28\xc8\x89" + "\x46\x63\xe7\x0c\x7d\x87\xe8\x4e" + "\xc9\x18\x7b\xbe\x18\x60\x50", + .len = 31, + }, { + .key = "\xa5\x28\x24\x34\x1a\x3c\xd8\xf7" + "\x05\x91\x8f\xee\x85\x1f\x35\x7f" + "\x80\x3d\xfc\x9b\x94\xf6\xfc\x9e" + "\x19\x09\x00\xa9\x04\x31\x4f\x11", + .klen = 32, + .iv = "\xa1\xba\x49\x95\xff\x34\x6d\xb8" + "\xcd\x87\x5d\x5e\xfd\xea\x85\xdb" + "\x8a\x7b\x5e\xb2\x5d\x57\xdd\x62" + "\xac\xa9\x8c\x41\x42\x94\x75\xb7", + .ptext = "\x69\xb4\xe8\x8c\x37\xe8\x67\x82" + "\xf1\xec\x5d\x04\xe5\x14\x91\x13" + "\xdf\xf2\x87\x1b\x69\x81\x1d\x71" + "\x70\x9e\x9c\x3b\xde\x49\x70\x11" + "\xa0\xa3\xdb\x0d\x54\x4f\x66\x69" + "\xd7\xdb\x80\xa7\x70\x92\x68\xce" + "\x81\x04\x2c\xc6\xab\xae\xe5\x60" + "\x15\xe9\x6f\xef\xaa\x8f\xa7\xa7" + "\x63\x8f\xf2\xf0\x77\xf1\xa8\xea" + "\xe1\xb7\x1f\x9e\xab\x9e\x4b\x3f" + "\x07\x87\x5b\x6f\xcd\xa8\xaf\xb9" + "\xfa\x70\x0b\x52\xb8\xa8\xa7\x9e" + "\x07\x5f\xa6\x0e\xb3\x9b\x79\x13" + "\x79\xc3\x3e\x8d\x1c\x2c\x68\xc8" + "\x51\x1d\x3c\x7b\x7d\x79\x77\x2a" + "\x56\x65\xc5\x54\x23\x28\xb0\x03", + .ctext = "\x9e\x16\xab\xed\x4b\xa7\x42\x5a" + "\xc6\xfb\x4e\x76\xff\xbe\x03\xa0" + "\x0f\xe3\xad\xba\xe4\x98\x2b\x0e" + "\x21\x48\xa0\xb8\x65\x48\x27\x48" + "\x84\x54\x54\xb2\x9a\x94\x7b\xe6" + "\x4b\x29\xe9\xcf\x05\x91\x80\x1a" + "\x3a\xf3\x41\x96\x85\x1d\x9f\x74" + "\x51\x56\x63\xfa\x7c\x28\x85\x49" + "\xf7\x2f\xf9\xf2\x18\x46\xf5\x33" + "\x80\xa3\x3c\xce\xb2\x57\x93\xf5" + "\xae\xbd\xa9\xf5\x7b\x30\xc4\x93" + "\x66\xe0\x30\x77\x16\xe4\xa0\x31" + "\xba\x70\xbc\x68\x13\xf5\xb0\x9a" + "\xc1\xfc\x7e\xfe\x55\x80\x5c\x48" + "\x74\xa6\xaa\xa3\xac\xdc\xc2\xf5" + "\x8d\xde\x34\x86\x78\x60\x75\x8d", + .len = 128, + .also_non_np = 1, + .np = 4, + .tap = { 104, 16, 4, 4 }, + }, { + .key = "\xd3\x81\x72\x18\x23\xff\x6f\x4a" + "\x25\x74\x29\x0d\x51\x8a\x0e\x13" + "\xc1\x53\x5d\x30\x8d\xee\x75\x0d" + "\x14\xd6\x69\xc9\x15\xa9\x0c\x60", + .klen = 32, + .iv = "\x65\x9b\xd4\xa8\x7d\x29\x1d\xf4" + "\xc4\xd6\x9b\x6a\x28\xab\x64\xe2" + "\x62\x81\x97\xc5\x81\xaa\xf9\x44" + "\xc1\x72\x59\x82\xaf\x16\xc8\x2c", + .ptext = "\xc7\x6b\x52\x6a\x10\xf0\xcc\x09" + "\xc1\x12\x1d\x6d\x21\xa6\x78\xf5" + "\x05\xa3\x69\x60\x91\x36\x98\x57" + "\xba\x0c\x14\xcc\xf3\x2d\x73\x03" + "\xc6\xb2\x5f\xc8\x16\x27\x37\x5d" + "\xd0\x0b\x87\xb2\x50\x94\x7b\x58" + "\x04\xf4\xe0\x7f\x6e\x57\x8e\xc9" + "\x41\x84\xc1\xb1\x7e\x4b\x91\x12" + "\x3a\x8b\x5d\x50\x82\x7b\xcb\xd9" + "\x9a\xd9\x4e\x18\x06\x23\x9e\xd4" + "\xa5\x20\x98\xef\xb5\xda\xe5\xc0" + "\x8a\x6a\x83\x77\x15\x84\x1e\xae" + "\x78\x94\x9d\xdf\xb7\xd1\xea\x67" + "\xaa\xb0\x14\x15\xfa\x67\x21\x84" + "\xd3\x41\x2a\xce\xba\x4b\x4a\xe8" + "\x95\x62\xa9\x55\xf0\x80\xad\xbd" + "\xab\xaf\xdd\x4f\xa5\x7c\x13\x36" + "\xed\x5e\x4f\x72\xad\x4b\xf1\xd0" + "\x88\x4e\xec\x2c\x88\x10\x5e\xea" + "\x12\xc0\x16\x01\x29\xa3\xa0\x55" + "\xaa\x68\xf3\xe9\x9d\x3b\x0d\x3b" + "\x6d\xec\xf8\xa0\x2d\xf0\x90\x8d" + "\x1c\xe2\x88\xd4\x24\x71\xf9\xb3" + "\xc1\x9f\xc5\xd6\x76\x70\xc5\x2e" + "\x9c\xac\xdb\x90\xbd\x83\x72\xba" + "\x6e\xb5\xa5\x53\x83\xa9\xa5\xbf" + "\x7d\x06\x0e\x3c\x2a\xd2\x04\xb5" + "\x1e\x19\x38\x09\x16\xd2\x82\x1f" + "\x75\x18\x56\xb8\x96\x0b\xa6\xf9" + "\xcf\x62\xd9\x32\x5d\xa9\xd7\x1d" + "\xec\xe4\xdf\x1b\xbe\xf1\x36\xee" + "\xe3\x7b\xb5\x2f\xee\xf8\x53\x3d" + "\x6a\xb7\x70\xa9\xfc\x9c\x57\x25" + "\xf2\x89\x10\xd3\xb8\xa8\x8c\x30" + "\xae\x23\x4f\x0e\x13\x66\x4f\xe1" + "\xb6\xc0\xe4\xf8\xef\x93\xbd\x6e" + "\x15\x85\x6b\xe3\x60\x81\x1d\x68" + "\xd7\x31\x87\x89\x09\xab\xd5\x96" + "\x1d\xf3\x6d\x67\x80\xca\x07\x31" + "\x5d\xa7\xe4\xfb\x3e\xf2\x9b\x33" + "\x52\x18\xc8\x30\xfe\x2d\xca\x1e" + "\x79\x92\x7a\x60\x5c\xb6\x58\x87" + "\xa4\x36\xa2\x67\x92\x8b\xa4\xb7" + "\xf1\x86\xdf\xdc\xc0\x7e\x8f\x63" + "\xd2\xa2\xdc\x78\xeb\x4f\xd8\x96" + "\x47\xca\xb8\x91\xf9\xf7\x94\x21" + "\x5f\x9a\x9f\x5b\xb8\x40\x41\x4b" + "\x66\x69\x6a\x72\xd0\xcb\x70\xb7" + "\x93\xb5\x37\x96\x05\x37\x4f\xe5" + "\x8c\xa7\x5a\x4e\x8b\xb7\x84\xea" + "\xc7\xfc\x19\x6e\x1f\x5a\xa1\xac" + "\x18\x7d\x52\x3b\xb3\x34\x62\x99" + "\xe4\x9e\x31\x04\x3f\xc0\x8d\x84" + "\x17\x7c\x25\x48\x52\x67\x11\x27" + "\x67\xbb\x5a\x85\xca\x56\xb2\x5c" + "\xe6\xec\xd5\x96\x3d\x15\xfc\xfb" + "\x22\x25\xf4\x13\xe5\x93\x4b\x9a" + "\x77\xf1\x52\x18\xfa\x16\x5e\x49" + "\x03\x45\xa8\x08\xfa\xb3\x41\x92" + "\x79\x50\x33\xca\xd0\xd7\x42\x55" + "\xc3\x9a\x0c\x4e\xd9\xa4\x3c\x86" + "\x80\x9f\x53\xd1\xa4\x2e\xd1\xbc" + "\xf1\x54\x6e\x93\xa4\x65\x99\x8e" + "\xdf\x29\xc0\x64\x63\x07\xbb\xea", + .ctext = "\x15\x97\xd0\x86\x18\x03\x9c\x51" + "\xc5\x11\x36\x62\x13\x92\xe6\x73" + "\x29\x79\xde\xa1\x00\x3e\x08\x64" + "\x17\x1a\xbc\xd5\xfe\x33\x0e\x0c" + "\x7c\x94\xa7\xc6\x3c\xbe\xac\xa2" + "\x89\xe6\xbc\xdf\x0c\x33\x27\x42" + "\x46\x73\x2f\xba\x4e\xa6\x46\x8f" + "\xe4\xee\x39\x63\x42\x65\xa3\x88" + "\x7a\xad\x33\x23\xa9\xa7\x20\x7f" + "\x0b\xe6\x6a\xc3\x60\xda\x9e\xb4" + "\xd6\x07\x8a\x77\x26\xd1\xab\x44" + "\x99\x55\x03\x5e\xed\x8d\x7b\xbd" + "\xc8\x21\xb7\x21\x30\x3f\xc0\xb5" + "\xc8\xec\x6c\x23\xa6\xa3\x6d\xf1" + "\x30\x0a\xd0\xa6\xa9\x28\x69\xae" + "\x2a\xe6\x54\xac\x82\x9d\x6a\x95" + "\x6f\x06\x44\xc5\x5a\x77\x6e\xec" + "\xf8\xf8\x63\xb2\xe6\xaa\xbd\x8e" + "\x0e\x8a\x62\x00\x03\xc8\x84\xdd" + "\x47\x4a\xc3\x55\xba\xb7\xe7\xdf" + "\x08\xbf\x62\xf5\xe8\xbc\xb6\x11" + "\xe4\xcb\xd0\x66\x74\x32\xcf\xd4" + "\xf8\x51\x80\x39\x14\x05\x12\xdb" + "\x87\x93\xe2\x26\x30\x9c\x3a\x21" + "\xe5\xd0\x38\x57\x80\x15\xe4\x08" + "\x58\x05\x49\x7d\xe6\x92\x77\x70" + "\xfb\x1e\x2d\x6a\x84\x00\xc8\x68" + "\xf7\x1a\xdd\xf0\x7b\x38\x1e\xd8" + "\x2c\x78\x78\x61\xcf\xe3\xde\x69" + "\x1f\xd5\x03\xd5\x1a\xb4\xcf\x03" + "\xc8\x7a\x70\x68\x35\xb4\xf6\xbe" + "\x90\x62\xb2\x28\x99\x86\xf5\x44" + "\x99\xeb\x31\xcf\xca\xdf\xd0\x21" + "\xd6\x60\xf7\x0f\x40\xb4\x80\xb7" + "\xab\xe1\x9b\x45\xba\x66\xda\xee" + "\xdd\x04\x12\x40\x98\xe1\x69\xe5" + "\x2b\x9c\x59\x80\xe7\x7b\xcc\x63" + "\xa6\xc0\x3a\xa9\xfe\x8a\xf9\x62" + "\x11\x34\x61\x94\x35\xfe\xf2\x99" + "\xfd\xee\x19\xea\x95\xb6\x12\xbf" + "\x1b\xdf\x02\x1a\xcc\x3e\x7e\x65" + "\x78\x74\x10\x50\x29\x63\x28\xea" + "\x6b\xab\xd4\x06\x4d\x15\x24\x31" + "\xc7\x0a\xc9\x16\xb6\x48\xf0\xbf" + "\x49\xdb\x68\x71\x31\x8f\x87\xe2" + "\x13\x05\x64\xd6\x22\x0c\xf8\x36" + "\x84\x24\x3e\x69\x5e\xb8\x9e\x16" + "\x73\x6c\x83\x1e\xe0\x9f\x9e\xba" + "\xe5\x59\x21\x33\x1b\xa9\x26\xc2" + "\xc7\xd9\x30\x73\xb6\xa6\x73\x82" + "\x19\xfa\x44\x4d\x40\x8b\x69\x04" + "\x94\x74\xea\x6e\xb3\x09\x47\x01" + "\x2a\xb9\x78\x34\x43\x11\xed\xd6" + "\x8c\x95\x65\x1b\x85\x67\xa5\x40" + "\xac\x9c\x05\x4b\x57\x4a\xa9\x96" + "\x0f\xdd\x4f\xa1\xe0\xcf\x6e\xc7" + "\x1b\xed\xa2\xb4\x56\x8c\x09\x6e" + "\xa6\x65\xd7\x55\x81\xb7\xed\x11" + "\x9b\x40\x75\xa8\x6b\x56\xaf\x16" + "\x8b\x3d\xf4\xcb\xfe\xd5\x1d\x3d" + "\x85\xc2\xc0\xde\x43\x39\x4a\x96" + "\xba\x88\x97\xc0\xd6\x00\x0e\x27" + "\x21\xb0\x21\x52\xba\xa7\x37\xaa" + "\xcc\xbf\x95\xa8\xf4\xd0\x91\xf6", + .len = 512, + .also_non_np = 1, + .np = 2, + .tap = { 144, 368 }, + } +}; + +/* Adiantum with XChaCha20 instead of XChaCha12 */ +/* Test vectors from https://github.com/google/adiantum */ +static const struct cipher_testvec adiantum_xchacha20_aes_tv_template[] = { + { + .key = "\x9e\xeb\xb2\x49\x3c\x1c\xf5\xf4" + "\x6a\x99\xc2\xc4\xdf\xb1\xf4\xdd" + "\x75\x20\x57\xea\x2c\x4f\xcd\xb2" + "\xa5\x3d\x7b\x49\x1e\xab\xfd\x0f", + .klen = 32, + .iv = "\xdf\x63\xd4\xab\xd2\x49\xf3\xd8" + "\x33\x81\x37\x60\x7d\xfa\x73\x08" + "\xd8\x49\x6d\x80\xe8\x2f\x62\x54" + "\xeb\x0e\xa9\x39\x5b\x45\x7f\x8a", + .ptext = "\x67\xc9\xf2\x30\x84\x41\x8e\x43" + "\xfb\xf3\xb3\x3e\x79\x36\x7f\xe8", + .ctext = "\xf6\x78\x97\xd6\xaa\x94\x01\x27" + "\x2e\x4d\x83\xe0\x6e\x64\x9a\xdf", + .len = 16, + .also_non_np = 1, + .np = 3, + .tap = { 5, 2, 9 }, + }, { + .key = "\x36\x2b\x57\x97\xf8\x5d\xcd\x99" + "\x5f\x1a\x5a\x44\x1d\x92\x0f\x27" + "\xcc\x16\xd7\x2b\x85\x63\x99\xd3" + "\xba\x96\xa1\xdb\xd2\x60\x68\xda", + .klen = 32, + .iv = "\xef\x58\x69\xb1\x2c\x5e\x9a\x47" + "\x24\xc1\xb1\x69\xe1\x12\x93\x8f" + "\x43\x3d\x6d\x00\xdb\x5e\xd8\xd9" + "\x12\x9a\xfe\xd9\xff\x2d\xaa\xc4", + .ptext = "\x5e\xa8\x68\x19\x85\x98\x12\x23" + "\x26\x0a\xcc\xdb\x0a\x04\xb9\xdf" + "\x4d\xb3\x48\x7b\xb0\xe3\xc8\x19" + "\x43\x5a\x46\x06\x94\x2d\xf2", + .ctext = "\x4b\xb8\x90\x10\xdf\x7f\x64\x08" + "\x0e\x14\x42\x5f\x00\x74\x09\x36" + "\x57\x72\xb5\xfd\xb5\x5d\xb8\x28" + "\x0c\x04\x91\x14\x91\xe9\x37", + .len = 31, + .also_non_np = 1, + .np = 2, + .tap = { 16, 15 }, + }, { + .key = "\xa5\x28\x24\x34\x1a\x3c\xd8\xf7" + "\x05\x91\x8f\xee\x85\x1f\x35\x7f" + "\x80\x3d\xfc\x9b\x94\xf6\xfc\x9e" + "\x19\x09\x00\xa9\x04\x31\x4f\x11", + .klen = 32, + .iv = "\xa1\xba\x49\x95\xff\x34\x6d\xb8" + "\xcd\x87\x5d\x5e\xfd\xea\x85\xdb" + "\x8a\x7b\x5e\xb2\x5d\x57\xdd\x62" + "\xac\xa9\x8c\x41\x42\x94\x75\xb7", + .ptext = "\x69\xb4\xe8\x8c\x37\xe8\x67\x82" + "\xf1\xec\x5d\x04\xe5\x14\x91\x13" + "\xdf\xf2\x87\x1b\x69\x81\x1d\x71" + "\x70\x9e\x9c\x3b\xde\x49\x70\x11" + "\xa0\xa3\xdb\x0d\x54\x4f\x66\x69" + "\xd7\xdb\x80\xa7\x70\x92\x68\xce" + "\x81\x04\x2c\xc6\xab\xae\xe5\x60" + "\x15\xe9\x6f\xef\xaa\x8f\xa7\xa7" + "\x63\x8f\xf2\xf0\x77\xf1\xa8\xea" + "\xe1\xb7\x1f\x9e\xab\x9e\x4b\x3f" + "\x07\x87\x5b\x6f\xcd\xa8\xaf\xb9" + "\xfa\x70\x0b\x52\xb8\xa8\xa7\x9e" + "\x07\x5f\xa6\x0e\xb3\x9b\x79\x13" + "\x79\xc3\x3e\x8d\x1c\x2c\x68\xc8" + "\x51\x1d\x3c\x7b\x7d\x79\x77\x2a" + "\x56\x65\xc5\x54\x23\x28\xb0\x03", + .ctext = "\xb1\x8b\xa0\x05\x77\xa8\x4d\x59" + "\x1b\x8e\x21\xfc\x3a\x49\xfa\xd4" + "\xeb\x36\xf3\xc4\xdf\xdc\xae\x67" + "\x07\x3f\x70\x0e\xe9\x66\xf5\x0c" + "\x30\x4d\x66\xc9\xa4\x2f\x73\x9c" + "\x13\xc8\x49\x44\xcc\x0a\x90\x9d" + "\x7c\xdd\x19\x3f\xea\x72\x8d\x58" + "\xab\xe7\x09\x2c\xec\xb5\x44\xd2" + "\xca\xa6\x2d\x7a\x5c\x9c\x2b\x15" + "\xec\x2a\xa6\x69\x91\xf9\xf3\x13" + "\xf7\x72\xc1\xc1\x40\xd5\xe1\x94" + "\xf4\x29\xa1\x3e\x25\x02\xa8\x3e" + "\x94\xc1\x91\x14\xa1\x14\xcb\xbe" + "\x67\x4c\xb9\x38\xfe\xa7\xaa\x32" + "\x29\x62\x0d\xb2\xf6\x3c\x58\x57" + "\xc1\xd5\x5a\xbb\xd6\xa6\x2a\xe5", + .len = 128, + .also_non_np = 1, + .np = 4, + .tap = { 112, 7, 8, 1 }, + }, { + .key = "\xd3\x81\x72\x18\x23\xff\x6f\x4a" + "\x25\x74\x29\x0d\x51\x8a\x0e\x13" + "\xc1\x53\x5d\x30\x8d\xee\x75\x0d" + "\x14\xd6\x69\xc9\x15\xa9\x0c\x60", + .klen = 32, + .iv = "\x65\x9b\xd4\xa8\x7d\x29\x1d\xf4" + "\xc4\xd6\x9b\x6a\x28\xab\x64\xe2" + "\x62\x81\x97\xc5\x81\xaa\xf9\x44" + "\xc1\x72\x59\x82\xaf\x16\xc8\x2c", + .ptext = "\xc7\x6b\x52\x6a\x10\xf0\xcc\x09" + "\xc1\x12\x1d\x6d\x21\xa6\x78\xf5" + "\x05\xa3\x69\x60\x91\x36\x98\x57" + "\xba\x0c\x14\xcc\xf3\x2d\x73\x03" + "\xc6\xb2\x5f\xc8\x16\x27\x37\x5d" + "\xd0\x0b\x87\xb2\x50\x94\x7b\x58" + "\x04\xf4\xe0\x7f\x6e\x57\x8e\xc9" + "\x41\x84\xc1\xb1\x7e\x4b\x91\x12" + "\x3a\x8b\x5d\x50\x82\x7b\xcb\xd9" + "\x9a\xd9\x4e\x18\x06\x23\x9e\xd4" + "\xa5\x20\x98\xef\xb5\xda\xe5\xc0" + "\x8a\x6a\x83\x77\x15\x84\x1e\xae" + "\x78\x94\x9d\xdf\xb7\xd1\xea\x67" + "\xaa\xb0\x14\x15\xfa\x67\x21\x84" + "\xd3\x41\x2a\xce\xba\x4b\x4a\xe8" + "\x95\x62\xa9\x55\xf0\x80\xad\xbd" + "\xab\xaf\xdd\x4f\xa5\x7c\x13\x36" + "\xed\x5e\x4f\x72\xad\x4b\xf1\xd0" + "\x88\x4e\xec\x2c\x88\x10\x5e\xea" + "\x12\xc0\x16\x01\x29\xa3\xa0\x55" + "\xaa\x68\xf3\xe9\x9d\x3b\x0d\x3b" + "\x6d\xec\xf8\xa0\x2d\xf0\x90\x8d" + "\x1c\xe2\x88\xd4\x24\x71\xf9\xb3" + "\xc1\x9f\xc5\xd6\x76\x70\xc5\x2e" + "\x9c\xac\xdb\x90\xbd\x83\x72\xba" + "\x6e\xb5\xa5\x53\x83\xa9\xa5\xbf" + "\x7d\x06\x0e\x3c\x2a\xd2\x04\xb5" + "\x1e\x19\x38\x09\x16\xd2\x82\x1f" + "\x75\x18\x56\xb8\x96\x0b\xa6\xf9" + "\xcf\x62\xd9\x32\x5d\xa9\xd7\x1d" + "\xec\xe4\xdf\x1b\xbe\xf1\x36\xee" + "\xe3\x7b\xb5\x2f\xee\xf8\x53\x3d" + "\x6a\xb7\x70\xa9\xfc\x9c\x57\x25" + "\xf2\x89\x10\xd3\xb8\xa8\x8c\x30" + "\xae\x23\x4f\x0e\x13\x66\x4f\xe1" + "\xb6\xc0\xe4\xf8\xef\x93\xbd\x6e" + "\x15\x85\x6b\xe3\x60\x81\x1d\x68" + "\xd7\x31\x87\x89\x09\xab\xd5\x96" + "\x1d\xf3\x6d\x67\x80\xca\x07\x31" + "\x5d\xa7\xe4\xfb\x3e\xf2\x9b\x33" + "\x52\x18\xc8\x30\xfe\x2d\xca\x1e" + "\x79\x92\x7a\x60\x5c\xb6\x58\x87" + "\xa4\x36\xa2\x67\x92\x8b\xa4\xb7" + "\xf1\x86\xdf\xdc\xc0\x7e\x8f\x63" + "\xd2\xa2\xdc\x78\xeb\x4f\xd8\x96" + "\x47\xca\xb8\x91\xf9\xf7\x94\x21" + "\x5f\x9a\x9f\x5b\xb8\x40\x41\x4b" + "\x66\x69\x6a\x72\xd0\xcb\x70\xb7" + "\x93\xb5\x37\x96\x05\x37\x4f\xe5" + "\x8c\xa7\x5a\x4e\x8b\xb7\x84\xea" + "\xc7\xfc\x19\x6e\x1f\x5a\xa1\xac" + "\x18\x7d\x52\x3b\xb3\x34\x62\x99" + "\xe4\x9e\x31\x04\x3f\xc0\x8d\x84" + "\x17\x7c\x25\x48\x52\x67\x11\x27" + "\x67\xbb\x5a\x85\xca\x56\xb2\x5c" + "\xe6\xec\xd5\x96\x3d\x15\xfc\xfb" + "\x22\x25\xf4\x13\xe5\x93\x4b\x9a" + "\x77\xf1\x52\x18\xfa\x16\x5e\x49" + "\x03\x45\xa8\x08\xfa\xb3\x41\x92" + "\x79\x50\x33\xca\xd0\xd7\x42\x55" + "\xc3\x9a\x0c\x4e\xd9\xa4\x3c\x86" + "\x80\x9f\x53\xd1\xa4\x2e\xd1\xbc" + "\xf1\x54\x6e\x93\xa4\x65\x99\x8e" + "\xdf\x29\xc0\x64\x63\x07\xbb\xea", + .ctext = "\xe0\x33\xf6\xe0\xb4\xa5\xdd\x2b" + "\xdd\xce\xfc\x12\x1e\xfc\x2d\xf2" + "\x8b\xc7\xeb\xc1\xc4\x2a\xe8\x44" + "\x0f\x3d\x97\x19\x2e\x6d\xa2\x38" + "\x9d\xa6\xaa\xe1\x96\xb9\x08\xe8" + "\x0b\x70\x48\x5c\xed\xb5\x9b\xcb" + "\x8b\x40\x88\x7e\x69\x73\xf7\x16" + "\x71\xbb\x5b\xfc\xa3\x47\x5d\xa6" + "\xae\x3a\x64\xc4\xe7\xb8\xa8\xe7" + "\xb1\x32\x19\xdb\xe3\x01\xb8\xf0" + "\xa4\x86\xb4\x4c\xc2\xde\x5c\xd2" + "\x6c\x77\xd2\xe8\x18\xb7\x0a\xc9" + "\x3d\x53\xb5\xc4\x5c\xf0\x8c\x06" + "\xdc\x90\xe0\x74\x47\x1b\x0b\xf6" + "\xd2\x71\x6b\xc4\xf1\x97\x00\x2d" + "\x63\x57\x44\x1f\x8c\xf4\xe6\x9b" + "\xe0\x7a\xdd\xec\x32\x73\x42\x32" + "\x7f\x35\x67\x60\x0d\xcf\x10\x52" + "\x61\x22\x53\x8d\x8e\xbb\x33\x76" + "\x59\xd9\x10\xce\xdf\xef\xc0\x41" + "\xd5\x33\x29\x6a\xda\x46\xa4\x51" + "\xf0\x99\x3d\x96\x31\xdd\xb5\xcb" + "\x3e\x2a\x1f\xc7\x5c\x79\xd3\xc5" + "\x20\xa1\xb1\x39\x1b\xc6\x0a\x70" + "\x26\x39\x95\x07\xad\x7a\xc9\x69" + "\xfe\x81\xc7\x88\x08\x38\xaf\xad" + "\x9e\x8d\xfb\xe8\x24\x0d\x22\xb8" + "\x0e\xed\xbe\x37\x53\x7c\xa6\xc6" + "\x78\x62\xec\xa3\x59\xd9\xc6\x9d" + "\xb8\x0e\x69\x77\x84\x2d\x6a\x4c" + "\xc5\xd9\xb2\xa0\x2b\xa8\x80\xcc" + "\xe9\x1e\x9c\x5a\xc4\xa1\xb2\x37" + "\x06\x9b\x30\x32\x67\xf7\xe7\xd2" + "\x42\xc7\xdf\x4e\xd4\xcb\xa0\x12" + "\x94\xa1\x34\x85\x93\x50\x4b\x0a" + "\x3c\x7d\x49\x25\x01\x41\x6b\x96" + "\xa9\x12\xbb\x0b\xc0\xd7\xd0\x93" + "\x1f\x70\x38\xb8\x21\xee\xf6\xa7" + "\xee\xeb\xe7\x81\xa4\x13\xb4\x87" + "\xfa\xc1\xb0\xb5\x37\x8b\x74\xa2" + "\x4e\xc7\xc2\xad\x3d\x62\x3f\xf8" + "\x34\x42\xe5\xae\x45\x13\x63\xfe" + "\xfc\x2a\x17\x46\x61\xa9\xd3\x1c" + "\x4c\xaf\xf0\x09\x62\x26\x66\x1e" + "\x74\xcf\xd6\x68\x3d\x7d\xd8\xb7" + "\xe7\xe6\xf8\xf0\x08\x20\xf7\x47" + "\x1c\x52\xaa\x0f\x3e\x21\xa3\xf2" + "\xbf\x2f\x95\x16\xa8\xc8\xc8\x8c" + "\x99\x0f\x5d\xfb\xfa\x2b\x58\x8a" + "\x7e\xd6\x74\x02\x60\xf0\xd0\x5b" + "\x65\xa8\xac\xea\x8d\x68\x46\x34" + "\x26\x9d\x4f\xb1\x9a\x8e\xc0\x1a" + "\xf1\xed\xc6\x7a\x83\xfd\x8a\x57" + "\xf2\xe6\xe4\xba\xfc\xc6\x3c\xad" + "\x5b\x19\x50\x2f\x3a\xcc\x06\x46" + "\x04\x51\x3f\x91\x97\xf0\xd2\x07" + "\xe7\x93\x89\x7e\xb5\x32\x0f\x03" + "\xe5\x58\x9e\x74\x72\xeb\xc2\x38" + "\x00\x0c\x91\x72\x69\xed\x7d\x6d" + "\xc8\x71\xf0\xec\xff\x80\xd9\x1c" + "\x9e\xd2\xfa\x15\xfc\x6c\x4e\xbc" + "\xb1\xa6\xbd\xbd\x70\x40\xca\x20" + "\xb8\x78\xd2\xa3\xc6\xf3\x79\x9c" + "\xc7\x27\xe1\x6a\x29\xad\xa4\x03", + .len = 512, + } +}; + /* * CTS (Cipher Text Stealing) mode tests */ -- cgit v1.2.3-59-g8ed1b From cee7a36ecb5bafef8c87fb2c10641e6125044154 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Tue, 20 Nov 2018 17:30:48 +0100 Subject: crypto: x86/chacha20 - Add a 8-block AVX-512VL variant This variant is similar to the AVX2 version, but benefits from the AVX-512 rotate instructions and the additional registers, so it can operate without any data on the stack. It uses ymm registers only to avoid the massive core throttling on Skylake-X platforms. Nontheless does it bring a ~30% speed improvement compared to the AVX2 variant for random encryption lengths. The AVX2 version uses "rep movsb" for partial block XORing via the stack. With AVX-512, the new "vmovdqu8" can do this much more efficiently. The associated "kmov" instructions to work with dynamic masks is not part of the AVX-512VL instruction set, hence we depend on AVX-512BW as well. Given that the major AVX-512VL architectures provide AVX-512BW and this extension does not affect core clocking, this seems to be no problem at least for now. Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 5 + arch/x86/crypto/chacha20-avx512vl-x86_64.S | 396 +++++++++++++++++++++++++++++ arch/x86/crypto/chacha20_glue.c | 26 ++ 3 files changed, 427 insertions(+) create mode 100644 arch/x86/crypto/chacha20-avx512vl-x86_64.S diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index a4b0007a54e1..ce4e43642984 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -8,6 +8,7 @@ OBJECT_FILES_NON_STANDARD := y avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ $(comma)4)$(comma)%ymm2,yes,no) +avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no) sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no) sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no) @@ -103,6 +104,10 @@ ifeq ($(avx2_supported),yes) morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o endif +ifeq ($(avx512_supported),yes) + chacha20-x86_64-y += chacha20-avx512vl-x86_64.o +endif + aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S b/arch/x86/crypto/chacha20-avx512vl-x86_64.S new file mode 100644 index 000000000000..e1877afcaa73 --- /dev/null +++ b/arch/x86/crypto/chacha20-avx512vl-x86_64.S @@ -0,0 +1,396 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX-512VL functions + * + * Copyright (C) 2018 Martin Willi + */ + +#include + +.section .rodata.cst32.CTR8BL, "aM", @progbits, 32 +.align 32 +CTR8BL: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.text + +ENTRY(chacha20_8block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 8 data blocks output, o + # %rdx: up to 8 data blocks input, i + # %rcx: input/output length in bytes + + # This function encrypts eight consecutive ChaCha20 blocks by loading + # the state matrix in AVX registers eight times. Compared to AVX2, this + # mostly benefits from the new rotate instructions in VL and the + # additional registers. + + vzeroupper + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + + # x12 += counter values 0-3 + vpaddd CTR8BL(%rip),%ymm12,%ymm12 + + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm1,%ymm17 + vmovdqa64 %ymm2,%ymm18 + vmovdqa64 %ymm3,%ymm19 + vmovdqa64 %ymm4,%ymm20 + vmovdqa64 %ymm5,%ymm21 + vmovdqa64 %ymm6,%ymm22 + vmovdqa64 %ymm7,%ymm23 + vmovdqa64 %ymm8,%ymm24 + vmovdqa64 %ymm9,%ymm25 + vmovdqa64 %ymm10,%ymm26 + vmovdqa64 %ymm11,%ymm27 + vmovdqa64 %ymm12,%ymm28 + vmovdqa64 %ymm13,%ymm29 + vmovdqa64 %ymm14,%ymm30 + vmovdqa64 %ymm15,%ymm31 + + mov $10,%eax + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd %ymm0,%ymm4,%ymm0 + vpxord %ymm0,%ymm12,%ymm12 + vprold $16,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd %ymm1,%ymm5,%ymm1 + vpxord %ymm1,%ymm13,%ymm13 + vprold $16,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd %ymm2,%ymm6,%ymm2 + vpxord %ymm2,%ymm14,%ymm14 + vprold $16,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd %ymm3,%ymm7,%ymm3 + vpxord %ymm3,%ymm15,%ymm15 + vprold $16,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxord %ymm8,%ymm4,%ymm4 + vprold $12,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxord %ymm9,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd %ymm14,%ymm10,%ymm10 + vpxord %ymm10,%ymm6,%ymm6 + vprold $12,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxord %ymm11,%ymm7,%ymm7 + vprold $12,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd %ymm0,%ymm4,%ymm0 + vpxord %ymm0,%ymm12,%ymm12 + vprold $8,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd %ymm1,%ymm5,%ymm1 + vpxord %ymm1,%ymm13,%ymm13 + vprold $8,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd %ymm2,%ymm6,%ymm2 + vpxord %ymm2,%ymm14,%ymm14 + vprold $8,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd %ymm3,%ymm7,%ymm3 + vpxord %ymm3,%ymm15,%ymm15 + vprold $8,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxord %ymm8,%ymm4,%ymm4 + vprold $7,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxord %ymm9,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxord %ymm10,%ymm6,%ymm6 + vprold $7,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxord %ymm11,%ymm7,%ymm7 + vprold $7,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd %ymm0,%ymm5,%ymm0 + vpxord %ymm0,%ymm15,%ymm15 + vprold $16,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + vpaddd %ymm1,%ymm6,%ymm1 + vpxord %ymm1,%ymm12,%ymm12 + vprold $16,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd %ymm2,%ymm7,%ymm2 + vpxord %ymm2,%ymm13,%ymm13 + vprold $16,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd %ymm3,%ymm4,%ymm3 + vpxord %ymm3,%ymm14,%ymm14 + vprold $16,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxord %ymm10,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxord %ymm11,%ymm6,%ymm6 + vprold $12,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxord %ymm8,%ymm7,%ymm7 + vprold $12,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxord %ymm9,%ymm4,%ymm4 + vprold $12,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd %ymm0,%ymm5,%ymm0 + vpxord %ymm0,%ymm15,%ymm15 + vprold $8,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd %ymm1,%ymm6,%ymm1 + vpxord %ymm1,%ymm12,%ymm12 + vprold $8,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd %ymm2,%ymm7,%ymm2 + vpxord %ymm2,%ymm13,%ymm13 + vprold $8,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd %ymm3,%ymm4,%ymm3 + vpxord %ymm3,%ymm14,%ymm14 + vprold $8,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxord %ymm10,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxord %ymm11,%ymm6,%ymm6 + vprold $7,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + vpxord %ymm8,%ymm7,%ymm7 + vprold $7,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vpaddd %ymm14,%ymm9,%ymm9 + vpxord %ymm9,%ymm4,%ymm4 + vprold $7,%ymm4,%ymm4 + + dec %eax + jnz .Ldoubleround8 + + # x0..15[0-3] += s[0..15] + vpaddd %ymm16,%ymm0,%ymm0 + vpaddd %ymm17,%ymm1,%ymm1 + vpaddd %ymm18,%ymm2,%ymm2 + vpaddd %ymm19,%ymm3,%ymm3 + vpaddd %ymm20,%ymm4,%ymm4 + vpaddd %ymm21,%ymm5,%ymm5 + vpaddd %ymm22,%ymm6,%ymm6 + vpaddd %ymm23,%ymm7,%ymm7 + vpaddd %ymm24,%ymm8,%ymm8 + vpaddd %ymm25,%ymm9,%ymm9 + vpaddd %ymm26,%ymm10,%ymm10 + vpaddd %ymm27,%ymm11,%ymm11 + vpaddd %ymm28,%ymm12,%ymm12 + vpaddd %ymm29,%ymm13,%ymm13 + vpaddd %ymm30,%ymm14,%ymm14 + vpaddd %ymm31,%ymm15,%ymm15 + + # interleave 32-bit words in state n, n+1 + vpunpckldq %ymm1,%ymm0,%ymm16 + vpunpckhdq %ymm1,%ymm0,%ymm17 + vpunpckldq %ymm3,%ymm2,%ymm18 + vpunpckhdq %ymm3,%ymm2,%ymm19 + vpunpckldq %ymm5,%ymm4,%ymm20 + vpunpckhdq %ymm5,%ymm4,%ymm21 + vpunpckldq %ymm7,%ymm6,%ymm22 + vpunpckhdq %ymm7,%ymm6,%ymm23 + vpunpckldq %ymm9,%ymm8,%ymm24 + vpunpckhdq %ymm9,%ymm8,%ymm25 + vpunpckldq %ymm11,%ymm10,%ymm26 + vpunpckhdq %ymm11,%ymm10,%ymm27 + vpunpckldq %ymm13,%ymm12,%ymm28 + vpunpckhdq %ymm13,%ymm12,%ymm29 + vpunpckldq %ymm15,%ymm14,%ymm30 + vpunpckhdq %ymm15,%ymm14,%ymm31 + + # interleave 64-bit words in state n, n+2 + vpunpcklqdq %ymm18,%ymm16,%ymm0 + vpunpcklqdq %ymm19,%ymm17,%ymm1 + vpunpckhqdq %ymm18,%ymm16,%ymm2 + vpunpckhqdq %ymm19,%ymm17,%ymm3 + vpunpcklqdq %ymm22,%ymm20,%ymm4 + vpunpcklqdq %ymm23,%ymm21,%ymm5 + vpunpckhqdq %ymm22,%ymm20,%ymm6 + vpunpckhqdq %ymm23,%ymm21,%ymm7 + vpunpcklqdq %ymm26,%ymm24,%ymm8 + vpunpcklqdq %ymm27,%ymm25,%ymm9 + vpunpckhqdq %ymm26,%ymm24,%ymm10 + vpunpckhqdq %ymm27,%ymm25,%ymm11 + vpunpcklqdq %ymm30,%ymm28,%ymm12 + vpunpcklqdq %ymm31,%ymm29,%ymm13 + vpunpckhqdq %ymm30,%ymm28,%ymm14 + vpunpckhqdq %ymm31,%ymm29,%ymm15 + + # interleave 128-bit words in state n, n+4 + # xor/write first four blocks + vmovdqa64 %ymm0,%ymm16 + vperm2i128 $0x20,%ymm4,%ymm0,%ymm0 + cmp $0x0020,%rcx + jl .Lxorpart8 + vpxord 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0000(%rsi) + vmovdqa64 %ymm16,%ymm0 + vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 + + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + cmp $0x0040,%rcx + jl .Lxorpart8 + vpxord 0x0020(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0020(%rsi) + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + + vperm2i128 $0x20,%ymm6,%ymm2,%ymm0 + cmp $0x0060,%rcx + jl .Lxorpart8 + vpxord 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0040(%rsi) + vperm2i128 $0x31,%ymm6,%ymm2,%ymm6 + + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + cmp $0x0080,%rcx + jl .Lxorpart8 + vpxord 0x0060(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0060(%rsi) + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + + vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 + cmp $0x00a0,%rcx + jl .Lxorpart8 + vpxord 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0080(%rsi) + vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 + + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + cmp $0x00c0,%rcx + jl .Lxorpart8 + vpxord 0x00a0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00a0(%rsi) + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + + vperm2i128 $0x20,%ymm7,%ymm3,%ymm0 + cmp $0x00e0,%rcx + jl .Lxorpart8 + vpxord 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00c0(%rsi) + vperm2i128 $0x31,%ymm7,%ymm3,%ymm7 + + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + cmp $0x0100,%rcx + jl .Lxorpart8 + vpxord 0x00e0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00e0(%rsi) + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + + # xor remaining blocks, write to output + vmovdqa64 %ymm4,%ymm0 + cmp $0x0120,%rcx + jl .Lxorpart8 + vpxord 0x0100(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0100(%rsi) + + vmovdqa64 %ymm12,%ymm0 + cmp $0x0140,%rcx + jl .Lxorpart8 + vpxord 0x0120(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0120(%rsi) + + vmovdqa64 %ymm6,%ymm0 + cmp $0x0160,%rcx + jl .Lxorpart8 + vpxord 0x0140(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0140(%rsi) + + vmovdqa64 %ymm14,%ymm0 + cmp $0x0180,%rcx + jl .Lxorpart8 + vpxord 0x0160(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0160(%rsi) + + vmovdqa64 %ymm5,%ymm0 + cmp $0x01a0,%rcx + jl .Lxorpart8 + vpxord 0x0180(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0180(%rsi) + + vmovdqa64 %ymm13,%ymm0 + cmp $0x01c0,%rcx + jl .Lxorpart8 + vpxord 0x01a0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01a0(%rsi) + + vmovdqa64 %ymm7,%ymm0 + cmp $0x01e0,%rcx + jl .Lxorpart8 + vpxord 0x01c0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01c0(%rsi) + + vmovdqa64 %ymm15,%ymm0 + cmp $0x0200,%rcx + jl .Lxorpart8 + vpxord 0x01e0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01e0(%rsi) + +.Ldone8: + vzeroupper + ret + +.Lxorpart8: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0x1f,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0x1f,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z} + vpxord %ymm0,%ymm1,%ymm1 + vmovdqu8 %ymm1,(%rsi,%r9){%k1} + + jmp .Ldone8 + +ENDPROC(chacha20_8block_xor_avx512vl) diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 1e9e66509226..6a67e70bc82a 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -31,6 +31,11 @@ asmlinkage void chacha20_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len); static bool chacha20_use_avx2; +#ifdef CONFIG_AS_AVX512 +asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, + unsigned int len); +static bool chacha20_use_avx512vl; +#endif #endif static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks) @@ -43,6 +48,22 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, unsigned int bytes) { #ifdef CONFIG_AS_AVX2 +#ifdef CONFIG_AS_AVX512 + if (chacha20_use_avx512vl) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha20_8block_xor_avx512vl(state, dst, src, bytes); + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha20_8block_xor_avx512vl(state, dst, src, bytes); + state[12] += chacha20_advance(bytes, 8); + return; + } + } +#endif if (chacha20_use_avx2) { while (bytes >= CHACHA_BLOCK_SIZE * 8) { chacha20_8block_xor_avx2(state, dst, src, bytes); @@ -149,6 +170,11 @@ static int __init chacha20_simd_mod_init(void) chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); +#ifdef CONFIG_AS_AVX512 + chacha20_use_avx512vl = chacha20_use_avx2 && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */ +#endif #endif return crypto_register_skcipher(&alg); } -- cgit v1.2.3-59-g8ed1b From 29a47b54e030efe308aa90e6c26a9ce7f5f84ed8 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Tue, 20 Nov 2018 17:30:49 +0100 Subject: crypto: x86/chacha20 - Add a 2-block AVX-512VL variant This version uses the same principle as the AVX2 version. It benefits from the AVX-512VL rotate instructions and the more efficient partial block handling using "vmovdqu8", resulting in a speedup of ~20%. Unlike the AVX2 version, it is faster than the single block SSSE3 version to process a single block. Hence we engage that function for (partial) single block lengths as well. Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20-avx512vl-x86_64.S | 171 +++++++++++++++++++++++++++++ arch/x86/crypto/chacha20_glue.c | 7 ++ 2 files changed, 178 insertions(+) diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S b/arch/x86/crypto/chacha20-avx512vl-x86_64.S index e1877afcaa73..261097578715 100644 --- a/arch/x86/crypto/chacha20-avx512vl-x86_64.S +++ b/arch/x86/crypto/chacha20-avx512vl-x86_64.S @@ -7,6 +7,11 @@ #include +.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 +.align 32 +CTR2BL: .octa 0x00000000000000000000000000000000 + .octa 0x00000000000000000000000000000001 + .section .rodata.cst32.CTR8BL, "aM", @progbits, 32 .align 32 CTR8BL: .octa 0x00000003000000020000000100000000 @@ -14,6 +19,172 @@ CTR8BL: .octa 0x00000003000000020000000100000000 .text +ENTRY(chacha20_2block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 2 data blocks output, o + # %rdx: up to 2 data blocks input, i + # %rcx: input/output length in bytes + + # This function encrypts two ChaCha20 blocks by loading the state + # matrix twice across four AVX registers. It performs matrix operations + # on four words in each matrix in parallel, but requires shuffling to + # rearrange the words after each round. + + vzeroupper + + # x0..3[0-2] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + + vmovdqa %ymm0,%ymm8 + vmovdqa %ymm1,%ymm9 + vmovdqa %ymm2,%ymm10 + vmovdqa %ymm3,%ymm11 + + mov $10,%rax + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + + dec %rax + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + vpaddd %ymm8,%ymm0,%ymm7 + cmp $0x10,%rcx + jl .Lxorpart2 + vpxord 0x00(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x00(%rsi) + vextracti128 $1,%ymm7,%xmm0 + # o1 = i1 ^ (x1 + s1) + vpaddd %ymm9,%ymm1,%ymm7 + cmp $0x20,%rcx + jl .Lxorpart2 + vpxord 0x10(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x10(%rsi) + vextracti128 $1,%ymm7,%xmm1 + # o2 = i2 ^ (x2 + s2) + vpaddd %ymm10,%ymm2,%ymm7 + cmp $0x30,%rcx + jl .Lxorpart2 + vpxord 0x20(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x20(%rsi) + vextracti128 $1,%ymm7,%xmm2 + # o3 = i3 ^ (x3 + s3) + vpaddd %ymm11,%ymm3,%ymm7 + cmp $0x40,%rcx + jl .Lxorpart2 + vpxord 0x30(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x30(%rsi) + vextracti128 $1,%ymm7,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm7 + cmp $0x50,%rcx + jl .Lxorpart2 + vpxord 0x40(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x40(%rsi) + + vmovdqa %xmm1,%xmm7 + cmp $0x60,%rcx + jl .Lxorpart2 + vpxord 0x50(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x50(%rsi) + + vmovdqa %xmm2,%xmm7 + cmp $0x70,%rcx + jl .Lxorpart2 + vpxord 0x60(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x60(%rsi) + + vmovdqa %xmm3,%xmm7 + cmp $0x80,%rcx + jl .Lxorpart2 + vpxord 0x70(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x70(%rsi) + +.Ldone2: + vzeroupper + ret + +.Lxorpart2: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0xf,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0xf,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} + vpxord %xmm7,%xmm1,%xmm1 + vmovdqu8 %xmm1,(%rsi,%r9){%k1} + + jmp .Ldone2 + +ENDPROC(chacha20_2block_xor_avx512vl) + ENTRY(chacha20_8block_xor_avx512vl) # %rdi: Input state matrix, s # %rsi: up to 8 data blocks output, o diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 6a67e70bc82a..d6a95a6a324e 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -32,6 +32,8 @@ asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len); static bool chacha20_use_avx2; #ifdef CONFIG_AS_AVX512 +asmlinkage void chacha20_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, + unsigned int len); asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len); static bool chacha20_use_avx512vl; @@ -62,6 +64,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, state[12] += chacha20_advance(bytes, 8); return; } + if (bytes) { + chacha20_2block_xor_avx512vl(state, dst, src, bytes); + state[12] += chacha20_advance(bytes, 2); + return; + } } #endif if (chacha20_use_avx2) { -- cgit v1.2.3-59-g8ed1b From 180def6c4ad139ae6f97953ae810092ace295d5b Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Tue, 20 Nov 2018 17:30:50 +0100 Subject: crypto: x86/chacha20 - Add a 4-block AVX-512VL variant This version uses the same principle as the AVX2 version by scheduling the operations for two block pairs in parallel. It benefits from the AVX-512VL rotate instructions and the more efficient partial block handling using "vmovdqu8", resulting in a speedup of the raw block function of ~20%. Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20-avx512vl-x86_64.S | 272 +++++++++++++++++++++++++++++ arch/x86/crypto/chacha20_glue.c | 7 + 2 files changed, 279 insertions(+) diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S b/arch/x86/crypto/chacha20-avx512vl-x86_64.S index 261097578715..55d34de29e3e 100644 --- a/arch/x86/crypto/chacha20-avx512vl-x86_64.S +++ b/arch/x86/crypto/chacha20-avx512vl-x86_64.S @@ -12,6 +12,11 @@ CTR2BL: .octa 0x00000000000000000000000000000000 .octa 0x00000000000000000000000000000001 +.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 +.align 32 +CTR4BL: .octa 0x00000000000000000000000000000002 + .octa 0x00000000000000000000000000000003 + .section .rodata.cst32.CTR8BL, "aM", @progbits, 32 .align 32 CTR8BL: .octa 0x00000003000000020000000100000000 @@ -185,6 +190,273 @@ ENTRY(chacha20_2block_xor_avx512vl) ENDPROC(chacha20_2block_xor_avx512vl) +ENTRY(chacha20_4block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + + # This function encrypts four ChaCha20 block by loading the state + # matrix four times across eight AVX registers. It performs matrix + # operations on four words in two matrices in parallel, sequentially + # to the operations on the four words of the other two matrices. The + # required word shuffling has a rather high latency, we can do the + # arithmetic on two matrix-pairs without much slowdown. + + vzeroupper + + # x0..3[0-4] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vmovdqa %ymm0,%ymm4 + vmovdqa %ymm1,%ymm5 + vmovdqa %ymm2,%ymm6 + vmovdqa %ymm3,%ymm7 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + vpaddd CTR4BL(%rip),%ymm7,%ymm7 + + vmovdqa %ymm0,%ymm11 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm3,%ymm14 + vmovdqa %ymm7,%ymm15 + + mov $10,%rax + +.Ldoubleround4: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $16,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + vpshufd $0x39,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + vpshufd $0x93,%ymm7,%ymm7 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $16,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + vpshufd $0x93,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + vpshufd $0x39,%ymm7,%ymm7 + + dec %rax + jnz .Ldoubleround4 + + # o0 = i0 ^ (x0 + s0), first block + vpaddd %ymm11,%ymm0,%ymm10 + cmp $0x10,%rcx + jl .Lxorpart4 + vpxord 0x00(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x00(%rsi) + vextracti128 $1,%ymm10,%xmm0 + # o1 = i1 ^ (x1 + s1), first block + vpaddd %ymm12,%ymm1,%ymm10 + cmp $0x20,%rcx + jl .Lxorpart4 + vpxord 0x10(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x10(%rsi) + vextracti128 $1,%ymm10,%xmm1 + # o2 = i2 ^ (x2 + s2), first block + vpaddd %ymm13,%ymm2,%ymm10 + cmp $0x30,%rcx + jl .Lxorpart4 + vpxord 0x20(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x20(%rsi) + vextracti128 $1,%ymm10,%xmm2 + # o3 = i3 ^ (x3 + s3), first block + vpaddd %ymm14,%ymm3,%ymm10 + cmp $0x40,%rcx + jl .Lxorpart4 + vpxord 0x30(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x30(%rsi) + vextracti128 $1,%ymm10,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm10 + cmp $0x50,%rcx + jl .Lxorpart4 + vpxord 0x40(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x40(%rsi) + + vmovdqa %xmm1,%xmm10 + cmp $0x60,%rcx + jl .Lxorpart4 + vpxord 0x50(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x50(%rsi) + + vmovdqa %xmm2,%xmm10 + cmp $0x70,%rcx + jl .Lxorpart4 + vpxord 0x60(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x60(%rsi) + + vmovdqa %xmm3,%xmm10 + cmp $0x80,%rcx + jl .Lxorpart4 + vpxord 0x70(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x70(%rsi) + + # o0 = i0 ^ (x0 + s0), third block + vpaddd %ymm11,%ymm4,%ymm10 + cmp $0x90,%rcx + jl .Lxorpart4 + vpxord 0x80(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x80(%rsi) + vextracti128 $1,%ymm10,%xmm4 + # o1 = i1 ^ (x1 + s1), third block + vpaddd %ymm12,%ymm5,%ymm10 + cmp $0xa0,%rcx + jl .Lxorpart4 + vpxord 0x90(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x90(%rsi) + vextracti128 $1,%ymm10,%xmm5 + # o2 = i2 ^ (x2 + s2), third block + vpaddd %ymm13,%ymm6,%ymm10 + cmp $0xb0,%rcx + jl .Lxorpart4 + vpxord 0xa0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xa0(%rsi) + vextracti128 $1,%ymm10,%xmm6 + # o3 = i3 ^ (x3 + s3), third block + vpaddd %ymm15,%ymm7,%ymm10 + cmp $0xc0,%rcx + jl .Lxorpart4 + vpxord 0xb0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xb0(%rsi) + vextracti128 $1,%ymm10,%xmm7 + + # xor and write fourth block + vmovdqa %xmm4,%xmm10 + cmp $0xd0,%rcx + jl .Lxorpart4 + vpxord 0xc0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xc0(%rsi) + + vmovdqa %xmm5,%xmm10 + cmp $0xe0,%rcx + jl .Lxorpart4 + vpxord 0xd0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xd0(%rsi) + + vmovdqa %xmm6,%xmm10 + cmp $0xf0,%rcx + jl .Lxorpart4 + vpxord 0xe0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xe0(%rsi) + + vmovdqa %xmm7,%xmm10 + cmp $0x100,%rcx + jl .Lxorpart4 + vpxord 0xf0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xf0(%rsi) + +.Ldone4: + vzeroupper + ret + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0xf,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0xf,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} + vpxord %xmm10,%xmm1,%xmm1 + vmovdqu8 %xmm1,(%rsi,%r9){%k1} + + jmp .Ldone4 + +ENDPROC(chacha20_4block_xor_avx512vl) + ENTRY(chacha20_8block_xor_avx512vl) # %rdi: Input state matrix, s # %rsi: up to 8 data blocks output, o diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index d6a95a6a324e..773d075a1483 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -34,6 +34,8 @@ static bool chacha20_use_avx2; #ifdef CONFIG_AS_AVX512 asmlinkage void chacha20_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len); +asmlinkage void chacha20_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, + unsigned int len); asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len); static bool chacha20_use_avx512vl; @@ -64,6 +66,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, state[12] += chacha20_advance(bytes, 8); return; } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha20_4block_xor_avx512vl(state, dst, src, bytes); + state[12] += chacha20_advance(bytes, 4); + return; + } if (bytes) { chacha20_2block_xor_avx512vl(state, dst, src, bytes); state[12] += chacha20_advance(bytes, 2); -- cgit v1.2.3-59-g8ed1b From 4bede34c1aa19266628b13f68db95ec4a16a6f38 Mon Sep 17 00:00:00 2001 From: "Nagadheeraj, Rottela" Date: Wed, 21 Nov 2018 07:36:58 +0000 Subject: crypto: cavium/nitrox - crypto request format changes nitrox_skcipher_crypt() will do the necessary formatting/ordering of input and output sglists based on the algorithm requirements. It will also accommodate the mandatory output buffers required for NITROX hardware like Output request headers (ORH) and Completion headers. Signed-off-by: Nagadheeraj Rottela Reviewed-by: Srikanth Jampala Signed-off-by: Herbert Xu --- drivers/crypto/cavium/nitrox/nitrox_algs.c | 111 ++++++++++- drivers/crypto/cavium/nitrox/nitrox_req.h | 94 ++++++---- drivers/crypto/cavium/nitrox/nitrox_reqmgr.c | 266 ++++++--------------------- 3 files changed, 227 insertions(+), 244 deletions(-) diff --git a/drivers/crypto/cavium/nitrox/nitrox_algs.c b/drivers/crypto/cavium/nitrox/nitrox_algs.c index 5d54ebc20cb3..10075a97ff0d 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_algs.c +++ b/drivers/crypto/cavium/nitrox/nitrox_algs.c @@ -155,13 +155,109 @@ static int nitrox_aes_setkey(struct crypto_skcipher *cipher, const u8 *key, return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen); } +static int alloc_src_sglist(struct skcipher_request *skreq, int ivsize) +{ + struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq); + int nents = sg_nents(skreq->src) + 1; + struct se_crypto_request *creq = &nkreq->creq; + char *iv; + struct scatterlist *sg; + + /* Allocate buffer to hold IV and input scatterlist array */ + nkreq->src = alloc_req_buf(nents, ivsize, creq->gfp); + if (!nkreq->src) + return -ENOMEM; + + /* copy iv */ + iv = nkreq->src; + memcpy(iv, skreq->iv, ivsize); + + sg = (struct scatterlist *)(iv + ivsize); + creq->src = sg; + sg_init_table(sg, nents); + + /* Input format: + * +----+----------------+ + * | IV | SRC sg entries | + * +----+----------------+ + */ + + /* IV */ + sg = create_single_sg(sg, iv, ivsize); + /* SRC entries */ + create_multi_sg(sg, skreq->src); + + return 0; +} + +static int alloc_dst_sglist(struct skcipher_request *skreq, int ivsize) +{ + struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq); + int nents = sg_nents(skreq->dst) + 3; + int extralen = ORH_HLEN + COMP_HLEN; + struct se_crypto_request *creq = &nkreq->creq; + struct scatterlist *sg; + char *iv = nkreq->src; + + /* Allocate buffer to hold ORH, COMPLETION and output scatterlist + * array + */ + nkreq->dst = alloc_req_buf(nents, extralen, creq->gfp); + if (!nkreq->dst) + return -ENOMEM; + + creq->orh = (u64 *)(nkreq->dst); + set_orh_value(creq->orh); + + creq->comp = (u64 *)(nkreq->dst + ORH_HLEN); + set_comp_value(creq->comp); + + sg = (struct scatterlist *)(nkreq->dst + ORH_HLEN + COMP_HLEN); + creq->dst = sg; + sg_init_table(sg, nents); + + /* Output format: + * +-----+----+----------------+-----------------+ + * | ORH | IV | DST sg entries | COMPLETION Bytes| + * +-----+----+----------------+-----------------+ + */ + + /* ORH */ + sg = create_single_sg(sg, creq->orh, ORH_HLEN); + /* IV */ + sg = create_single_sg(sg, iv, ivsize); + /* DST entries */ + sg = create_multi_sg(sg, skreq->dst); + /* COMPLETION Bytes */ + create_single_sg(sg, creq->comp, COMP_HLEN); + + return 0; +} + +static void free_src_sglist(struct skcipher_request *skreq) +{ + struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq); + + kfree(nkreq->src); +} + +static void free_dst_sglist(struct skcipher_request *skreq) +{ + struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq); + + kfree(nkreq->dst); +} + static void nitrox_skcipher_callback(struct skcipher_request *skreq, int err) { + free_src_sglist(skreq); + free_dst_sglist(skreq); if (err) { pr_err_ratelimited("request failed status 0x%0x\n", err); err = -EINVAL; } + skcipher_request_complete(skreq, err); } @@ -172,6 +268,7 @@ static int nitrox_skcipher_crypt(struct skcipher_request *skreq, bool enc) struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq); int ivsize = crypto_skcipher_ivsize(cipher); struct se_crypto_request *creq; + int ret; creq = &nkreq->creq; creq->flags = skreq->base.flags; @@ -192,11 +289,15 @@ static int nitrox_skcipher_crypt(struct skcipher_request *skreq, bool enc) creq->ctx_handle = nctx->u.ctx_handle; creq->ctrl.s.ctxl = sizeof(struct flexi_crypto_context); - /* copy the iv */ - memcpy(creq->iv, skreq->iv, ivsize); - creq->ivsize = ivsize; - creq->src = skreq->src; - creq->dst = skreq->dst; + ret = alloc_src_sglist(skreq, ivsize); + if (ret) + return ret; + + ret = alloc_dst_sglist(skreq, ivsize); + if (ret) { + free_src_sglist(skreq); + return ret; + } nkreq->nctx = nctx; nkreq->skreq = skreq; diff --git a/drivers/crypto/cavium/nitrox/nitrox_req.h b/drivers/crypto/cavium/nitrox/nitrox_req.h index 19f0a20e3bb3..d45ff91c19a9 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_req.h +++ b/drivers/crypto/cavium/nitrox/nitrox_req.h @@ -7,6 +7,8 @@ #include "nitrox_dev.h" +#define PENDING_SIG 0xFFFFFFFFFFFFFFFFUL + /** * struct gphdr - General purpose Header * @param0: first parameter. @@ -46,13 +48,6 @@ union se_req_ctrl { } s; }; -struct nitrox_sglist { - u16 len; - u16 raz0; - u32 raz1; - dma_addr_t dma; -}; - #define MAX_IV_LEN 16 /** @@ -62,8 +57,10 @@ struct nitrox_sglist { * @ctx_handle: Crypto context handle. * @gph: GP Header * @ctrl: Request Information. - * @in: Input sglist - * @out: Output sglist + * @orh: ORH address + * @comp: completion address + * @src: Input sglist + * @dst: Output sglist */ struct se_crypto_request { u8 opcode; @@ -73,9 +70,8 @@ struct se_crypto_request { struct gphdr gph; union se_req_ctrl ctrl; - - u8 iv[MAX_IV_LEN]; - u16 ivsize; + u64 *orh; + u64 *comp; struct scatterlist *src; struct scatterlist *dst; @@ -200,6 +196,8 @@ struct nitrox_kcrypt_request { struct se_crypto_request creq; struct nitrox_crypto_ctx *nctx; struct skcipher_request *skreq; + u8 *src; + u8 *dst; }; /** @@ -376,26 +374,19 @@ struct nitrox_sgcomp { /* * strutct nitrox_sgtable - SG list information - * @map_cnt: Number of buffers mapped - * @nr_comp: Number of sglist components + * @sgmap_cnt: Number of buffers mapped * @total_bytes: Total bytes in sglist. - * @len: Total sglist components length. - * @dma: DMA address of sglist component. - * @dir: DMA direction. - * @buf: crypto request buffer. - * @sglist: SG list of input/output buffers. + * @sgcomp_len: Total sglist components length. + * @sgcomp_dma: DMA address of sglist component. + * @sg: crypto request buffer. * @sgcomp: sglist component for NITROX. */ struct nitrox_sgtable { - u8 map_bufs_cnt; - u8 nr_sgcomp; + u8 sgmap_cnt; u16 total_bytes; - u32 len; - dma_addr_t dma; - enum dma_data_direction dir; - - struct scatterlist *buf; - struct nitrox_sglist *sglist; + u32 sgcomp_len; + dma_addr_t sgcomp_dma; + struct scatterlist *sg; struct nitrox_sgcomp *sgcomp; }; @@ -405,10 +396,8 @@ struct nitrox_sgtable { #define COMP_HLEN 8 struct resp_hdr { - u64 orh; - dma_addr_t orh_dma; - u64 completion; - dma_addr_t completion_dma; + u64 *orh; + u64 *completion; }; typedef void (*completion_t)(struct skcipher_request *skreq, int err); @@ -434,7 +423,6 @@ struct nitrox_softreq { u32 flags; gfp_t gfp; atomic_t status; - bool inplace; struct nitrox_device *ndev; struct nitrox_cmdq *cmdq; @@ -450,4 +438,46 @@ struct nitrox_softreq { struct skcipher_request *skreq; }; +static inline void *alloc_req_buf(int nents, int extralen, gfp_t gfp) +{ + size_t size; + + size = sizeof(struct scatterlist) * nents; + size += extralen; + + return kzalloc(size, gfp); +} + +static inline struct scatterlist *create_single_sg(struct scatterlist *sg, + void *buf, int buflen) +{ + sg_set_buf(sg, buf, buflen); + sg++; + return sg; +} + +static inline struct scatterlist *create_multi_sg(struct scatterlist *to_sg, + struct scatterlist *from_sg) +{ + struct scatterlist *sg; + int i; + + for_each_sg(from_sg, sg, sg_nents(from_sg), i) { + sg_set_buf(to_sg, sg_virt(sg), sg->length); + to_sg++; + } + + return to_sg; +} + +static inline void set_orh_value(u64 *orh) +{ + WRITE_ONCE(*orh, PENDING_SIG); +} + +static inline void set_comp_value(u64 *comp) +{ + WRITE_ONCE(*comp, PENDING_SIG); +} + #endif /* __NITROX_REQ_H */ diff --git a/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c b/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c index 3987cd84c033..d566bb904ec2 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c +++ b/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c @@ -13,7 +13,6 @@ #define FDATA_SIZE 32 /* Base destination port for the solicited requests */ #define SOLICIT_BASE_DPORT 256 -#define PENDING_SIG 0xFFFFFFFFFFFFFFFFUL #define REQ_NOT_POSTED 1 #define REQ_BACKLOG 2 @@ -52,58 +51,26 @@ static inline int incr_index(int index, int count, int max) return index; } -/** - * dma_free_sglist - unmap and free the sg lists. - * @ndev: N5 device - * @sgtbl: SG table - */ static void softreq_unmap_sgbufs(struct nitrox_softreq *sr) { struct nitrox_device *ndev = sr->ndev; struct device *dev = DEV(ndev); - struct nitrox_sglist *sglist; - - /* unmap in sgbuf */ - sglist = sr->in.sglist; - if (!sglist) - goto out_unmap; - - /* unmap iv */ - dma_unmap_single(dev, sglist->dma, sglist->len, DMA_BIDIRECTIONAL); - /* unmpa src sglist */ - dma_unmap_sg(dev, sr->in.buf, (sr->in.map_bufs_cnt - 1), sr->in.dir); - /* unamp gather component */ - dma_unmap_single(dev, sr->in.dma, sr->in.len, DMA_TO_DEVICE); - kfree(sr->in.sglist); + + + dma_unmap_sg(dev, sr->in.sg, sr->in.sgmap_cnt, DMA_BIDIRECTIONAL); + dma_unmap_single(dev, sr->in.sgcomp_dma, sr->in.sgcomp_len, + DMA_TO_DEVICE); kfree(sr->in.sgcomp); - sr->in.sglist = NULL; - sr->in.buf = NULL; - sr->in.map_bufs_cnt = 0; - -out_unmap: - /* unmap out sgbuf */ - sglist = sr->out.sglist; - if (!sglist) - return; - - /* unmap orh */ - dma_unmap_single(dev, sr->resp.orh_dma, ORH_HLEN, sr->out.dir); - - /* unmap dst sglist */ - if (!sr->inplace) { - dma_unmap_sg(dev, sr->out.buf, (sr->out.map_bufs_cnt - 3), - sr->out.dir); - } - /* unmap completion */ - dma_unmap_single(dev, sr->resp.completion_dma, COMP_HLEN, sr->out.dir); + sr->in.sg = NULL; + sr->in.sgmap_cnt = 0; - /* unmap scatter component */ - dma_unmap_single(dev, sr->out.dma, sr->out.len, DMA_TO_DEVICE); - kfree(sr->out.sglist); + dma_unmap_sg(dev, sr->out.sg, sr->out.sgmap_cnt, + DMA_BIDIRECTIONAL); + dma_unmap_single(dev, sr->out.sgcomp_dma, sr->out.sgcomp_len, + DMA_TO_DEVICE); kfree(sr->out.sgcomp); - sr->out.sglist = NULL; - sr->out.buf = NULL; - sr->out.map_bufs_cnt = 0; + sr->out.sg = NULL; + sr->out.sgmap_cnt = 0; } static void softreq_destroy(struct nitrox_softreq *sr) @@ -116,7 +83,7 @@ static void softreq_destroy(struct nitrox_softreq *sr) * create_sg_component - create SG componets for N5 device. * @sr: Request structure * @sgtbl: SG table - * @nr_comp: total number of components required + * @map_nents: number of dma mapped entries * * Component structure * @@ -140,7 +107,7 @@ static int create_sg_component(struct nitrox_softreq *sr, { struct nitrox_device *ndev = sr->ndev; struct nitrox_sgcomp *sgcomp; - struct nitrox_sglist *sglist; + struct scatterlist *sg; dma_addr_t dma; size_t sz_comp; int i, j, nr_sgcomp; @@ -154,17 +121,15 @@ static int create_sg_component(struct nitrox_softreq *sr, return -ENOMEM; sgtbl->sgcomp = sgcomp; - sgtbl->nr_sgcomp = nr_sgcomp; - sglist = sgtbl->sglist; + sg = sgtbl->sg; /* populate device sg component */ for (i = 0; i < nr_sgcomp; i++) { - for (j = 0; j < 4; j++) { - sgcomp->len[j] = cpu_to_be16(sglist->len); - sgcomp->dma[j] = cpu_to_be64(sglist->dma); - sglist++; + for (j = 0; j < 4 && sg; j++) { + sgcomp[i].len[j] = cpu_to_be16(sg_dma_len(sg)); + sgcomp[i].dma[j] = cpu_to_be64(sg_dma_address(sg)); + sg = sg_next(sg); } - sgcomp++; } /* map the device sg component */ dma = dma_map_single(DEV(ndev), sgtbl->sgcomp, sz_comp, DMA_TO_DEVICE); @@ -174,8 +139,8 @@ static int create_sg_component(struct nitrox_softreq *sr, return -ENOMEM; } - sgtbl->dma = dma; - sgtbl->len = sz_comp; + sgtbl->sgcomp_dma = dma; + sgtbl->sgcomp_len = sz_comp; return 0; } @@ -193,66 +158,27 @@ static int dma_map_inbufs(struct nitrox_softreq *sr, { struct device *dev = DEV(sr->ndev); struct scatterlist *sg = req->src; - struct nitrox_sglist *glist; int i, nents, ret = 0; - dma_addr_t dma; - size_t sz; - nents = sg_nents(req->src); + nents = dma_map_sg(dev, req->src, sg_nents(req->src), + DMA_BIDIRECTIONAL); + if (!nents) + return -EINVAL; - /* creater gather list IV and src entries */ - sz = roundup((1 + nents), 4) * sizeof(*glist); - glist = kzalloc(sz, sr->gfp); - if (!glist) - return -ENOMEM; - - sr->in.sglist = glist; - /* map IV */ - dma = dma_map_single(dev, &req->iv, req->ivsize, DMA_BIDIRECTIONAL); - if (dma_mapping_error(dev, dma)) { - ret = -EINVAL; - goto iv_map_err; - } - - sr->in.dir = (req->src == req->dst) ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE; - /* map src entries */ - nents = dma_map_sg(dev, req->src, nents, sr->in.dir); - if (!nents) { - ret = -EINVAL; - goto src_map_err; - } - sr->in.buf = req->src; - - /* store the mappings */ - glist->len = req->ivsize; - glist->dma = dma; - glist++; - sr->in.total_bytes += req->ivsize; - - for_each_sg(req->src, sg, nents, i) { - glist->len = sg_dma_len(sg); - glist->dma = sg_dma_address(sg); - sr->in.total_bytes += glist->len; - glist++; - } - /* roundup map count to align with entires in sg component */ - sr->in.map_bufs_cnt = (1 + nents); + for_each_sg(req->src, sg, nents, i) + sr->in.total_bytes += sg_dma_len(sg); - /* create NITROX gather component */ - ret = create_sg_component(sr, &sr->in, sr->in.map_bufs_cnt); + sr->in.sg = req->src; + sr->in.sgmap_cnt = nents; + ret = create_sg_component(sr, &sr->in, sr->in.sgmap_cnt); if (ret) goto incomp_err; return 0; incomp_err: - dma_unmap_sg(dev, req->src, nents, sr->in.dir); - sr->in.map_bufs_cnt = 0; -src_map_err: - dma_unmap_single(dev, dma, req->ivsize, DMA_BIDIRECTIONAL); -iv_map_err: - kfree(sr->in.sglist); - sr->in.sglist = NULL; + dma_unmap_sg(dev, req->src, nents, DMA_BIDIRECTIONAL); + sr->in.sgmap_cnt = 0; return ret; } @@ -260,104 +186,25 @@ static int dma_map_outbufs(struct nitrox_softreq *sr, struct se_crypto_request *req) { struct device *dev = DEV(sr->ndev); - struct nitrox_sglist *glist = sr->in.sglist; - struct nitrox_sglist *slist; - struct scatterlist *sg; - int i, nents, map_bufs_cnt, ret = 0; - size_t sz; + int nents, ret = 0; - nents = sg_nents(req->dst); + nents = dma_map_sg(dev, req->dst, sg_nents(req->dst), + DMA_BIDIRECTIONAL); + if (!nents) + return -EINVAL; - /* create scatter list ORH, IV, dst entries and Completion header */ - sz = roundup((3 + nents), 4) * sizeof(*slist); - slist = kzalloc(sz, sr->gfp); - if (!slist) - return -ENOMEM; - - sr->out.sglist = slist; - sr->out.dir = DMA_BIDIRECTIONAL; - /* map ORH */ - sr->resp.orh_dma = dma_map_single(dev, &sr->resp.orh, ORH_HLEN, - sr->out.dir); - if (dma_mapping_error(dev, sr->resp.orh_dma)) { - ret = -EINVAL; - goto orh_map_err; - } - - /* map completion */ - sr->resp.completion_dma = dma_map_single(dev, &sr->resp.completion, - COMP_HLEN, sr->out.dir); - if (dma_mapping_error(dev, sr->resp.completion_dma)) { - ret = -EINVAL; - goto compl_map_err; - } - - sr->inplace = (req->src == req->dst) ? true : false; - /* out place */ - if (!sr->inplace) { - nents = dma_map_sg(dev, req->dst, nents, sr->out.dir); - if (!nents) { - ret = -EINVAL; - goto dst_map_err; - } - } - sr->out.buf = req->dst; - - /* store the mappings */ - /* orh */ - slist->len = ORH_HLEN; - slist->dma = sr->resp.orh_dma; - slist++; - - /* copy the glist mappings */ - if (sr->inplace) { - nents = sr->in.map_bufs_cnt - 1; - map_bufs_cnt = sr->in.map_bufs_cnt; - while (map_bufs_cnt--) { - slist->len = glist->len; - slist->dma = glist->dma; - slist++; - glist++; - } - } else { - /* copy iv mapping */ - slist->len = glist->len; - slist->dma = glist->dma; - slist++; - /* copy remaining maps */ - for_each_sg(req->dst, sg, nents, i) { - slist->len = sg_dma_len(sg); - slist->dma = sg_dma_address(sg); - slist++; - } - } - - /* completion */ - slist->len = COMP_HLEN; - slist->dma = sr->resp.completion_dma; - - sr->out.map_bufs_cnt = (3 + nents); - - ret = create_sg_component(sr, &sr->out, sr->out.map_bufs_cnt); + sr->out.sg = req->dst; + sr->out.sgmap_cnt = nents; + ret = create_sg_component(sr, &sr->out, sr->out.sgmap_cnt); if (ret) goto outcomp_map_err; return 0; outcomp_map_err: - if (!sr->inplace) - dma_unmap_sg(dev, req->dst, nents, sr->out.dir); - sr->out.map_bufs_cnt = 0; - sr->out.buf = NULL; -dst_map_err: - dma_unmap_single(dev, sr->resp.completion_dma, COMP_HLEN, sr->out.dir); - sr->resp.completion_dma = 0; -compl_map_err: - dma_unmap_single(dev, sr->resp.orh_dma, ORH_HLEN, sr->out.dir); - sr->resp.orh_dma = 0; -orh_map_err: - kfree(sr->out.sglist); - sr->out.sglist = NULL; + dma_unmap_sg(dev, req->dst, nents, DMA_BIDIRECTIONAL); + sr->out.sgmap_cnt = 0; + sr->out.sg = NULL; return ret; } @@ -556,8 +403,8 @@ int nitrox_process_se_request(struct nitrox_device *ndev, atomic_set(&sr->status, REQ_NOT_POSTED); - WRITE_ONCE(sr->resp.orh, PENDING_SIG); - WRITE_ONCE(sr->resp.completion, PENDING_SIG); + sr->resp.orh = req->orh; + sr->resp.completion = req->comp; ret = softreq_map_iobuf(sr, req); if (ret) { @@ -598,13 +445,13 @@ int nitrox_process_se_request(struct nitrox_device *ndev, /* fill the packet instruction */ /* word 0 */ - sr->instr.dptr0 = cpu_to_be64(sr->in.dma); + sr->instr.dptr0 = cpu_to_be64(sr->in.sgcomp_dma); /* word 1 */ sr->instr.ih.value = 0; sr->instr.ih.s.g = 1; - sr->instr.ih.s.gsz = sr->in.map_bufs_cnt; - sr->instr.ih.s.ssz = sr->out.map_bufs_cnt; + sr->instr.ih.s.gsz = sr->in.sgmap_cnt; + sr->instr.ih.s.ssz = sr->out.sgmap_cnt; sr->instr.ih.s.fsz = FDATA_SIZE + sizeof(struct gphdr); sr->instr.ih.s.tlen = sr->instr.ih.s.fsz + sr->in.total_bytes; sr->instr.ih.value = cpu_to_be64(sr->instr.ih.value); @@ -626,11 +473,11 @@ int nitrox_process_se_request(struct nitrox_device *ndev, /* word 4 */ sr->instr.slc.value[0] = 0; - sr->instr.slc.s.ssz = sr->out.map_bufs_cnt; + sr->instr.slc.s.ssz = sr->out.sgmap_cnt; sr->instr.slc.value[0] = cpu_to_be64(sr->instr.slc.value[0]); /* word 5 */ - sr->instr.slc.s.rptr = cpu_to_be64(sr->out.dma); + sr->instr.slc.s.rptr = cpu_to_be64(sr->out.sgcomp_dma); /* * No conversion for front data, @@ -664,6 +511,11 @@ void backlog_qflush_work(struct work_struct *work) post_backlog_cmds(cmdq); } +static bool sr_completed(struct nitrox_softreq *sr) +{ + return (READ_ONCE(*sr->resp.orh) != READ_ONCE(*sr->resp.completion)); +} + /** * process_request_list - process completed requests * @ndev: N5 device @@ -691,13 +543,13 @@ static void process_response_list(struct nitrox_cmdq *cmdq) break; /* check orh and completion bytes updates */ - if (READ_ONCE(sr->resp.orh) == READ_ONCE(sr->resp.completion)) { + if (!sr_completed(sr)) { /* request not completed, check for timeout */ if (!cmd_timeout(sr->tstamp, ndev->timeout)) break; dev_err_ratelimited(DEV(ndev), "Request timeout, orh 0x%016llx\n", - READ_ONCE(sr->resp.orh)); + READ_ONCE(*sr->resp.orh)); } atomic_dec(&cmdq->pending_count); atomic64_inc(&ndev->stats.completed); @@ -710,7 +562,7 @@ static void process_response_list(struct nitrox_cmdq *cmdq) skreq = sr->skreq; /* ORH error code */ - err = READ_ONCE(sr->resp.orh) & 0xff; + err = READ_ONCE(*sr->resp.orh) & 0xff; softreq_destroy(sr); if (callback) -- cgit v1.2.3-59-g8ed1b From 7a027b57f959660547779b6290a3b50ff1601980 Mon Sep 17 00:00:00 2001 From: "Srikanth, Jampala" Date: Wed, 21 Nov 2018 09:52:24 +0000 Subject: crypto: cavium/nitrox - Enable interrups for PF in SR-IOV mode. Enable the available interrupt vectors for PF in SR-IOV Mode. Only single vector entry 192 is valid of PF. This is used to notify any hardware errors and mailbox messages from VF(s). Signed-off-by: Srikanth Jampala Signed-off-by: Herbert Xu --- drivers/crypto/cavium/nitrox/nitrox_dev.h | 15 +++++- drivers/crypto/cavium/nitrox/nitrox_isr.c | 84 ++++++++++++++++++++++++++++- drivers/crypto/cavium/nitrox/nitrox_isr.h | 2 + drivers/crypto/cavium/nitrox/nitrox_sriov.c | 51 +++++++++++++++--- 4 files changed, 142 insertions(+), 10 deletions(-) diff --git a/drivers/crypto/cavium/nitrox/nitrox_dev.h b/drivers/crypto/cavium/nitrox/nitrox_dev.h index 283e252385fb..247df32f687c 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_dev.h +++ b/drivers/crypto/cavium/nitrox/nitrox_dev.h @@ -103,6 +103,16 @@ struct nitrox_q_vector { }; }; +/** + * struct nitrox_iov - SR-IOV information + * @num_vfs: number of VF(s) enabled + * @msix: MSI-X for PF in SR-IOV case + */ +struct nitrox_iov { + int num_vfs; + struct msix_entry msix; +}; + /* * NITROX Device states */ @@ -150,6 +160,9 @@ enum vf_mode { * @ctx_pool: DMA pool for crypto context * @pkt_inq: Packet input rings * @qvec: MSI-X queue vectors information + * @iov: SR-IOV informatin + * @num_vecs: number of MSI-X vectors + * @stats: request statistics * @hw: hardware information * @debugfs_dir: debugfs directory */ @@ -168,13 +181,13 @@ struct nitrox_device { int node; u16 qlen; u16 nr_queues; - int num_vfs; enum vf_mode mode; struct dma_pool *ctx_pool; struct nitrox_cmdq *pkt_inq; struct nitrox_q_vector *qvec; + struct nitrox_iov iov; int num_vecs; struct nitrox_stats stats; diff --git a/drivers/crypto/cavium/nitrox/nitrox_isr.c b/drivers/crypto/cavium/nitrox/nitrox_isr.c index 88a77b8fb3fb..c5b797808680 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_isr.c +++ b/drivers/crypto/cavium/nitrox/nitrox_isr.c @@ -13,6 +13,7 @@ * - NPS packet ring, AQMQ ring and ZQMQ ring */ #define NR_RING_VECTORS 3 +#define NR_NON_RING_VECTORS 1 /* base entry for packet ring/port */ #define PKT_RING_MSIX_BASE 0 #define NON_RING_MSIX_BASE 192 @@ -275,6 +276,7 @@ void nitrox_unregister_interrupts(struct nitrox_device *ndev) qvec->valid = false; } kfree(ndev->qvec); + ndev->qvec = NULL; pci_free_irq_vectors(pdev); } @@ -321,6 +323,7 @@ int nitrox_register_interrupts(struct nitrox_device *ndev) if (qvec->ring >= ndev->nr_queues) break; + qvec->cmdq = &ndev->pkt_inq[qvec->ring]; snprintf(qvec->name, IRQ_NAMESZ, "nitrox-pkt%d", qvec->ring); /* get the vector number */ vec = pci_irq_vector(pdev, i); @@ -335,13 +338,13 @@ int nitrox_register_interrupts(struct nitrox_device *ndev) tasklet_init(&qvec->resp_tasklet, pkt_slc_resp_tasklet, (unsigned long)qvec); - qvec->cmdq = &ndev->pkt_inq[qvec->ring]; qvec->valid = true; } /* request irqs for non ring vectors */ i = NON_RING_MSIX_BASE; qvec = &ndev->qvec[i]; + qvec->ndev = ndev; snprintf(qvec->name, IRQ_NAMESZ, "nitrox-core-int%d", i); /* get the vector number */ @@ -356,7 +359,6 @@ int nitrox_register_interrupts(struct nitrox_device *ndev) tasklet_init(&qvec->resp_tasklet, nps_core_int_tasklet, (unsigned long)qvec); - qvec->ndev = ndev; qvec->valid = true; return 0; @@ -365,3 +367,81 @@ irq_fail: nitrox_unregister_interrupts(ndev); return ret; } + +void nitrox_sriov_unregister_interrupts(struct nitrox_device *ndev) +{ + struct pci_dev *pdev = ndev->pdev; + int i; + + for (i = 0; i < ndev->num_vecs; i++) { + struct nitrox_q_vector *qvec; + int vec; + + qvec = ndev->qvec + i; + if (!qvec->valid) + continue; + + vec = ndev->iov.msix.vector; + irq_set_affinity_hint(vec, NULL); + free_irq(vec, qvec); + + tasklet_disable(&qvec->resp_tasklet); + tasklet_kill(&qvec->resp_tasklet); + qvec->valid = false; + } + kfree(ndev->qvec); + ndev->qvec = NULL; + pci_disable_msix(pdev); +} + +int nitrox_sriov_register_interupts(struct nitrox_device *ndev) +{ + struct pci_dev *pdev = ndev->pdev; + struct nitrox_q_vector *qvec; + int vec, cpu; + int ret; + + /** + * only non ring vectors i.e Entry 192 is available + * for PF in SR-IOV mode. + */ + ndev->iov.msix.entry = NON_RING_MSIX_BASE; + ret = pci_enable_msix_exact(pdev, &ndev->iov.msix, NR_NON_RING_VECTORS); + if (ret) { + dev_err(DEV(ndev), "failed to allocate nps-core-int%d\n", + NON_RING_MSIX_BASE); + return ret; + } + + qvec = kcalloc(NR_NON_RING_VECTORS, sizeof(*qvec), GFP_KERNEL); + if (!qvec) { + pci_disable_msix(pdev); + return -ENOMEM; + } + qvec->ndev = ndev; + + ndev->qvec = qvec; + ndev->num_vecs = NR_NON_RING_VECTORS; + snprintf(qvec->name, IRQ_NAMESZ, "nitrox-core-int%d", + NON_RING_MSIX_BASE); + + vec = ndev->iov.msix.vector; + ret = request_irq(vec, nps_core_int_isr, 0, qvec->name, qvec); + if (ret) { + dev_err(DEV(ndev), "irq failed for nitrox-core-int%d\n", + NON_RING_MSIX_BASE); + goto iov_irq_fail; + } + cpu = num_online_cpus(); + irq_set_affinity_hint(vec, get_cpu_mask(cpu)); + + tasklet_init(&qvec->resp_tasklet, nps_core_int_tasklet, + (unsigned long)qvec); + qvec->valid = true; + + return 0; + +iov_irq_fail: + nitrox_sriov_unregister_interrupts(ndev); + return ret; +} diff --git a/drivers/crypto/cavium/nitrox/nitrox_isr.h b/drivers/crypto/cavium/nitrox/nitrox_isr.h index 63418a6cc52c..1062c9336c1f 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_isr.h +++ b/drivers/crypto/cavium/nitrox/nitrox_isr.h @@ -6,5 +6,7 @@ int nitrox_register_interrupts(struct nitrox_device *ndev); void nitrox_unregister_interrupts(struct nitrox_device *ndev); +int nitrox_sriov_register_interupts(struct nitrox_device *ndev); +void nitrox_sriov_unregister_interrupts(struct nitrox_device *ndev); #endif /* __NITROX_ISR_H */ diff --git a/drivers/crypto/cavium/nitrox/nitrox_sriov.c b/drivers/crypto/cavium/nitrox/nitrox_sriov.c index 30c0aa874583..7ba0cc5d6d02 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_sriov.c +++ b/drivers/crypto/cavium/nitrox/nitrox_sriov.c @@ -7,6 +7,10 @@ #include "nitrox_common.h" #include "nitrox_isr.h" +/** + * num_vfs_valid - validate VF count + * @num_vfs: number of VF(s) + */ static inline bool num_vfs_valid(int num_vfs) { bool valid = false; @@ -48,7 +52,7 @@ static inline enum vf_mode num_vfs_to_mode(int num_vfs) return mode; } -static void pf_sriov_cleanup(struct nitrox_device *ndev) +static void nitrox_pf_cleanup(struct nitrox_device *ndev) { /* PF has no queues in SR-IOV mode */ atomic_set(&ndev->state, __NDEV_NOT_READY); @@ -60,7 +64,11 @@ static void pf_sriov_cleanup(struct nitrox_device *ndev) nitrox_common_sw_cleanup(ndev); } -static int pf_sriov_init(struct nitrox_device *ndev) +/** + * nitrox_pf_reinit - re-initialize PF resources once SR-IOV is disabled + * @ndev: NITROX device + */ +static int nitrox_pf_reinit(struct nitrox_device *ndev) { int err; @@ -86,6 +94,18 @@ static int pf_sriov_init(struct nitrox_device *ndev) return nitrox_crypto_register(); } +static int nitrox_sriov_init(struct nitrox_device *ndev) +{ + /* register interrupts for PF in SR-IOV */ + return nitrox_sriov_register_interupts(ndev); +} + +static void nitrox_sriov_cleanup(struct nitrox_device *ndev) +{ + /* unregister interrupts for PF in SR-IOV */ + nitrox_sriov_unregister_interrupts(ndev); +} + static int nitrox_sriov_enable(struct pci_dev *pdev, int num_vfs) { struct nitrox_device *ndev = pci_get_drvdata(pdev); @@ -106,17 +126,31 @@ static int nitrox_sriov_enable(struct pci_dev *pdev, int num_vfs) } dev_info(DEV(ndev), "Enabled VF(s) %d\n", num_vfs); - ndev->num_vfs = num_vfs; + ndev->iov.num_vfs = num_vfs; ndev->mode = num_vfs_to_mode(num_vfs); /* set bit in flags */ set_bit(__NDEV_SRIOV_BIT, &ndev->flags); /* cleanup PF resources */ - pf_sriov_cleanup(ndev); + nitrox_pf_cleanup(ndev); - config_nps_core_vfcfg_mode(ndev, ndev->mode); + /* PF SR-IOV mode initialization */ + err = nitrox_sriov_init(ndev); + if (err) + goto iov_fail; + config_nps_core_vfcfg_mode(ndev, ndev->mode); return num_vfs; + +iov_fail: + pci_disable_sriov(pdev); + /* clear bit in flags */ + clear_bit(__NDEV_SRIOV_BIT, &ndev->flags); + ndev->iov.num_vfs = 0; + ndev->mode = __NDEV_MODE_PF; + /* reset back to working mode in PF */ + nitrox_pf_reinit(ndev); + return err; } static int nitrox_sriov_disable(struct pci_dev *pdev) @@ -134,12 +168,15 @@ static int nitrox_sriov_disable(struct pci_dev *pdev) /* clear bit in flags */ clear_bit(__NDEV_SRIOV_BIT, &ndev->flags); - ndev->num_vfs = 0; + ndev->iov.num_vfs = 0; ndev->mode = __NDEV_MODE_PF; + /* cleanup PF SR-IOV resources */ + nitrox_sriov_cleanup(ndev); + config_nps_core_vfcfg_mode(ndev, ndev->mode); - return pf_sriov_init(ndev); + return nitrox_pf_reinit(ndev); } int nitrox_sriov_configure(struct pci_dev *pdev, int num_vfs) -- cgit v1.2.3-59-g8ed1b From 1c876a90e25398a7396ff4de9074ab530e7892b4 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Tue, 13 Nov 2018 09:40:35 +0000 Subject: crypto: ccree - add support for CryptoCell 703 Add support for Arm TrustZone CryptoCell 703. The 703 is a variant of the CryptoCell 713 that supports only algorithms certified by the Chinesse Office of the State Commercial Cryptography Administration (OSCCA). Signed-off-by: Gilad Ben-Yossef Signed-off-by: Herbert Xu --- drivers/crypto/Kconfig | 2 +- drivers/crypto/ccree/cc_aead.c | 16 +++++++++++++++- drivers/crypto/ccree/cc_cipher.c | 38 +++++++++++++++++++++++++++++++++++++- drivers/crypto/ccree/cc_driver.c | 19 +++++++++++++++---- drivers/crypto/ccree/cc_driver.h | 8 ++++++++ drivers/crypto/ccree/cc_hash.c | 16 ++++++++++++++-- 6 files changed, 90 insertions(+), 9 deletions(-) diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 9f1baaec385f..d80751d48cf1 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -767,7 +767,7 @@ config CRYPTO_DEV_CCREE help Say 'Y' to enable a driver for the REE interface of the Arm TrustZone CryptoCell family of processors. Currently the - CryptoCell 713, 712, 710 and 630 are supported. + CryptoCell 713, 703, 712, 710 and 630 are supported. Choose this if you wish to use hardware acceleration of cryptographic operations on the system REE. If unsure say Y. diff --git a/drivers/crypto/ccree/cc_aead.c b/drivers/crypto/ccree/cc_aead.c index e0ac376ceb74..f2643cda45db 100644 --- a/drivers/crypto/ccree/cc_aead.c +++ b/drivers/crypto/ccree/cc_aead.c @@ -2367,6 +2367,7 @@ static struct cc_alg_template aead_algs[] = { .flow_mode = S_DIN_to_AES, .auth_mode = DRV_HASH_SHA1, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "authenc(hmac(sha1),cbc(des3_ede))", @@ -2386,6 +2387,7 @@ static struct cc_alg_template aead_algs[] = { .flow_mode = S_DIN_to_DES, .auth_mode = DRV_HASH_SHA1, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "authenc(hmac(sha256),cbc(aes))", @@ -2405,6 +2407,7 @@ static struct cc_alg_template aead_algs[] = { .flow_mode = S_DIN_to_AES, .auth_mode = DRV_HASH_SHA256, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "authenc(hmac(sha256),cbc(des3_ede))", @@ -2424,6 +2427,7 @@ static struct cc_alg_template aead_algs[] = { .flow_mode = S_DIN_to_DES, .auth_mode = DRV_HASH_SHA256, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "authenc(xcbc(aes),cbc(aes))", @@ -2443,6 +2447,7 @@ static struct cc_alg_template aead_algs[] = { .flow_mode = S_DIN_to_AES, .auth_mode = DRV_HASH_XCBC_MAC, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "authenc(hmac(sha1),rfc3686(ctr(aes)))", @@ -2462,6 +2467,7 @@ static struct cc_alg_template aead_algs[] = { .flow_mode = S_DIN_to_AES, .auth_mode = DRV_HASH_SHA1, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "authenc(hmac(sha256),rfc3686(ctr(aes)))", @@ -2481,6 +2487,7 @@ static struct cc_alg_template aead_algs[] = { .flow_mode = S_DIN_to_AES, .auth_mode = DRV_HASH_SHA256, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "authenc(xcbc(aes),rfc3686(ctr(aes)))", @@ -2500,6 +2507,7 @@ static struct cc_alg_template aead_algs[] = { .flow_mode = S_DIN_to_AES, .auth_mode = DRV_HASH_XCBC_MAC, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "ccm(aes)", @@ -2519,6 +2527,7 @@ static struct cc_alg_template aead_algs[] = { .flow_mode = S_DIN_to_AES, .auth_mode = DRV_HASH_NULL, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "rfc4309(ccm(aes))", @@ -2538,6 +2547,7 @@ static struct cc_alg_template aead_algs[] = { .flow_mode = S_DIN_to_AES, .auth_mode = DRV_HASH_NULL, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "gcm(aes)", @@ -2557,6 +2567,7 @@ static struct cc_alg_template aead_algs[] = { .flow_mode = S_DIN_to_AES, .auth_mode = DRV_HASH_NULL, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "rfc4106(gcm(aes))", @@ -2576,6 +2587,7 @@ static struct cc_alg_template aead_algs[] = { .flow_mode = S_DIN_to_AES, .auth_mode = DRV_HASH_NULL, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "rfc4543(gcm(aes))", @@ -2595,6 +2607,7 @@ static struct cc_alg_template aead_algs[] = { .flow_mode = S_DIN_to_AES, .auth_mode = DRV_HASH_NULL, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, }; @@ -2679,7 +2692,8 @@ int cc_aead_alloc(struct cc_drvdata *drvdata) /* Linux crypto */ for (alg = 0; alg < ARRAY_SIZE(aead_algs); alg++) { - if (aead_algs[alg].min_hw_rev > drvdata->hw_rev) + if ((aead_algs[alg].min_hw_rev > drvdata->hw_rev) || + !(drvdata->std_bodies & aead_algs[alg].std_body)) continue; t_alg = cc_create_aead_alg(&aead_algs[alg], dev); diff --git a/drivers/crypto/ccree/cc_cipher.c b/drivers/crypto/ccree/cc_cipher.c index 989e70f43bd9..cc92b031fad1 100644 --- a/drivers/crypto/ccree/cc_cipher.c +++ b/drivers/crypto/ccree/cc_cipher.c @@ -833,6 +833,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_XTS, .flow_mode = S_DIN_to_AES, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "xts512(paes)", @@ -850,6 +851,7 @@ static const struct cc_alg_template skcipher_algs[] = { .flow_mode = S_DIN_to_AES, .data_unit = 512, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "xts4096(paes)", @@ -867,6 +869,7 @@ static const struct cc_alg_template skcipher_algs[] = { .flow_mode = S_DIN_to_AES, .data_unit = 4096, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "essiv(paes)", @@ -883,6 +886,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_ESSIV, .flow_mode = S_DIN_to_AES, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "essiv512(paes)", @@ -900,6 +904,7 @@ static const struct cc_alg_template skcipher_algs[] = { .flow_mode = S_DIN_to_AES, .data_unit = 512, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "essiv4096(paes)", @@ -917,6 +922,7 @@ static const struct cc_alg_template skcipher_algs[] = { .flow_mode = S_DIN_to_AES, .data_unit = 4096, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "bitlocker(paes)", @@ -933,6 +939,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_BITLOCKER, .flow_mode = S_DIN_to_AES, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "bitlocker512(paes)", @@ -950,6 +957,7 @@ static const struct cc_alg_template skcipher_algs[] = { .flow_mode = S_DIN_to_AES, .data_unit = 512, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "bitlocker4096(paes)", @@ -967,6 +975,7 @@ static const struct cc_alg_template skcipher_algs[] = { .flow_mode = S_DIN_to_AES, .data_unit = 4096, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "ecb(paes)", @@ -983,6 +992,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_ECB, .flow_mode = S_DIN_to_AES, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "cbc(paes)", @@ -999,6 +1009,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_CBC, .flow_mode = S_DIN_to_AES, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "ofb(paes)", @@ -1015,6 +1026,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_OFB, .flow_mode = S_DIN_to_AES, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "cts(cbc(paes))", @@ -1031,6 +1043,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_CBC_CTS, .flow_mode = S_DIN_to_AES, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "ctr(paes)", @@ -1047,6 +1060,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_CTR, .flow_mode = S_DIN_to_AES, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "xts(aes)", @@ -1063,6 +1077,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_XTS, .flow_mode = S_DIN_to_AES, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "xts512(aes)", @@ -1080,6 +1095,7 @@ static const struct cc_alg_template skcipher_algs[] = { .flow_mode = S_DIN_to_AES, .data_unit = 512, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "xts4096(aes)", @@ -1097,6 +1113,7 @@ static const struct cc_alg_template skcipher_algs[] = { .flow_mode = S_DIN_to_AES, .data_unit = 4096, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "essiv(aes)", @@ -1113,6 +1130,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_ESSIV, .flow_mode = S_DIN_to_AES, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "essiv512(aes)", @@ -1130,6 +1148,7 @@ static const struct cc_alg_template skcipher_algs[] = { .flow_mode = S_DIN_to_AES, .data_unit = 512, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "essiv4096(aes)", @@ -1147,6 +1166,7 @@ static const struct cc_alg_template skcipher_algs[] = { .flow_mode = S_DIN_to_AES, .data_unit = 4096, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "bitlocker(aes)", @@ -1163,6 +1183,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_BITLOCKER, .flow_mode = S_DIN_to_AES, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "bitlocker512(aes)", @@ -1180,6 +1201,7 @@ static const struct cc_alg_template skcipher_algs[] = { .flow_mode = S_DIN_to_AES, .data_unit = 512, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "bitlocker4096(aes)", @@ -1197,6 +1219,7 @@ static const struct cc_alg_template skcipher_algs[] = { .flow_mode = S_DIN_to_AES, .data_unit = 4096, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "ecb(aes)", @@ -1213,6 +1236,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_ECB, .flow_mode = S_DIN_to_AES, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "cbc(aes)", @@ -1229,6 +1253,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_CBC, .flow_mode = S_DIN_to_AES, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "ofb(aes)", @@ -1245,6 +1270,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_OFB, .flow_mode = S_DIN_to_AES, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "cts(cbc(aes))", @@ -1261,6 +1287,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_CBC_CTS, .flow_mode = S_DIN_to_AES, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "ctr(aes)", @@ -1277,6 +1304,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_CTR, .flow_mode = S_DIN_to_AES, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "cbc(des3_ede)", @@ -1293,6 +1321,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_CBC, .flow_mode = S_DIN_to_DES, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "ecb(des3_ede)", @@ -1309,6 +1338,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_ECB, .flow_mode = S_DIN_to_DES, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "cbc(des)", @@ -1325,6 +1355,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_CBC, .flow_mode = S_DIN_to_DES, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "ecb(des)", @@ -1341,6 +1372,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_ECB, .flow_mode = S_DIN_to_DES, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "cbc(sm4)", @@ -1357,6 +1389,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_CBC, .flow_mode = S_DIN_to_SM4, .min_hw_rev = CC_HW_REV_713, + .std_body = CC_STD_OSCCA, }, { .name = "ecb(sm4)", @@ -1373,6 +1406,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_ECB, .flow_mode = S_DIN_to_SM4, .min_hw_rev = CC_HW_REV_713, + .std_body = CC_STD_OSCCA, }, { .name = "ctr(sm4)", @@ -1389,6 +1423,7 @@ static const struct cc_alg_template skcipher_algs[] = { .cipher_mode = DRV_CIPHER_CTR, .flow_mode = S_DIN_to_SM4, .min_hw_rev = CC_HW_REV_713, + .std_body = CC_STD_OSCCA, }, }; @@ -1464,7 +1499,8 @@ int cc_cipher_alloc(struct cc_drvdata *drvdata) dev_dbg(dev, "Number of algorithms = %zu\n", ARRAY_SIZE(skcipher_algs)); for (alg = 0; alg < ARRAY_SIZE(skcipher_algs); alg++) { - if (skcipher_algs[alg].min_hw_rev > drvdata->hw_rev) + if ((skcipher_algs[alg].min_hw_rev > drvdata->hw_rev) || + !(drvdata->std_bodies & skcipher_algs[alg].std_body)) continue; dev_dbg(dev, "creating %s\n", skcipher_algs[alg].driver_name); diff --git a/drivers/crypto/ccree/cc_driver.c b/drivers/crypto/ccree/cc_driver.c index 09a6ada0b547..8ada308d72ee 100644 --- a/drivers/crypto/ccree/cc_driver.c +++ b/drivers/crypto/ccree/cc_driver.c @@ -39,27 +39,37 @@ struct cc_hw_data { char *name; enum cc_hw_rev rev; u32 sig; + int std_bodies; }; /* Hardware revisions defs. */ +/* The 703 is a OSCCA only variant of the 713 */ +static const struct cc_hw_data cc703_hw = { + .name = "703", .rev = CC_HW_REV_713, .std_bodies = CC_STD_OSCCA +}; + static const struct cc_hw_data cc713_hw = { - .name = "713", .rev = CC_HW_REV_713 + .name = "713", .rev = CC_HW_REV_713, .std_bodies = CC_STD_ALL }; static const struct cc_hw_data cc712_hw = { - .name = "712", .rev = CC_HW_REV_712, .sig = 0xDCC71200U + .name = "712", .rev = CC_HW_REV_712, .sig = 0xDCC71200U, + .std_bodies = CC_STD_ALL }; static const struct cc_hw_data cc710_hw = { - .name = "710", .rev = CC_HW_REV_710, .sig = 0xDCC63200U + .name = "710", .rev = CC_HW_REV_710, .sig = 0xDCC63200U, + .std_bodies = CC_STD_ALL }; static const struct cc_hw_data cc630p_hw = { - .name = "630P", .rev = CC_HW_REV_630, .sig = 0xDCC63000U + .name = "630P", .rev = CC_HW_REV_630, .sig = 0xDCC63000U, + .std_bodies = CC_STD_ALL }; static const struct of_device_id arm_ccree_dev_of_match[] = { + { .compatible = "arm,cryptocell-703-ree", .data = &cc703_hw }, { .compatible = "arm,cryptocell-713-ree", .data = &cc713_hw }, { .compatible = "arm,cryptocell-712-ree", .data = &cc712_hw }, { .compatible = "arm,cryptocell-710-ree", .data = &cc710_hw }, @@ -209,6 +219,7 @@ static int init_cc_resources(struct platform_device *plat_dev) hw_rev = (struct cc_hw_data *)dev_id->data; new_drvdata->hw_rev_name = hw_rev->name; new_drvdata->hw_rev = hw_rev->rev; + new_drvdata->std_bodies = hw_rev->std_bodies; if (hw_rev->rev >= CC_HW_REV_712) { new_drvdata->axim_mon_offset = CC_REG(AXIM_MON_COMP); diff --git a/drivers/crypto/ccree/cc_driver.h b/drivers/crypto/ccree/cc_driver.h index 170790967854..5be7fd431b05 100644 --- a/drivers/crypto/ccree/cc_driver.h +++ b/drivers/crypto/ccree/cc_driver.h @@ -45,6 +45,12 @@ enum cc_hw_rev { CC_HW_REV_713 = 713 }; +enum cc_std_body { + CC_STD_NIST = 0x1, + CC_STD_OSCCA = 0x2, + CC_STD_ALL = 0x3 +}; + #define CC_COHERENT_CACHE_PARAMS 0xEEE /* Maximum DMA mask supported by IP */ @@ -131,6 +137,7 @@ struct cc_drvdata { u32 axim_mon_offset; u32 sig_offset; u32 ver_offset; + int std_bodies; }; struct cc_crypto_alg { @@ -156,6 +163,7 @@ struct cc_alg_template { int flow_mode; /* Note: currently, refers to the cipher mode only. */ int auth_mode; u32 min_hw_rev; + enum cc_std_body std_body; unsigned int data_unit; struct cc_drvdata *drvdata; }; diff --git a/drivers/crypto/ccree/cc_hash.c b/drivers/crypto/ccree/cc_hash.c index c80c9ae555d0..2c4ddc8fb76b 100644 --- a/drivers/crypto/ccree/cc_hash.c +++ b/drivers/crypto/ccree/cc_hash.c @@ -1539,6 +1539,7 @@ struct cc_hash_template { int inter_digestsize; struct cc_drvdata *drvdata; u32 min_hw_rev; + enum cc_std_body std_body; }; #define CC_STATE_SIZE(_x) \ @@ -1573,6 +1574,7 @@ static struct cc_hash_template driver_hash[] = { .hw_mode = DRV_HASH_HW_SHA1, .inter_digestsize = SHA1_DIGEST_SIZE, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "sha256", @@ -1599,6 +1601,7 @@ static struct cc_hash_template driver_hash[] = { .hw_mode = DRV_HASH_HW_SHA256, .inter_digestsize = SHA256_DIGEST_SIZE, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "sha224", @@ -1625,6 +1628,7 @@ static struct cc_hash_template driver_hash[] = { .hw_mode = DRV_HASH_HW_SHA256, .inter_digestsize = SHA256_DIGEST_SIZE, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "sha384", @@ -1651,6 +1655,7 @@ static struct cc_hash_template driver_hash[] = { .hw_mode = DRV_HASH_HW_SHA512, .inter_digestsize = SHA512_DIGEST_SIZE, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "sha512", @@ -1677,6 +1682,7 @@ static struct cc_hash_template driver_hash[] = { .hw_mode = DRV_HASH_HW_SHA512, .inter_digestsize = SHA512_DIGEST_SIZE, .min_hw_rev = CC_HW_REV_712, + .std_body = CC_STD_NIST, }, { .name = "md5", @@ -1703,6 +1709,7 @@ static struct cc_hash_template driver_hash[] = { .hw_mode = DRV_HASH_HW_MD5, .inter_digestsize = MD5_DIGEST_SIZE, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .name = "sm3", @@ -1727,6 +1734,7 @@ static struct cc_hash_template driver_hash[] = { .hw_mode = DRV_HASH_HW_SM3, .inter_digestsize = SM3_DIGEST_SIZE, .min_hw_rev = CC_HW_REV_713, + .std_body = CC_STD_OSCCA, }, { .mac_name = "xcbc(aes)", @@ -1751,6 +1759,7 @@ static struct cc_hash_template driver_hash[] = { .hw_mode = DRV_CIPHER_XCBC_MAC, .inter_digestsize = AES_BLOCK_SIZE, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, { .mac_name = "cmac(aes)", @@ -1775,6 +1784,7 @@ static struct cc_hash_template driver_hash[] = { .hw_mode = DRV_CIPHER_CMAC, .inter_digestsize = AES_BLOCK_SIZE, .min_hw_rev = CC_HW_REV_630, + .std_body = CC_STD_NIST, }, }; @@ -2001,9 +2011,11 @@ int cc_hash_alloc(struct cc_drvdata *drvdata) struct cc_hash_alg *t_alg; int hw_mode = driver_hash[alg].hw_mode; - /* We either support both HASH and MAC or none */ - if (driver_hash[alg].min_hw_rev > drvdata->hw_rev) + /* Check that the HW revision and variants are suitable */ + if ((driver_hash[alg].min_hw_rev > drvdata->hw_rev) || + !(drvdata->std_bodies & driver_hash[alg].std_body)) continue; + if (driver_hash[alg].is_mac) { /* register hmac version */ t_alg = cc_alloc_hash_alg(&driver_hash[alg], dev, true); -- cgit v1.2.3-59-g8ed1b From fefbc0b4bcb3c0cc45bed906af46e7cd627ac4bc Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Tue, 13 Nov 2018 09:40:36 +0000 Subject: dt-bindings: crypto: ccree: add dt bindings for ccree 703 Add device tree bindings associating Arm TrustZone CryptoCell 703 with the ccree driver. Signed-off-by: Gilad Ben-Yossef Reviewed-by: Rob Herring Signed-off-by: Herbert Xu --- Documentation/devicetree/bindings/crypto/arm-cryptocell.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt b/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt index 0ac06ec05272..6130e6eb4af8 100644 --- a/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt +++ b/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt @@ -3,6 +3,7 @@ Arm TrustZone CryptoCell cryptographic engine Required properties: - compatible: Should be one of - "arm,cryptocell-713-ree" + "arm,cryptocell-703-ree" "arm,cryptocell-712-ree" "arm,cryptocell-710-ree" "arm,cryptocell-630p-ree" -- cgit v1.2.3-59-g8ed1b From 18596781e0beb6b4633cb5582308ba14c37eaaed Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Tue, 13 Nov 2018 09:40:37 +0000 Subject: MAINTAINERS: ccree: add co-maintainer Add Yael Chemla as co-maintainer of Arm TrustZone CryptoCell REE driver. Signed-off-by: Gilad Ben-Yossef Signed-off-by: Herbert Xu --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f4855974f325..ab42acbe6b1d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3388,6 +3388,7 @@ F: include/linux/spi/cc2520.h F: Documentation/devicetree/bindings/net/ieee802154/cc2520.txt CCREE ARM TRUSTZONE CRYPTOCELL REE DRIVER +M: Yael Chemla M: Gilad Ben-Yossef L: linux-crypto@vger.kernel.org S: Supported -- cgit v1.2.3-59-g8ed1b From c97e4df573f2434b07f08d3b93673f61158d267f Mon Sep 17 00:00:00 2001 From: Paulo Flabiano Smorigo Date: Tue, 27 Nov 2018 15:27:21 -0200 Subject: MAINTAINERS: change NX/VMX maintainers Add Breno and Nayna as NX/VMX crypto driver maintainers. Also change my email address to my personal account and remove Leonidas since he's not working with the driver anymore. Signed-off-by: Paulo Flabiano Smorigo Signed-off-by: Herbert Xu --- MAINTAINERS | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ab42acbe6b1d..6d6675bbe729 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7002,8 +7002,9 @@ F: crypto/842.c F: lib/842/ IBM Power in-Nest Crypto Acceleration -M: Leonidas S. Barbosa -M: Paulo Flabiano Smorigo +M: Breno Leitão +M: Nayna Jain +M: Paulo Flabiano Smorigo L: linux-crypto@vger.kernel.org S: Supported F: drivers/crypto/nx/Makefile @@ -7069,8 +7070,9 @@ S: Supported F: drivers/scsi/ibmvscsi_tgt/ IBM Power VMX Cryptographic instructions -M: Leonidas S. Barbosa -M: Paulo Flabiano Smorigo +M: Breno Leitão +M: Nayna Jain +M: Paulo Flabiano Smorigo L: linux-crypto@vger.kernel.org S: Supported F: drivers/crypto/vmx/Makefile -- cgit v1.2.3-59-g8ed1b From 2ced26078fcff26db532d6300a1b5f8ffd11a5e1 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 29 Nov 2018 14:42:16 +0000 Subject: crypto: user - made crypto_user_stat optional Even if CRYPTO_STATS is set to n, some part of CRYPTO_STATS are compiled. This patch made all part of crypto_user_stat uncompiled in that case. Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- crypto/Makefile | 3 ++- crypto/algapi.c | 2 ++ include/crypto/internal/cryptouser.h | 17 +++++++++++++++++ include/linux/crypto.h | 2 ++ 4 files changed, 23 insertions(+), 1 deletion(-) diff --git a/crypto/Makefile b/crypto/Makefile index 5e789dc2d4fd..799ed5e94606 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -54,7 +54,8 @@ cryptomgr-y := algboss.o testmgr.o obj-$(CONFIG_CRYPTO_MANAGER2) += cryptomgr.o obj-$(CONFIG_CRYPTO_USER) += crypto_user.o -crypto_user-y := crypto_user_base.o crypto_user_stat.o +crypto_user-y := crypto_user_base.o +crypto_user-$(CONFIG_CRYPTO_STATS) += crypto_user_stat.o obj-$(CONFIG_CRYPTO_CMAC) += cmac.o obj-$(CONFIG_CRYPTO_HMAC) += hmac.o obj-$(CONFIG_CRYPTO_VMAC) += vmac.o diff --git a/crypto/algapi.c b/crypto/algapi.c index 2545c5f89c4c..f5396c88e8cd 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -258,6 +258,7 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg) list_add(&alg->cra_list, &crypto_alg_list); list_add(&larval->alg.cra_list, &crypto_alg_list); +#ifdef CONFIG_CRYPTO_STATS atomic_set(&alg->encrypt_cnt, 0); atomic_set(&alg->decrypt_cnt, 0); atomic64_set(&alg->encrypt_tlen, 0); @@ -265,6 +266,7 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg) atomic_set(&alg->verify_cnt, 0); atomic_set(&alg->cipher_err_cnt, 0); atomic_set(&alg->sign_cnt, 0); +#endif out: return larval; diff --git a/include/crypto/internal/cryptouser.h b/include/crypto/internal/cryptouser.h index 8db299c25566..3492ab42eefb 100644 --- a/include/crypto/internal/cryptouser.h +++ b/include/crypto/internal/cryptouser.h @@ -3,6 +3,23 @@ struct crypto_alg *crypto_alg_match(struct crypto_user_alg *p, int exact); +#ifdef CONFIG_CRYPTO_STATS int crypto_dump_reportstat(struct sk_buff *skb, struct netlink_callback *cb); int crypto_reportstat(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct nlattr **attrs); int crypto_dump_reportstat_done(struct netlink_callback *cb); +#else +static int crypto_dump_reportstat(struct sk_buff *skb, struct netlink_callback *cb) +{ + return -ENOTSUPP; +} + +static int crypto_reportstat(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct nlattr **attrs) +{ + return -ENOTSUPP; +} + +static int crypto_dump_reportstat_done(struct netlink_callback *cb) +{ + return -ENOTSUPP; +} +#endif diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 3634ad6fe202..3e05053b8d57 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -515,6 +515,7 @@ struct crypto_alg { struct module *cra_module; +#ifdef CONFIG_CRYPTO_STATS union { atomic_t encrypt_cnt; atomic_t compress_cnt; @@ -552,6 +553,7 @@ struct crypto_alg { atomic_t compute_shared_secret_cnt; }; atomic_t sign_cnt; +#endif /* CONFIG_CRYPTO_STATS */ } CRYPTO_MINALIGN_ATTR; -- cgit v1.2.3-59-g8ed1b From a6a31385364ca0f7b98ace0bad93d793f07f97f3 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 29 Nov 2018 14:42:17 +0000 Subject: crypto: user - CRYPTO_STATS should depend on CRYPTO_USER CRYPTO_STATS is using CRYPTO_USER stuff, so it should depends on it. Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- crypto/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/crypto/Kconfig b/crypto/Kconfig index b6376d5d973e..5994d0fa7a78 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1865,6 +1865,7 @@ config CRYPTO_USER_API_AEAD config CRYPTO_STATS bool "Crypto usage statistics for User-space" + depends on CRYPTO_USER help This option enables the gathering of crypto stats. This will collect: -- cgit v1.2.3-59-g8ed1b From 6e8e72cd206e2ba68801e4f2490f639d41808c8d Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 29 Nov 2018 14:42:18 +0000 Subject: crypto: user - convert all stats from u32 to u64 All the 32-bit fields need to be 64-bit. In some cases, UINT32_MAX crypto operations can be done in seconds. Reported-by: Eric Biggers Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- crypto/algapi.c | 10 ++-- crypto/crypto_user_stat.c | 114 +++++++++++++++++++--------------------- include/crypto/acompress.h | 8 +-- include/crypto/aead.h | 8 +-- include/crypto/akcipher.h | 16 +++--- include/crypto/hash.h | 6 +-- include/crypto/kpp.h | 12 ++--- include/crypto/rng.h | 8 +-- include/crypto/skcipher.h | 8 +-- include/linux/crypto.h | 46 ++++++++-------- include/uapi/linux/cryptouser.h | 38 +++++++------- 11 files changed, 133 insertions(+), 141 deletions(-) diff --git a/crypto/algapi.c b/crypto/algapi.c index f5396c88e8cd..42fe316f80ee 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -259,13 +259,13 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg) list_add(&larval->alg.cra_list, &crypto_alg_list); #ifdef CONFIG_CRYPTO_STATS - atomic_set(&alg->encrypt_cnt, 0); - atomic_set(&alg->decrypt_cnt, 0); + atomic64_set(&alg->encrypt_cnt, 0); + atomic64_set(&alg->decrypt_cnt, 0); atomic64_set(&alg->encrypt_tlen, 0); atomic64_set(&alg->decrypt_tlen, 0); - atomic_set(&alg->verify_cnt, 0); - atomic_set(&alg->cipher_err_cnt, 0); - atomic_set(&alg->sign_cnt, 0); + atomic64_set(&alg->verify_cnt, 0); + atomic64_set(&alg->cipher_err_cnt, 0); + atomic64_set(&alg->sign_cnt, 0); #endif out: diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c index a6fb2e6f618d..352569f378a0 100644 --- a/crypto/crypto_user_stat.c +++ b/crypto/crypto_user_stat.c @@ -35,22 +35,21 @@ static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat raead; u64 v64; - u32 v32; memset(&raead, 0, sizeof(raead)); strscpy(raead.type, "aead", sizeof(raead.type)); - v32 = atomic_read(&alg->encrypt_cnt); - raead.stat_encrypt_cnt = v32; + v64 = atomic64_read(&alg->encrypt_cnt); + raead.stat_encrypt_cnt = v64; v64 = atomic64_read(&alg->encrypt_tlen); raead.stat_encrypt_tlen = v64; - v32 = atomic_read(&alg->decrypt_cnt); - raead.stat_decrypt_cnt = v32; + v64 = atomic64_read(&alg->decrypt_cnt); + raead.stat_decrypt_cnt = v64; v64 = atomic64_read(&alg->decrypt_tlen); raead.stat_decrypt_tlen = v64; - v32 = atomic_read(&alg->aead_err_cnt); - raead.stat_aead_err_cnt = v32; + v64 = atomic64_read(&alg->aead_err_cnt); + raead.stat_aead_err_cnt = v64; return nla_put(skb, CRYPTOCFGA_STAT_AEAD, sizeof(raead), &raead); } @@ -59,22 +58,21 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat rcipher; u64 v64; - u32 v32; memset(&rcipher, 0, sizeof(rcipher)); strscpy(rcipher.type, "cipher", sizeof(rcipher.type)); - v32 = atomic_read(&alg->encrypt_cnt); - rcipher.stat_encrypt_cnt = v32; + v64 = atomic64_read(&alg->encrypt_cnt); + rcipher.stat_encrypt_cnt = v64; v64 = atomic64_read(&alg->encrypt_tlen); rcipher.stat_encrypt_tlen = v64; - v32 = atomic_read(&alg->decrypt_cnt); - rcipher.stat_decrypt_cnt = v32; + v64 = atomic64_read(&alg->decrypt_cnt); + rcipher.stat_decrypt_cnt = v64; v64 = atomic64_read(&alg->decrypt_tlen); rcipher.stat_decrypt_tlen = v64; - v32 = atomic_read(&alg->cipher_err_cnt); - rcipher.stat_cipher_err_cnt = v32; + v64 = atomic64_read(&alg->cipher_err_cnt); + rcipher.stat_cipher_err_cnt = v64; return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher); } @@ -83,21 +81,20 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat rcomp; u64 v64; - u32 v32; memset(&rcomp, 0, sizeof(rcomp)); strscpy(rcomp.type, "compression", sizeof(rcomp.type)); - v32 = atomic_read(&alg->compress_cnt); - rcomp.stat_compress_cnt = v32; + v64 = atomic64_read(&alg->compress_cnt); + rcomp.stat_compress_cnt = v64; v64 = atomic64_read(&alg->compress_tlen); rcomp.stat_compress_tlen = v64; - v32 = atomic_read(&alg->decompress_cnt); - rcomp.stat_decompress_cnt = v32; + v64 = atomic64_read(&alg->decompress_cnt); + rcomp.stat_decompress_cnt = v64; v64 = atomic64_read(&alg->decompress_tlen); rcomp.stat_decompress_tlen = v64; - v32 = atomic_read(&alg->cipher_err_cnt); - rcomp.stat_compress_err_cnt = v32; + v64 = atomic64_read(&alg->cipher_err_cnt); + rcomp.stat_compress_err_cnt = v64; return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp); } @@ -106,21 +103,20 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat racomp; u64 v64; - u32 v32; memset(&racomp, 0, sizeof(racomp)); strscpy(racomp.type, "acomp", sizeof(racomp.type)); - v32 = atomic_read(&alg->compress_cnt); - racomp.stat_compress_cnt = v32; + v64 = atomic64_read(&alg->compress_cnt); + racomp.stat_compress_cnt = v64; v64 = atomic64_read(&alg->compress_tlen); racomp.stat_compress_tlen = v64; - v32 = atomic_read(&alg->decompress_cnt); - racomp.stat_decompress_cnt = v32; + v64 = atomic64_read(&alg->decompress_cnt); + racomp.stat_decompress_cnt = v64; v64 = atomic64_read(&alg->decompress_tlen); racomp.stat_decompress_tlen = v64; - v32 = atomic_read(&alg->cipher_err_cnt); - racomp.stat_compress_err_cnt = v32; + v64 = atomic64_read(&alg->cipher_err_cnt); + racomp.stat_compress_err_cnt = v64; return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp); } @@ -129,25 +125,24 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat rakcipher; u64 v64; - u32 v32; memset(&rakcipher, 0, sizeof(rakcipher)); strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type)); - v32 = atomic_read(&alg->encrypt_cnt); - rakcipher.stat_encrypt_cnt = v32; + v64 = atomic64_read(&alg->encrypt_cnt); + rakcipher.stat_encrypt_cnt = v64; v64 = atomic64_read(&alg->encrypt_tlen); rakcipher.stat_encrypt_tlen = v64; - v32 = atomic_read(&alg->decrypt_cnt); - rakcipher.stat_decrypt_cnt = v32; + v64 = atomic64_read(&alg->decrypt_cnt); + rakcipher.stat_decrypt_cnt = v64; v64 = atomic64_read(&alg->decrypt_tlen); rakcipher.stat_decrypt_tlen = v64; - v32 = atomic_read(&alg->sign_cnt); - rakcipher.stat_sign_cnt = v32; - v32 = atomic_read(&alg->verify_cnt); - rakcipher.stat_verify_cnt = v32; - v32 = atomic_read(&alg->akcipher_err_cnt); - rakcipher.stat_akcipher_err_cnt = v32; + v64 = atomic64_read(&alg->sign_cnt); + rakcipher.stat_sign_cnt = v64; + v64 = atomic64_read(&alg->verify_cnt); + rakcipher.stat_verify_cnt = v64; + v64 = atomic64_read(&alg->akcipher_err_cnt); + rakcipher.stat_akcipher_err_cnt = v64; return nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER, sizeof(rakcipher), &rakcipher); @@ -156,19 +151,19 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg) static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat rkpp; - u32 v; + u64 v; memset(&rkpp, 0, sizeof(rkpp)); strscpy(rkpp.type, "kpp", sizeof(rkpp.type)); - v = atomic_read(&alg->setsecret_cnt); + v = atomic64_read(&alg->setsecret_cnt); rkpp.stat_setsecret_cnt = v; - v = atomic_read(&alg->generate_public_key_cnt); + v = atomic64_read(&alg->generate_public_key_cnt); rkpp.stat_generate_public_key_cnt = v; - v = atomic_read(&alg->compute_shared_secret_cnt); + v = atomic64_read(&alg->compute_shared_secret_cnt); rkpp.stat_compute_shared_secret_cnt = v; - v = atomic_read(&alg->kpp_err_cnt); + v = atomic64_read(&alg->kpp_err_cnt); rkpp.stat_kpp_err_cnt = v; return nla_put(skb, CRYPTOCFGA_STAT_KPP, sizeof(rkpp), &rkpp); @@ -178,18 +173,17 @@ static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat rhash; u64 v64; - u32 v32; memset(&rhash, 0, sizeof(rhash)); strscpy(rhash.type, "ahash", sizeof(rhash.type)); - v32 = atomic_read(&alg->hash_cnt); - rhash.stat_hash_cnt = v32; + v64 = atomic64_read(&alg->hash_cnt); + rhash.stat_hash_cnt = v64; v64 = atomic64_read(&alg->hash_tlen); rhash.stat_hash_tlen = v64; - v32 = atomic_read(&alg->hash_err_cnt); - rhash.stat_hash_err_cnt = v32; + v64 = atomic64_read(&alg->hash_err_cnt); + rhash.stat_hash_err_cnt = v64; return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash); } @@ -198,18 +192,17 @@ static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat rhash; u64 v64; - u32 v32; memset(&rhash, 0, sizeof(rhash)); strscpy(rhash.type, "shash", sizeof(rhash.type)); - v32 = atomic_read(&alg->hash_cnt); - rhash.stat_hash_cnt = v32; + v64 = atomic64_read(&alg->hash_cnt); + rhash.stat_hash_cnt = v64; v64 = atomic64_read(&alg->hash_tlen); rhash.stat_hash_tlen = v64; - v32 = atomic_read(&alg->hash_err_cnt); - rhash.stat_hash_err_cnt = v32; + v64 = atomic64_read(&alg->hash_err_cnt); + rhash.stat_hash_err_cnt = v64; return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash); } @@ -218,20 +211,19 @@ static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat rrng; u64 v64; - u32 v32; memset(&rrng, 0, sizeof(rrng)); strscpy(rrng.type, "rng", sizeof(rrng.type)); - v32 = atomic_read(&alg->generate_cnt); - rrng.stat_generate_cnt = v32; + v64 = atomic64_read(&alg->generate_cnt); + rrng.stat_generate_cnt = v64; v64 = atomic64_read(&alg->generate_tlen); rrng.stat_generate_tlen = v64; - v32 = atomic_read(&alg->seed_cnt); - rrng.stat_seed_cnt = v32; - v32 = atomic_read(&alg->hash_err_cnt); - rrng.stat_rng_err_cnt = v32; + v64 = atomic64_read(&alg->seed_cnt); + rrng.stat_seed_cnt = v64; + v64 = atomic64_read(&alg->hash_err_cnt); + rrng.stat_rng_err_cnt = v64; return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng); } diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h index 22e6f412c595..f79918196811 100644 --- a/include/crypto/acompress.h +++ b/include/crypto/acompress.h @@ -240,9 +240,9 @@ static inline void crypto_stat_compress(struct acomp_req *req, int ret) struct crypto_acomp *tfm = crypto_acomp_reqtfm(req); if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic_inc(&tfm->base.__crt_alg->compress_err_cnt); + atomic64_inc(&tfm->base.__crt_alg->compress_err_cnt); } else { - atomic_inc(&tfm->base.__crt_alg->compress_cnt); + atomic64_inc(&tfm->base.__crt_alg->compress_cnt); atomic64_add(req->slen, &tfm->base.__crt_alg->compress_tlen); } #endif @@ -254,9 +254,9 @@ static inline void crypto_stat_decompress(struct acomp_req *req, int ret) struct crypto_acomp *tfm = crypto_acomp_reqtfm(req); if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic_inc(&tfm->base.__crt_alg->compress_err_cnt); + atomic64_inc(&tfm->base.__crt_alg->compress_err_cnt); } else { - atomic_inc(&tfm->base.__crt_alg->decompress_cnt); + atomic64_inc(&tfm->base.__crt_alg->decompress_cnt); atomic64_add(req->slen, &tfm->base.__crt_alg->decompress_tlen); } #endif diff --git a/include/crypto/aead.h b/include/crypto/aead.h index 0d765d7bfb82..99afd78c665d 100644 --- a/include/crypto/aead.h +++ b/include/crypto/aead.h @@ -312,9 +312,9 @@ static inline void crypto_stat_aead_encrypt(struct aead_request *req, int ret) struct crypto_aead *tfm = crypto_aead_reqtfm(req); if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic_inc(&tfm->base.__crt_alg->aead_err_cnt); + atomic64_inc(&tfm->base.__crt_alg->aead_err_cnt); } else { - atomic_inc(&tfm->base.__crt_alg->encrypt_cnt); + atomic64_inc(&tfm->base.__crt_alg->encrypt_cnt); atomic64_add(req->cryptlen, &tfm->base.__crt_alg->encrypt_tlen); } #endif @@ -326,9 +326,9 @@ static inline void crypto_stat_aead_decrypt(struct aead_request *req, int ret) struct crypto_aead *tfm = crypto_aead_reqtfm(req); if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic_inc(&tfm->base.__crt_alg->aead_err_cnt); + atomic64_inc(&tfm->base.__crt_alg->aead_err_cnt); } else { - atomic_inc(&tfm->base.__crt_alg->decrypt_cnt); + atomic64_inc(&tfm->base.__crt_alg->decrypt_cnt); atomic64_add(req->cryptlen, &tfm->base.__crt_alg->decrypt_tlen); } #endif diff --git a/include/crypto/akcipher.h b/include/crypto/akcipher.h index afac71119396..3dc05cf7e0a9 100644 --- a/include/crypto/akcipher.h +++ b/include/crypto/akcipher.h @@ -278,9 +278,9 @@ static inline void crypto_stat_akcipher_encrypt(struct akcipher_request *req, struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic_inc(&tfm->base.__crt_alg->akcipher_err_cnt); + atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt); } else { - atomic_inc(&tfm->base.__crt_alg->encrypt_cnt); + atomic64_inc(&tfm->base.__crt_alg->encrypt_cnt); atomic64_add(req->src_len, &tfm->base.__crt_alg->encrypt_tlen); } #endif @@ -293,9 +293,9 @@ static inline void crypto_stat_akcipher_decrypt(struct akcipher_request *req, struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic_inc(&tfm->base.__crt_alg->akcipher_err_cnt); + atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt); } else { - atomic_inc(&tfm->base.__crt_alg->decrypt_cnt); + atomic64_inc(&tfm->base.__crt_alg->decrypt_cnt); atomic64_add(req->src_len, &tfm->base.__crt_alg->decrypt_tlen); } #endif @@ -308,9 +308,9 @@ static inline void crypto_stat_akcipher_sign(struct akcipher_request *req, struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); if (ret && ret != -EINPROGRESS && ret != -EBUSY) - atomic_inc(&tfm->base.__crt_alg->akcipher_err_cnt); + atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt); else - atomic_inc(&tfm->base.__crt_alg->sign_cnt); + atomic64_inc(&tfm->base.__crt_alg->sign_cnt); #endif } @@ -321,9 +321,9 @@ static inline void crypto_stat_akcipher_verify(struct akcipher_request *req, struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); if (ret && ret != -EINPROGRESS && ret != -EBUSY) - atomic_inc(&tfm->base.__crt_alg->akcipher_err_cnt); + atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt); else - atomic_inc(&tfm->base.__crt_alg->verify_cnt); + atomic64_inc(&tfm->base.__crt_alg->verify_cnt); #endif } diff --git a/include/crypto/hash.h b/include/crypto/hash.h index bc7796600338..52920bed05ba 100644 --- a/include/crypto/hash.h +++ b/include/crypto/hash.h @@ -418,7 +418,7 @@ static inline void crypto_stat_ahash_update(struct ahash_request *req, int ret) struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (ret && ret != -EINPROGRESS && ret != -EBUSY) - atomic_inc(&tfm->base.__crt_alg->hash_err_cnt); + atomic64_inc(&tfm->base.__crt_alg->hash_err_cnt); else atomic64_add(req->nbytes, &tfm->base.__crt_alg->hash_tlen); #endif @@ -430,9 +430,9 @@ static inline void crypto_stat_ahash_final(struct ahash_request *req, int ret) struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic_inc(&tfm->base.__crt_alg->hash_err_cnt); + atomic64_inc(&tfm->base.__crt_alg->hash_err_cnt); } else { - atomic_inc(&tfm->base.__crt_alg->hash_cnt); + atomic64_inc(&tfm->base.__crt_alg->hash_cnt); atomic64_add(req->nbytes, &tfm->base.__crt_alg->hash_tlen); } #endif diff --git a/include/crypto/kpp.h b/include/crypto/kpp.h index f517ba6d3a27..bd5103a80919 100644 --- a/include/crypto/kpp.h +++ b/include/crypto/kpp.h @@ -272,9 +272,9 @@ static inline void crypto_stat_kpp_set_secret(struct crypto_kpp *tfm, int ret) { #ifdef CONFIG_CRYPTO_STATS if (ret) - atomic_inc(&tfm->base.__crt_alg->kpp_err_cnt); + atomic64_inc(&tfm->base.__crt_alg->kpp_err_cnt); else - atomic_inc(&tfm->base.__crt_alg->setsecret_cnt); + atomic64_inc(&tfm->base.__crt_alg->setsecret_cnt); #endif } @@ -285,9 +285,9 @@ static inline void crypto_stat_kpp_generate_public_key(struct kpp_request *req, struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); if (ret) - atomic_inc(&tfm->base.__crt_alg->kpp_err_cnt); + atomic64_inc(&tfm->base.__crt_alg->kpp_err_cnt); else - atomic_inc(&tfm->base.__crt_alg->generate_public_key_cnt); + atomic64_inc(&tfm->base.__crt_alg->generate_public_key_cnt); #endif } @@ -298,9 +298,9 @@ static inline void crypto_stat_kpp_compute_shared_secret(struct kpp_request *req struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); if (ret) - atomic_inc(&tfm->base.__crt_alg->kpp_err_cnt); + atomic64_inc(&tfm->base.__crt_alg->kpp_err_cnt); else - atomic_inc(&tfm->base.__crt_alg->compute_shared_secret_cnt); + atomic64_inc(&tfm->base.__crt_alg->compute_shared_secret_cnt); #endif } diff --git a/include/crypto/rng.h b/include/crypto/rng.h index 6d258f5b68f1..966615bba45e 100644 --- a/include/crypto/rng.h +++ b/include/crypto/rng.h @@ -126,9 +126,9 @@ static inline void crypto_stat_rng_seed(struct crypto_rng *tfm, int ret) { #ifdef CONFIG_CRYPTO_STATS if (ret && ret != -EINPROGRESS && ret != -EBUSY) - atomic_inc(&tfm->base.__crt_alg->rng_err_cnt); + atomic64_inc(&tfm->base.__crt_alg->rng_err_cnt); else - atomic_inc(&tfm->base.__crt_alg->seed_cnt); + atomic64_inc(&tfm->base.__crt_alg->seed_cnt); #endif } @@ -137,9 +137,9 @@ static inline void crypto_stat_rng_generate(struct crypto_rng *tfm, { #ifdef CONFIG_CRYPTO_STATS if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic_inc(&tfm->base.__crt_alg->rng_err_cnt); + atomic64_inc(&tfm->base.__crt_alg->rng_err_cnt); } else { - atomic_inc(&tfm->base.__crt_alg->generate_cnt); + atomic64_inc(&tfm->base.__crt_alg->generate_cnt); atomic64_add(dlen, &tfm->base.__crt_alg->generate_tlen); } #endif diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h index 925f547cdcfa..dff54731ddf4 100644 --- a/include/crypto/skcipher.h +++ b/include/crypto/skcipher.h @@ -491,9 +491,9 @@ static inline void crypto_stat_skcipher_encrypt(struct skcipher_request *req, { #ifdef CONFIG_CRYPTO_STATS if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic_inc(&alg->cipher_err_cnt); + atomic64_inc(&alg->cipher_err_cnt); } else { - atomic_inc(&alg->encrypt_cnt); + atomic64_inc(&alg->encrypt_cnt); atomic64_add(req->cryptlen, &alg->encrypt_tlen); } #endif @@ -504,9 +504,9 @@ static inline void crypto_stat_skcipher_decrypt(struct skcipher_request *req, { #ifdef CONFIG_CRYPTO_STATS if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic_inc(&alg->cipher_err_cnt); + atomic64_inc(&alg->cipher_err_cnt); } else { - atomic_inc(&alg->decrypt_cnt); + atomic64_inc(&alg->decrypt_cnt); atomic64_add(req->cryptlen, &alg->decrypt_tlen); } #endif diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 3e05053b8d57..b109b50906e7 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -517,11 +517,11 @@ struct crypto_alg { #ifdef CONFIG_CRYPTO_STATS union { - atomic_t encrypt_cnt; - atomic_t compress_cnt; - atomic_t generate_cnt; - atomic_t hash_cnt; - atomic_t setsecret_cnt; + atomic64_t encrypt_cnt; + atomic64_t compress_cnt; + atomic64_t generate_cnt; + atomic64_t hash_cnt; + atomic64_t setsecret_cnt; }; union { atomic64_t encrypt_tlen; @@ -530,29 +530,29 @@ struct crypto_alg { atomic64_t hash_tlen; }; union { - atomic_t akcipher_err_cnt; - atomic_t cipher_err_cnt; - atomic_t compress_err_cnt; - atomic_t aead_err_cnt; - atomic_t hash_err_cnt; - atomic_t rng_err_cnt; - atomic_t kpp_err_cnt; + atomic64_t akcipher_err_cnt; + atomic64_t cipher_err_cnt; + atomic64_t compress_err_cnt; + atomic64_t aead_err_cnt; + atomic64_t hash_err_cnt; + atomic64_t rng_err_cnt; + atomic64_t kpp_err_cnt; }; union { - atomic_t decrypt_cnt; - atomic_t decompress_cnt; - atomic_t seed_cnt; - atomic_t generate_public_key_cnt; + atomic64_t decrypt_cnt; + atomic64_t decompress_cnt; + atomic64_t seed_cnt; + atomic64_t generate_public_key_cnt; }; union { atomic64_t decrypt_tlen; atomic64_t decompress_tlen; }; union { - atomic_t verify_cnt; - atomic_t compute_shared_secret_cnt; + atomic64_t verify_cnt; + atomic64_t compute_shared_secret_cnt; }; - atomic_t sign_cnt; + atomic64_t sign_cnt; #endif /* CONFIG_CRYPTO_STATS */ } CRYPTO_MINALIGN_ATTR; @@ -983,9 +983,9 @@ static inline void crypto_stat_ablkcipher_encrypt(struct ablkcipher_request *req crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req)); if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic_inc(&crt->base->base.__crt_alg->cipher_err_cnt); + atomic64_inc(&crt->base->base.__crt_alg->cipher_err_cnt); } else { - atomic_inc(&crt->base->base.__crt_alg->encrypt_cnt); + atomic64_inc(&crt->base->base.__crt_alg->encrypt_cnt); atomic64_add(req->nbytes, &crt->base->base.__crt_alg->encrypt_tlen); } #endif @@ -999,9 +999,9 @@ static inline void crypto_stat_ablkcipher_decrypt(struct ablkcipher_request *req crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req)); if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic_inc(&crt->base->base.__crt_alg->cipher_err_cnt); + atomic64_inc(&crt->base->base.__crt_alg->cipher_err_cnt); } else { - atomic_inc(&crt->base->base.__crt_alg->decrypt_cnt); + atomic64_inc(&crt->base->base.__crt_alg->decrypt_cnt); atomic64_add(req->nbytes, &crt->base->base.__crt_alg->decrypt_tlen); } #endif diff --git a/include/uapi/linux/cryptouser.h b/include/uapi/linux/cryptouser.h index 6dafbc3e4414..9f8187077ce4 100644 --- a/include/uapi/linux/cryptouser.h +++ b/include/uapi/linux/cryptouser.h @@ -79,11 +79,11 @@ struct crypto_user_alg { struct crypto_stat { char type[CRYPTO_MAX_NAME]; union { - __u32 stat_encrypt_cnt; - __u32 stat_compress_cnt; - __u32 stat_generate_cnt; - __u32 stat_hash_cnt; - __u32 stat_setsecret_cnt; + __u64 stat_encrypt_cnt; + __u64 stat_compress_cnt; + __u64 stat_generate_cnt; + __u64 stat_hash_cnt; + __u64 stat_setsecret_cnt; }; union { __u64 stat_encrypt_tlen; @@ -92,29 +92,29 @@ struct crypto_stat { __u64 stat_hash_tlen; }; union { - __u32 stat_akcipher_err_cnt; - __u32 stat_cipher_err_cnt; - __u32 stat_compress_err_cnt; - __u32 stat_aead_err_cnt; - __u32 stat_hash_err_cnt; - __u32 stat_rng_err_cnt; - __u32 stat_kpp_err_cnt; + __u64 stat_akcipher_err_cnt; + __u64 stat_cipher_err_cnt; + __u64 stat_compress_err_cnt; + __u64 stat_aead_err_cnt; + __u64 stat_hash_err_cnt; + __u64 stat_rng_err_cnt; + __u64 stat_kpp_err_cnt; }; union { - __u32 stat_decrypt_cnt; - __u32 stat_decompress_cnt; - __u32 stat_seed_cnt; - __u32 stat_generate_public_key_cnt; + __u64 stat_decrypt_cnt; + __u64 stat_decompress_cnt; + __u64 stat_seed_cnt; + __u64 stat_generate_public_key_cnt; }; union { __u64 stat_decrypt_tlen; __u64 stat_decompress_tlen; }; union { - __u32 stat_verify_cnt; - __u32 stat_compute_shared_secret_cnt; + __u64 stat_verify_cnt; + __u64 stat_compute_shared_secret_cnt; }; - __u32 stat_sign_cnt; + __u64 stat_sign_cnt; }; struct crypto_report_larval { -- cgit v1.2.3-59-g8ed1b From 7f0a9d5c9d1ba8ab3e5b144e52553744dc0d7471 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 29 Nov 2018 14:42:19 +0000 Subject: crypto: user - split user space crypto stat structures It is cleaner to have each stat in their own structures. Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- crypto/crypto_user_stat.c | 20 ++++---- include/uapi/linux/cryptouser.h | 100 +++++++++++++++++++++++++--------------- 2 files changed, 72 insertions(+), 48 deletions(-) diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c index 352569f378a0..3c14be2f7a1b 100644 --- a/crypto/crypto_user_stat.c +++ b/crypto/crypto_user_stat.c @@ -33,7 +33,7 @@ struct crypto_dump_info { static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg) { - struct crypto_stat raead; + struct crypto_stat_aead raead; u64 v64; memset(&raead, 0, sizeof(raead)); @@ -56,7 +56,7 @@ static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg) static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg) { - struct crypto_stat rcipher; + struct crypto_stat_cipher rcipher; u64 v64; memset(&rcipher, 0, sizeof(rcipher)); @@ -79,7 +79,7 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg) static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg) { - struct crypto_stat rcomp; + struct crypto_stat_compress rcomp; u64 v64; memset(&rcomp, 0, sizeof(rcomp)); @@ -101,7 +101,7 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg) static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg) { - struct crypto_stat racomp; + struct crypto_stat_compress racomp; u64 v64; memset(&racomp, 0, sizeof(racomp)); @@ -123,7 +123,7 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg) static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg) { - struct crypto_stat rakcipher; + struct crypto_stat_akcipher rakcipher; u64 v64; memset(&rakcipher, 0, sizeof(rakcipher)); @@ -150,7 +150,7 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg) static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg) { - struct crypto_stat rkpp; + struct crypto_stat_kpp rkpp; u64 v; memset(&rkpp, 0, sizeof(rkpp)); @@ -171,7 +171,7 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg) static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg) { - struct crypto_stat rhash; + struct crypto_stat_hash rhash; u64 v64; memset(&rhash, 0, sizeof(rhash)); @@ -190,7 +190,7 @@ static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg) static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg) { - struct crypto_stat rhash; + struct crypto_stat_hash rhash; u64 v64; memset(&rhash, 0, sizeof(rhash)); @@ -209,7 +209,7 @@ static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg) static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg) { - struct crypto_stat rrng; + struct crypto_stat_rng rrng; u64 v64; memset(&rrng, 0, sizeof(rrng)); @@ -248,7 +248,7 @@ static int crypto_reportstat_one(struct crypto_alg *alg, if (nla_put_u32(skb, CRYPTOCFGA_PRIORITY_VAL, alg->cra_priority)) goto nla_put_failure; if (alg->cra_flags & CRYPTO_ALG_LARVAL) { - struct crypto_stat rl; + struct crypto_stat_larval rl; memset(&rl, 0, sizeof(rl)); strscpy(rl.type, "larval", sizeof(rl.type)); diff --git a/include/uapi/linux/cryptouser.h b/include/uapi/linux/cryptouser.h index 9f8187077ce4..3a70f025e27d 100644 --- a/include/uapi/linux/cryptouser.h +++ b/include/uapi/linux/cryptouser.h @@ -76,45 +76,69 @@ struct crypto_user_alg { __u32 cru_flags; }; -struct crypto_stat { - char type[CRYPTO_MAX_NAME]; - union { - __u64 stat_encrypt_cnt; - __u64 stat_compress_cnt; - __u64 stat_generate_cnt; - __u64 stat_hash_cnt; - __u64 stat_setsecret_cnt; - }; - union { - __u64 stat_encrypt_tlen; - __u64 stat_compress_tlen; - __u64 stat_generate_tlen; - __u64 stat_hash_tlen; - }; - union { - __u64 stat_akcipher_err_cnt; - __u64 stat_cipher_err_cnt; - __u64 stat_compress_err_cnt; - __u64 stat_aead_err_cnt; - __u64 stat_hash_err_cnt; - __u64 stat_rng_err_cnt; - __u64 stat_kpp_err_cnt; - }; - union { - __u64 stat_decrypt_cnt; - __u64 stat_decompress_cnt; - __u64 stat_seed_cnt; - __u64 stat_generate_public_key_cnt; - }; - union { - __u64 stat_decrypt_tlen; - __u64 stat_decompress_tlen; - }; - union { - __u64 stat_verify_cnt; - __u64 stat_compute_shared_secret_cnt; - }; +struct crypto_stat_aead { + char type[CRYPTO_MAX_NAME]; + __u64 stat_encrypt_cnt; + __u64 stat_encrypt_tlen; + __u64 stat_decrypt_cnt; + __u64 stat_decrypt_tlen; + __u64 stat_aead_err_cnt; +}; + +struct crypto_stat_akcipher { + char type[CRYPTO_MAX_NAME]; + __u64 stat_encrypt_cnt; + __u64 stat_encrypt_tlen; + __u64 stat_decrypt_cnt; + __u64 stat_decrypt_tlen; + __u64 stat_verify_cnt; __u64 stat_sign_cnt; + __u64 stat_akcipher_err_cnt; +}; + +struct crypto_stat_cipher { + char type[CRYPTO_MAX_NAME]; + __u64 stat_encrypt_cnt; + __u64 stat_encrypt_tlen; + __u64 stat_decrypt_cnt; + __u64 stat_decrypt_tlen; + __u64 stat_cipher_err_cnt; +}; + +struct crypto_stat_compress { + char type[CRYPTO_MAX_NAME]; + __u64 stat_compress_cnt; + __u64 stat_compress_tlen; + __u64 stat_decompress_cnt; + __u64 stat_decompress_tlen; + __u64 stat_compress_err_cnt; +}; + +struct crypto_stat_hash { + char type[CRYPTO_MAX_NAME]; + __u64 stat_hash_cnt; + __u64 stat_hash_tlen; + __u64 stat_hash_err_cnt; +}; + +struct crypto_stat_kpp { + char type[CRYPTO_MAX_NAME]; + __u64 stat_setsecret_cnt; + __u64 stat_generate_public_key_cnt; + __u64 stat_compute_shared_secret_cnt; + __u64 stat_kpp_err_cnt; +}; + +struct crypto_stat_rng { + char type[CRYPTO_MAX_NAME]; + __u64 stat_generate_cnt; + __u64 stat_generate_tlen; + __u64 stat_seed_cnt; + __u64 stat_rng_err_cnt; +}; + +struct crypto_stat_larval { + char type[CRYPTO_MAX_NAME]; }; struct crypto_report_larval { -- cgit v1.2.3-59-g8ed1b From 76d09ea7c22f2cabf1f66ffc287c23b19b120be9 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 29 Nov 2018 14:42:20 +0000 Subject: crypto: tool: getstat: convert user space example to the new crypto_user_stat uapi This patch converts the getstat example tool to the recent changes done in crypto_user_stat - changed all stats to u64 - separated struct stats for each crypto alg Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- tools/crypto/getstat.c | 54 +++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/tools/crypto/getstat.c b/tools/crypto/getstat.c index 24115173a483..57fbb94608d4 100644 --- a/tools/crypto/getstat.c +++ b/tools/crypto/getstat.c @@ -152,53 +152,53 @@ static int get_stat(const char *drivername) if (tb[CRYPTOCFGA_STAT_HASH]) { struct rtattr *rta = tb[CRYPTOCFGA_STAT_HASH]; - struct crypto_stat *rhash = - (struct crypto_stat *)RTA_DATA(rta); - printf("%s\tHash\n\tHash: %u bytes: %llu\n\tErrors: %u\n", + struct crypto_stat_hash *rhash = + (struct crypto_stat_hash *)RTA_DATA(rta); + printf("%s\tHash\n\tHash: %llu bytes: %llu\n\tErrors: %llu\n", drivername, rhash->stat_hash_cnt, rhash->stat_hash_tlen, rhash->stat_hash_err_cnt); } else if (tb[CRYPTOCFGA_STAT_COMPRESS]) { struct rtattr *rta = tb[CRYPTOCFGA_STAT_COMPRESS]; - struct crypto_stat *rblk = - (struct crypto_stat *)RTA_DATA(rta); - printf("%s\tCompress\n\tCompress: %u bytes: %llu\n\tDecompress: %u bytes: %llu\n\tErrors: %u\n", + struct crypto_stat_compress *rblk = + (struct crypto_stat_compress *)RTA_DATA(rta); + printf("%s\tCompress\n\tCompress: %llu bytes: %llu\n\tDecompress: %llu bytes: %llu\n\tErrors: %llu\n", drivername, rblk->stat_compress_cnt, rblk->stat_compress_tlen, rblk->stat_decompress_cnt, rblk->stat_decompress_tlen, rblk->stat_compress_err_cnt); } else if (tb[CRYPTOCFGA_STAT_ACOMP]) { struct rtattr *rta = tb[CRYPTOCFGA_STAT_ACOMP]; - struct crypto_stat *rcomp = - (struct crypto_stat *)RTA_DATA(rta); - printf("%s\tACompress\n\tCompress: %u bytes: %llu\n\tDecompress: %u bytes: %llu\n\tErrors: %u\n", + struct crypto_stat_compress *rcomp = + (struct crypto_stat_compress *)RTA_DATA(rta); + printf("%s\tACompress\n\tCompress: %llu bytes: %llu\n\tDecompress: %llu bytes: %llu\n\tErrors: %llu\n", drivername, rcomp->stat_compress_cnt, rcomp->stat_compress_tlen, rcomp->stat_decompress_cnt, rcomp->stat_decompress_tlen, rcomp->stat_compress_err_cnt); } else if (tb[CRYPTOCFGA_STAT_AEAD]) { struct rtattr *rta = tb[CRYPTOCFGA_STAT_AEAD]; - struct crypto_stat *raead = - (struct crypto_stat *)RTA_DATA(rta); - printf("%s\tAEAD\n\tEncrypt: %u bytes: %llu\n\tDecrypt: %u bytes: %llu\n\tErrors: %u\n", + struct crypto_stat_aead *raead = + (struct crypto_stat_aead *)RTA_DATA(rta); + printf("%s\tAEAD\n\tEncrypt: %llu bytes: %llu\n\tDecrypt: %llu bytes: %llu\n\tErrors: %llu\n", drivername, raead->stat_encrypt_cnt, raead->stat_encrypt_tlen, raead->stat_decrypt_cnt, raead->stat_decrypt_tlen, raead->stat_aead_err_cnt); } else if (tb[CRYPTOCFGA_STAT_BLKCIPHER]) { struct rtattr *rta = tb[CRYPTOCFGA_STAT_BLKCIPHER]; - struct crypto_stat *rblk = - (struct crypto_stat *)RTA_DATA(rta); - printf("%s\tCipher\n\tEncrypt: %u bytes: %llu\n\tDecrypt: %u bytes: %llu\n\tErrors: %u\n", + struct crypto_stat_cipher *rblk = + (struct crypto_stat_cipher *)RTA_DATA(rta); + printf("%s\tCipher\n\tEncrypt: %llu bytes: %llu\n\tDecrypt: %llu bytes: %llu\n\tErrors: %llu\n", drivername, rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen, rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen, rblk->stat_cipher_err_cnt); } else if (tb[CRYPTOCFGA_STAT_AKCIPHER]) { struct rtattr *rta = tb[CRYPTOCFGA_STAT_AKCIPHER]; - struct crypto_stat *rblk = - (struct crypto_stat *)RTA_DATA(rta); - printf("%s\tAkcipher\n\tEncrypt: %u bytes: %llu\n\tDecrypt: %u bytes: %llu\n\tSign: %u\n\tVerify: %u\n\tErrors: %u\n", + struct crypto_stat_akcipher *rblk = + (struct crypto_stat_akcipher *)RTA_DATA(rta); + printf("%s\tAkcipher\n\tEncrypt: %llu bytes: %llu\n\tDecrypt: %llu bytes: %llu\n\tSign: %llu\n\tVerify: %llu\n\tErrors: %llu\n", drivername, rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen, rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen, @@ -206,27 +206,27 @@ static int get_stat(const char *drivername) rblk->stat_akcipher_err_cnt); } else if (tb[CRYPTOCFGA_STAT_CIPHER]) { struct rtattr *rta = tb[CRYPTOCFGA_STAT_CIPHER]; - struct crypto_stat *rblk = - (struct crypto_stat *)RTA_DATA(rta); - printf("%s\tcipher\n\tEncrypt: %u bytes: %llu\n\tDecrypt: %u bytes: %llu\n\tErrors: %u\n", + struct crypto_stat_cipher *rblk = + (struct crypto_stat_cipher *)RTA_DATA(rta); + printf("%s\tcipher\n\tEncrypt: %llu bytes: %llu\n\tDecrypt: %llu bytes: %llu\n\tErrors: %llu\n", drivername, rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen, rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen, rblk->stat_cipher_err_cnt); } else if (tb[CRYPTOCFGA_STAT_RNG]) { struct rtattr *rta = tb[CRYPTOCFGA_STAT_RNG]; - struct crypto_stat *rrng = - (struct crypto_stat *)RTA_DATA(rta); - printf("%s\tRNG\n\tSeed: %u\n\tGenerate: %u bytes: %llu\n\tErrors: %u\n", + struct crypto_stat_rng *rrng = + (struct crypto_stat_rng *)RTA_DATA(rta); + printf("%s\tRNG\n\tSeed: %llu\n\tGenerate: %llu bytes: %llu\n\tErrors: %llu\n", drivername, rrng->stat_seed_cnt, rrng->stat_generate_cnt, rrng->stat_generate_tlen, rrng->stat_rng_err_cnt); } else if (tb[CRYPTOCFGA_STAT_KPP]) { struct rtattr *rta = tb[CRYPTOCFGA_STAT_KPP]; - struct crypto_stat *rkpp = - (struct crypto_stat *)RTA_DATA(rta); - printf("%s\tKPP\n\tSetsecret: %u\n\tGenerate public key: %u\n\tCompute_shared_secret: %u\n\tErrors: %u\n", + struct crypto_stat_kpp *rkpp = + (struct crypto_stat_kpp *)RTA_DATA(rta); + printf("%s\tKPP\n\tSetsecret: %llu\n\tGenerate public key: %llu\n\tCompute_shared_secret: %llu\n\tErrors: %llu\n", drivername, rkpp->stat_setsecret_cnt, rkpp->stat_generate_public_key_cnt, -- cgit v1.2.3-59-g8ed1b From f7d76e05d058b832b373237566cc1af8251371b5 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 29 Nov 2018 14:42:21 +0000 Subject: crypto: user - fix use_after_free of struct xxx_request All crypto_stats functions use the struct xxx_request for feeding stats, but in some case this structure could already be freed. For fixing this, the needed parameters (len and alg) will be stored before the request being executed. Fixes: cac5818c25d0 ("crypto: user - Implement a generic crypto statistics") Reported-by: syzbot Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- crypto/ahash.c | 17 +++- crypto/algapi.c | 233 +++++++++++++++++++++++++++++++++++++++++++++ crypto/rng.c | 4 +- include/crypto/acompress.h | 38 ++------ include/crypto/aead.h | 38 ++------ include/crypto/akcipher.h | 74 +++----------- include/crypto/hash.h | 32 +------ include/crypto/kpp.h | 48 ++-------- include/crypto/rng.h | 27 +----- include/crypto/skcipher.h | 36 ++----- include/linux/crypto.h | 105 +++++++++++++------- 11 files changed, 376 insertions(+), 276 deletions(-) diff --git a/crypto/ahash.c b/crypto/ahash.c index 3a348fbcf8f9..5d320a811f75 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -364,20 +364,28 @@ static int crypto_ahash_op(struct ahash_request *req, int crypto_ahash_final(struct ahash_request *req) { + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct crypto_alg *alg = tfm->base.__crt_alg; + unsigned int nbytes = req->nbytes; int ret; + crypto_stats_get(alg); ret = crypto_ahash_op(req, crypto_ahash_reqtfm(req)->final); - crypto_stat_ahash_final(req, ret); + crypto_stats_ahash_final(nbytes, ret, alg); return ret; } EXPORT_SYMBOL_GPL(crypto_ahash_final); int crypto_ahash_finup(struct ahash_request *req) { + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct crypto_alg *alg = tfm->base.__crt_alg; + unsigned int nbytes = req->nbytes; int ret; + crypto_stats_get(alg); ret = crypto_ahash_op(req, crypto_ahash_reqtfm(req)->finup); - crypto_stat_ahash_final(req, ret); + crypto_stats_ahash_final(nbytes, ret, alg); return ret; } EXPORT_SYMBOL_GPL(crypto_ahash_finup); @@ -385,13 +393,16 @@ EXPORT_SYMBOL_GPL(crypto_ahash_finup); int crypto_ahash_digest(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct crypto_alg *alg = tfm->base.__crt_alg; + unsigned int nbytes = req->nbytes; int ret; + crypto_stats_get(alg); if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) ret = -ENOKEY; else ret = crypto_ahash_op(req, tfm->digest); - crypto_stat_ahash_final(req, ret); + crypto_stats_ahash_final(nbytes, ret, alg); return ret; } EXPORT_SYMBOL_GPL(crypto_ahash_digest); diff --git a/crypto/algapi.c b/crypto/algapi.c index 42fe316f80ee..4c1e6079d271 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -1078,6 +1078,239 @@ int crypto_type_has_alg(const char *name, const struct crypto_type *frontend, } EXPORT_SYMBOL_GPL(crypto_type_has_alg); +#ifdef CONFIG_CRYPTO_STATS +void crypto_stats_get(struct crypto_alg *alg) +{ + crypto_alg_get(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_get); + +void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret, + struct crypto_alg *alg) +{ + if (ret && ret != -EINPROGRESS && ret != -EBUSY) { + atomic64_inc(&alg->cipher_err_cnt); + } else { + atomic64_inc(&alg->encrypt_cnt); + atomic64_add(nbytes, &alg->encrypt_tlen); + } + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_ablkcipher_encrypt); + +void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret, + struct crypto_alg *alg) +{ + if (ret && ret != -EINPROGRESS && ret != -EBUSY) { + atomic64_inc(&alg->cipher_err_cnt); + } else { + atomic64_inc(&alg->decrypt_cnt); + atomic64_add(nbytes, &alg->decrypt_tlen); + } + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_ablkcipher_decrypt); + +void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, + int ret) +{ + if (ret && ret != -EINPROGRESS && ret != -EBUSY) { + atomic64_inc(&alg->aead_err_cnt); + } else { + atomic64_inc(&alg->encrypt_cnt); + atomic64_add(cryptlen, &alg->encrypt_tlen); + } + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_aead_encrypt); + +void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, + int ret) +{ + if (ret && ret != -EINPROGRESS && ret != -EBUSY) { + atomic64_inc(&alg->aead_err_cnt); + } else { + atomic64_inc(&alg->decrypt_cnt); + atomic64_add(cryptlen, &alg->decrypt_tlen); + } + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_aead_decrypt); + +void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, + struct crypto_alg *alg) +{ + if (ret && ret != -EINPROGRESS && ret != -EBUSY) { + atomic64_inc(&alg->akcipher_err_cnt); + } else { + atomic64_inc(&alg->encrypt_cnt); + atomic64_add(src_len, &alg->encrypt_tlen); + } + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_akcipher_encrypt); + +void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret, + struct crypto_alg *alg) +{ + if (ret && ret != -EINPROGRESS && ret != -EBUSY) { + atomic64_inc(&alg->akcipher_err_cnt); + } else { + atomic64_inc(&alg->decrypt_cnt); + atomic64_add(src_len, &alg->decrypt_tlen); + } + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_akcipher_decrypt); + +void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg) +{ + if (ret && ret != -EINPROGRESS && ret != -EBUSY) + atomic64_inc(&alg->akcipher_err_cnt); + else + atomic64_inc(&alg->sign_cnt); + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_akcipher_sign); + +void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg) +{ + if (ret && ret != -EINPROGRESS && ret != -EBUSY) + atomic64_inc(&alg->akcipher_err_cnt); + else + atomic64_inc(&alg->verify_cnt); + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_akcipher_verify); + +void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg) +{ + if (ret && ret != -EINPROGRESS && ret != -EBUSY) { + atomic64_inc(&alg->compress_err_cnt); + } else { + atomic64_inc(&alg->compress_cnt); + atomic64_add(slen, &alg->compress_tlen); + } + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_compress); + +void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg) +{ + if (ret && ret != -EINPROGRESS && ret != -EBUSY) { + atomic64_inc(&alg->compress_err_cnt); + } else { + atomic64_inc(&alg->decompress_cnt); + atomic64_add(slen, &alg->decompress_tlen); + } + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_decompress); + +void crypto_stats_ahash_update(unsigned int nbytes, int ret, + struct crypto_alg *alg) +{ + if (ret && ret != -EINPROGRESS && ret != -EBUSY) + atomic64_inc(&alg->hash_err_cnt); + else + atomic64_add(nbytes, &alg->hash_tlen); + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_ahash_update); + +void crypto_stats_ahash_final(unsigned int nbytes, int ret, + struct crypto_alg *alg) +{ + if (ret && ret != -EINPROGRESS && ret != -EBUSY) { + atomic64_inc(&alg->hash_err_cnt); + } else { + atomic64_inc(&alg->hash_cnt); + atomic64_add(nbytes, &alg->hash_tlen); + } + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_ahash_final); + +void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret) +{ + if (ret) + atomic64_inc(&alg->kpp_err_cnt); + else + atomic64_inc(&alg->setsecret_cnt); + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_kpp_set_secret); + +void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret) +{ + if (ret) + atomic64_inc(&alg->kpp_err_cnt); + else + atomic64_inc(&alg->generate_public_key_cnt); + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_kpp_generate_public_key); + +void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret) +{ + if (ret) + atomic64_inc(&alg->kpp_err_cnt); + else + atomic64_inc(&alg->compute_shared_secret_cnt); + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_kpp_compute_shared_secret); + +void crypto_stats_rng_seed(struct crypto_alg *alg, int ret) +{ + if (ret && ret != -EINPROGRESS && ret != -EBUSY) + atomic64_inc(&alg->rng_err_cnt); + else + atomic64_inc(&alg->seed_cnt); + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_rng_seed); + +void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, + int ret) +{ + if (ret && ret != -EINPROGRESS && ret != -EBUSY) { + atomic64_inc(&alg->rng_err_cnt); + } else { + atomic64_inc(&alg->generate_cnt); + atomic64_add(dlen, &alg->generate_tlen); + } + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_rng_generate); + +void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, + struct crypto_alg *alg) +{ + if (ret && ret != -EINPROGRESS && ret != -EBUSY) { + atomic64_inc(&alg->cipher_err_cnt); + } else { + atomic64_inc(&alg->encrypt_cnt); + atomic64_add(cryptlen, &alg->encrypt_tlen); + } + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_skcipher_encrypt); + +void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, + struct crypto_alg *alg) +{ + if (ret && ret != -EINPROGRESS && ret != -EBUSY) { + atomic64_inc(&alg->cipher_err_cnt); + } else { + atomic64_inc(&alg->decrypt_cnt); + atomic64_add(cryptlen, &alg->decrypt_tlen); + } + crypto_alg_put(alg); +} +EXPORT_SYMBOL_GPL(crypto_stats_skcipher_decrypt); +#endif + static int __init crypto_algapi_init(void) { crypto_init_proc(); diff --git a/crypto/rng.c b/crypto/rng.c index 2406501b90b7..33c38a72bff5 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -35,9 +35,11 @@ static int crypto_default_rng_refcnt; int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { + struct crypto_alg *alg = tfm->base.__crt_alg; u8 *buf = NULL; int err; + crypto_stats_get(alg); if (!seed && slen) { buf = kmalloc(slen, GFP_KERNEL); if (!buf) @@ -50,7 +52,7 @@ int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) } err = crypto_rng_alg(tfm)->seed(tfm, seed, slen); - crypto_stat_rng_seed(tfm, err); + crypto_stats_rng_seed(alg, err); out: kzfree(buf); return err; diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h index f79918196811..a3e766dff917 100644 --- a/include/crypto/acompress.h +++ b/include/crypto/acompress.h @@ -234,34 +234,6 @@ static inline void acomp_request_set_params(struct acomp_req *req, req->flags |= CRYPTO_ACOMP_ALLOC_OUTPUT; } -static inline void crypto_stat_compress(struct acomp_req *req, int ret) -{ -#ifdef CONFIG_CRYPTO_STATS - struct crypto_acomp *tfm = crypto_acomp_reqtfm(req); - - if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&tfm->base.__crt_alg->compress_err_cnt); - } else { - atomic64_inc(&tfm->base.__crt_alg->compress_cnt); - atomic64_add(req->slen, &tfm->base.__crt_alg->compress_tlen); - } -#endif -} - -static inline void crypto_stat_decompress(struct acomp_req *req, int ret) -{ -#ifdef CONFIG_CRYPTO_STATS - struct crypto_acomp *tfm = crypto_acomp_reqtfm(req); - - if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&tfm->base.__crt_alg->compress_err_cnt); - } else { - atomic64_inc(&tfm->base.__crt_alg->decompress_cnt); - atomic64_add(req->slen, &tfm->base.__crt_alg->decompress_tlen); - } -#endif -} - /** * crypto_acomp_compress() -- Invoke asynchronous compress operation * @@ -274,10 +246,13 @@ static inline void crypto_stat_decompress(struct acomp_req *req, int ret) static inline int crypto_acomp_compress(struct acomp_req *req) { struct crypto_acomp *tfm = crypto_acomp_reqtfm(req); + struct crypto_alg *alg = tfm->base.__crt_alg; + unsigned int slen = req->slen; int ret; + crypto_stats_get(alg); ret = tfm->compress(req); - crypto_stat_compress(req, ret); + crypto_stats_compress(slen, ret, alg); return ret; } @@ -293,10 +268,13 @@ static inline int crypto_acomp_compress(struct acomp_req *req) static inline int crypto_acomp_decompress(struct acomp_req *req) { struct crypto_acomp *tfm = crypto_acomp_reqtfm(req); + struct crypto_alg *alg = tfm->base.__crt_alg; + unsigned int slen = req->slen; int ret; + crypto_stats_get(alg); ret = tfm->decompress(req); - crypto_stat_decompress(req, ret); + crypto_stats_decompress(slen, ret, alg); return ret; } diff --git a/include/crypto/aead.h b/include/crypto/aead.h index 99afd78c665d..b7b8d24cf765 100644 --- a/include/crypto/aead.h +++ b/include/crypto/aead.h @@ -306,34 +306,6 @@ static inline struct crypto_aead *crypto_aead_reqtfm(struct aead_request *req) return __crypto_aead_cast(req->base.tfm); } -static inline void crypto_stat_aead_encrypt(struct aead_request *req, int ret) -{ -#ifdef CONFIG_CRYPTO_STATS - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - - if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&tfm->base.__crt_alg->aead_err_cnt); - } else { - atomic64_inc(&tfm->base.__crt_alg->encrypt_cnt); - atomic64_add(req->cryptlen, &tfm->base.__crt_alg->encrypt_tlen); - } -#endif -} - -static inline void crypto_stat_aead_decrypt(struct aead_request *req, int ret) -{ -#ifdef CONFIG_CRYPTO_STATS - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - - if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&tfm->base.__crt_alg->aead_err_cnt); - } else { - atomic64_inc(&tfm->base.__crt_alg->decrypt_cnt); - atomic64_add(req->cryptlen, &tfm->base.__crt_alg->decrypt_tlen); - } -#endif -} - /** * crypto_aead_encrypt() - encrypt plaintext * @req: reference to the aead_request handle that holds all information @@ -356,13 +328,16 @@ static inline void crypto_stat_aead_decrypt(struct aead_request *req, int ret) static inline int crypto_aead_encrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct crypto_alg *alg = aead->base.__crt_alg; + unsigned int cryptlen = req->cryptlen; int ret; + crypto_stats_get(alg); if (crypto_aead_get_flags(aead) & CRYPTO_TFM_NEED_KEY) ret = -ENOKEY; else ret = crypto_aead_alg(aead)->encrypt(req); - crypto_stat_aead_encrypt(req, ret); + crypto_stats_aead_encrypt(cryptlen, alg, ret); return ret; } @@ -391,15 +366,18 @@ static inline int crypto_aead_encrypt(struct aead_request *req) static inline int crypto_aead_decrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct crypto_alg *alg = aead->base.__crt_alg; + unsigned int cryptlen = req->cryptlen; int ret; + crypto_stats_get(alg); if (crypto_aead_get_flags(aead) & CRYPTO_TFM_NEED_KEY) ret = -ENOKEY; else if (req->cryptlen < crypto_aead_authsize(aead)) ret = -EINVAL; else ret = crypto_aead_alg(aead)->decrypt(req); - crypto_stat_aead_decrypt(req, ret); + crypto_stats_aead_decrypt(cryptlen, alg, ret); return ret; } diff --git a/include/crypto/akcipher.h b/include/crypto/akcipher.h index 3dc05cf7e0a9..2d690494568c 100644 --- a/include/crypto/akcipher.h +++ b/include/crypto/akcipher.h @@ -271,62 +271,6 @@ static inline unsigned int crypto_akcipher_maxsize(struct crypto_akcipher *tfm) return alg->max_size(tfm); } -static inline void crypto_stat_akcipher_encrypt(struct akcipher_request *req, - int ret) -{ -#ifdef CONFIG_CRYPTO_STATS - struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - - if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt); - } else { - atomic64_inc(&tfm->base.__crt_alg->encrypt_cnt); - atomic64_add(req->src_len, &tfm->base.__crt_alg->encrypt_tlen); - } -#endif -} - -static inline void crypto_stat_akcipher_decrypt(struct akcipher_request *req, - int ret) -{ -#ifdef CONFIG_CRYPTO_STATS - struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - - if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt); - } else { - atomic64_inc(&tfm->base.__crt_alg->decrypt_cnt); - atomic64_add(req->src_len, &tfm->base.__crt_alg->decrypt_tlen); - } -#endif -} - -static inline void crypto_stat_akcipher_sign(struct akcipher_request *req, - int ret) -{ -#ifdef CONFIG_CRYPTO_STATS - struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - - if (ret && ret != -EINPROGRESS && ret != -EBUSY) - atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt); - else - atomic64_inc(&tfm->base.__crt_alg->sign_cnt); -#endif -} - -static inline void crypto_stat_akcipher_verify(struct akcipher_request *req, - int ret) -{ -#ifdef CONFIG_CRYPTO_STATS - struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - - if (ret && ret != -EINPROGRESS && ret != -EBUSY) - atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt); - else - atomic64_inc(&tfm->base.__crt_alg->verify_cnt); -#endif -} - /** * crypto_akcipher_encrypt() - Invoke public key encrypt operation * @@ -341,10 +285,13 @@ static inline int crypto_akcipher_encrypt(struct akcipher_request *req) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); struct akcipher_alg *alg = crypto_akcipher_alg(tfm); + struct crypto_alg *calg = tfm->base.__crt_alg; + unsigned int src_len = req->src_len; int ret; + crypto_stats_get(calg); ret = alg->encrypt(req); - crypto_stat_akcipher_encrypt(req, ret); + crypto_stats_akcipher_encrypt(src_len, ret, calg); return ret; } @@ -362,10 +309,13 @@ static inline int crypto_akcipher_decrypt(struct akcipher_request *req) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); struct akcipher_alg *alg = crypto_akcipher_alg(tfm); + struct crypto_alg *calg = tfm->base.__crt_alg; + unsigned int src_len = req->src_len; int ret; + crypto_stats_get(calg); ret = alg->decrypt(req); - crypto_stat_akcipher_decrypt(req, ret); + crypto_stats_akcipher_decrypt(src_len, ret, calg); return ret; } @@ -383,10 +333,12 @@ static inline int crypto_akcipher_sign(struct akcipher_request *req) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); struct akcipher_alg *alg = crypto_akcipher_alg(tfm); + struct crypto_alg *calg = tfm->base.__crt_alg; int ret; + crypto_stats_get(calg); ret = alg->sign(req); - crypto_stat_akcipher_sign(req, ret); + crypto_stats_akcipher_sign(ret, calg); return ret; } @@ -404,10 +356,12 @@ static inline int crypto_akcipher_verify(struct akcipher_request *req) { struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); struct akcipher_alg *alg = crypto_akcipher_alg(tfm); + struct crypto_alg *calg = tfm->base.__crt_alg; int ret; + crypto_stats_get(calg); ret = alg->verify(req); - crypto_stat_akcipher_verify(req, ret); + crypto_stats_akcipher_verify(ret, calg); return ret; } diff --git a/include/crypto/hash.h b/include/crypto/hash.h index 52920bed05ba..3b31c1b349ae 100644 --- a/include/crypto/hash.h +++ b/include/crypto/hash.h @@ -412,32 +412,6 @@ static inline void *ahash_request_ctx(struct ahash_request *req) int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen); -static inline void crypto_stat_ahash_update(struct ahash_request *req, int ret) -{ -#ifdef CONFIG_CRYPTO_STATS - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - - if (ret && ret != -EINPROGRESS && ret != -EBUSY) - atomic64_inc(&tfm->base.__crt_alg->hash_err_cnt); - else - atomic64_add(req->nbytes, &tfm->base.__crt_alg->hash_tlen); -#endif -} - -static inline void crypto_stat_ahash_final(struct ahash_request *req, int ret) -{ -#ifdef CONFIG_CRYPTO_STATS - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - - if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&tfm->base.__crt_alg->hash_err_cnt); - } else { - atomic64_inc(&tfm->base.__crt_alg->hash_cnt); - atomic64_add(req->nbytes, &tfm->base.__crt_alg->hash_tlen); - } -#endif -} - /** * crypto_ahash_finup() - update and finalize message digest * @req: reference to the ahash_request handle that holds all information @@ -552,10 +526,14 @@ static inline int crypto_ahash_init(struct ahash_request *req) */ static inline int crypto_ahash_update(struct ahash_request *req) { + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct crypto_alg *alg = tfm->base.__crt_alg; + unsigned int nbytes = req->nbytes; int ret; + crypto_stats_get(alg); ret = crypto_ahash_reqtfm(req)->update(req); - crypto_stat_ahash_update(req, ret); + crypto_stats_ahash_update(nbytes, ret, alg); return ret; } diff --git a/include/crypto/kpp.h b/include/crypto/kpp.h index bd5103a80919..1a97e1601422 100644 --- a/include/crypto/kpp.h +++ b/include/crypto/kpp.h @@ -268,42 +268,6 @@ struct kpp_secret { unsigned short len; }; -static inline void crypto_stat_kpp_set_secret(struct crypto_kpp *tfm, int ret) -{ -#ifdef CONFIG_CRYPTO_STATS - if (ret) - atomic64_inc(&tfm->base.__crt_alg->kpp_err_cnt); - else - atomic64_inc(&tfm->base.__crt_alg->setsecret_cnt); -#endif -} - -static inline void crypto_stat_kpp_generate_public_key(struct kpp_request *req, - int ret) -{ -#ifdef CONFIG_CRYPTO_STATS - struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); - - if (ret) - atomic64_inc(&tfm->base.__crt_alg->kpp_err_cnt); - else - atomic64_inc(&tfm->base.__crt_alg->generate_public_key_cnt); -#endif -} - -static inline void crypto_stat_kpp_compute_shared_secret(struct kpp_request *req, - int ret) -{ -#ifdef CONFIG_CRYPTO_STATS - struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); - - if (ret) - atomic64_inc(&tfm->base.__crt_alg->kpp_err_cnt); - else - atomic64_inc(&tfm->base.__crt_alg->compute_shared_secret_cnt); -#endif -} - /** * crypto_kpp_set_secret() - Invoke kpp operation * @@ -323,10 +287,12 @@ static inline int crypto_kpp_set_secret(struct crypto_kpp *tfm, const void *buffer, unsigned int len) { struct kpp_alg *alg = crypto_kpp_alg(tfm); + struct crypto_alg *calg = tfm->base.__crt_alg; int ret; + crypto_stats_get(calg); ret = alg->set_secret(tfm, buffer, len); - crypto_stat_kpp_set_secret(tfm, ret); + crypto_stats_kpp_set_secret(calg, ret); return ret; } @@ -347,10 +313,12 @@ static inline int crypto_kpp_generate_public_key(struct kpp_request *req) { struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); struct kpp_alg *alg = crypto_kpp_alg(tfm); + struct crypto_alg *calg = tfm->base.__crt_alg; int ret; + crypto_stats_get(calg); ret = alg->generate_public_key(req); - crypto_stat_kpp_generate_public_key(req, ret); + crypto_stats_kpp_generate_public_key(calg, ret); return ret; } @@ -368,10 +336,12 @@ static inline int crypto_kpp_compute_shared_secret(struct kpp_request *req) { struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); struct kpp_alg *alg = crypto_kpp_alg(tfm); + struct crypto_alg *calg = tfm->base.__crt_alg; int ret; + crypto_stats_get(calg); ret = alg->compute_shared_secret(req); - crypto_stat_kpp_compute_shared_secret(req, ret); + crypto_stats_kpp_compute_shared_secret(calg, ret); return ret; } diff --git a/include/crypto/rng.h b/include/crypto/rng.h index 966615bba45e..022a1b896b47 100644 --- a/include/crypto/rng.h +++ b/include/crypto/rng.h @@ -122,29 +122,6 @@ static inline void crypto_free_rng(struct crypto_rng *tfm) crypto_destroy_tfm(tfm, crypto_rng_tfm(tfm)); } -static inline void crypto_stat_rng_seed(struct crypto_rng *tfm, int ret) -{ -#ifdef CONFIG_CRYPTO_STATS - if (ret && ret != -EINPROGRESS && ret != -EBUSY) - atomic64_inc(&tfm->base.__crt_alg->rng_err_cnt); - else - atomic64_inc(&tfm->base.__crt_alg->seed_cnt); -#endif -} - -static inline void crypto_stat_rng_generate(struct crypto_rng *tfm, - unsigned int dlen, int ret) -{ -#ifdef CONFIG_CRYPTO_STATS - if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&tfm->base.__crt_alg->rng_err_cnt); - } else { - atomic64_inc(&tfm->base.__crt_alg->generate_cnt); - atomic64_add(dlen, &tfm->base.__crt_alg->generate_tlen); - } -#endif -} - /** * crypto_rng_generate() - get random number * @tfm: cipher handle @@ -163,10 +140,12 @@ static inline int crypto_rng_generate(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int dlen) { + struct crypto_alg *alg = tfm->base.__crt_alg; int ret; + crypto_stats_get(alg); ret = crypto_rng_alg(tfm)->generate(tfm, src, slen, dst, dlen); - crypto_stat_rng_generate(tfm, dlen, ret); + crypto_stats_rng_generate(alg, dlen, ret); return ret; } diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h index dff54731ddf4..480f8301a47d 100644 --- a/include/crypto/skcipher.h +++ b/include/crypto/skcipher.h @@ -486,32 +486,6 @@ static inline struct crypto_sync_skcipher *crypto_sync_skcipher_reqtfm( return container_of(tfm, struct crypto_sync_skcipher, base); } -static inline void crypto_stat_skcipher_encrypt(struct skcipher_request *req, - int ret, struct crypto_alg *alg) -{ -#ifdef CONFIG_CRYPTO_STATS - if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->cipher_err_cnt); - } else { - atomic64_inc(&alg->encrypt_cnt); - atomic64_add(req->cryptlen, &alg->encrypt_tlen); - } -#endif -} - -static inline void crypto_stat_skcipher_decrypt(struct skcipher_request *req, - int ret, struct crypto_alg *alg) -{ -#ifdef CONFIG_CRYPTO_STATS - if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->cipher_err_cnt); - } else { - atomic64_inc(&alg->decrypt_cnt); - atomic64_add(req->cryptlen, &alg->decrypt_tlen); - } -#endif -} - /** * crypto_skcipher_encrypt() - encrypt plaintext * @req: reference to the skcipher_request handle that holds all information @@ -526,13 +500,16 @@ static inline void crypto_stat_skcipher_decrypt(struct skcipher_request *req, static inline int crypto_skcipher_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_alg *alg = tfm->base.__crt_alg; + unsigned int cryptlen = req->cryptlen; int ret; + crypto_stats_get(alg); if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) ret = -ENOKEY; else ret = tfm->encrypt(req); - crypto_stat_skcipher_encrypt(req, ret, tfm->base.__crt_alg); + crypto_stats_skcipher_encrypt(cryptlen, ret, alg); return ret; } @@ -550,13 +527,16 @@ static inline int crypto_skcipher_encrypt(struct skcipher_request *req) static inline int crypto_skcipher_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_alg *alg = tfm->base.__crt_alg; + unsigned int cryptlen = req->cryptlen; int ret; + crypto_stats_get(alg); if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) ret = -ENOKEY; else ret = tfm->decrypt(req); - crypto_stat_skcipher_decrypt(req, ret, tfm->base.__crt_alg); + crypto_stats_skcipher_decrypt(cryptlen, ret, alg); return ret; } diff --git a/include/linux/crypto.h b/include/linux/crypto.h index b109b50906e7..e2fd24714e00 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -557,6 +557,69 @@ struct crypto_alg { } CRYPTO_MINALIGN_ATTR; +#ifdef CONFIG_CRYPTO_STATS +void crypto_stats_get(struct crypto_alg *alg); +void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret, struct crypto_alg *alg); +void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret, struct crypto_alg *alg); +void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret); +void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret); +void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg); +void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg); +void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, struct crypto_alg *alg); +void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret, struct crypto_alg *alg); +void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg); +void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg); +void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg); +void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg); +void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret); +void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret); +void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret); +void crypto_stats_rng_seed(struct crypto_alg *alg, int ret); +void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret); +void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg); +void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg); +#else +static inline void crypto_stats_get(struct crypto_alg *alg) +{} +static inline void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret, struct crypto_alg *alg) +{} +static inline void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret, struct crypto_alg *alg) +{} +static inline void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret) +{} +static inline void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret) +{} +static inline void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg) +{} +static inline void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg) +{} +static inline void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, struct crypto_alg *alg) +{} +static inline void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret, struct crypto_alg *alg) +{} +static inline void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg) +{} +static inline void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg) +{} +static inline void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg) +{} +static inline void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg) +{} +static inline void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret) +{} +static inline void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret) +{} +static inline void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret) +{} +static inline void crypto_stats_rng_seed(struct crypto_alg *alg, int ret) +{} +static inline void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret) +{} +static inline void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg) +{} +static inline void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg) +{} +#endif /* * A helper struct for waiting for completion of async crypto ops */ @@ -975,38 +1038,6 @@ static inline struct crypto_ablkcipher *crypto_ablkcipher_reqtfm( return __crypto_ablkcipher_cast(req->base.tfm); } -static inline void crypto_stat_ablkcipher_encrypt(struct ablkcipher_request *req, - int ret) -{ -#ifdef CONFIG_CRYPTO_STATS - struct ablkcipher_tfm *crt = - crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req)); - - if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&crt->base->base.__crt_alg->cipher_err_cnt); - } else { - atomic64_inc(&crt->base->base.__crt_alg->encrypt_cnt); - atomic64_add(req->nbytes, &crt->base->base.__crt_alg->encrypt_tlen); - } -#endif -} - -static inline void crypto_stat_ablkcipher_decrypt(struct ablkcipher_request *req, - int ret) -{ -#ifdef CONFIG_CRYPTO_STATS - struct ablkcipher_tfm *crt = - crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req)); - - if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&crt->base->base.__crt_alg->cipher_err_cnt); - } else { - atomic64_inc(&crt->base->base.__crt_alg->decrypt_cnt); - atomic64_add(req->nbytes, &crt->base->base.__crt_alg->decrypt_tlen); - } -#endif -} - /** * crypto_ablkcipher_encrypt() - encrypt plaintext * @req: reference to the ablkcipher_request handle that holds all information @@ -1022,10 +1053,13 @@ static inline int crypto_ablkcipher_encrypt(struct ablkcipher_request *req) { struct ablkcipher_tfm *crt = crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req)); + struct crypto_alg *alg = crt->base->base.__crt_alg; + unsigned int nbytes = req->nbytes; int ret; + crypto_stats_get(alg); ret = crt->encrypt(req); - crypto_stat_ablkcipher_encrypt(req, ret); + crypto_stats_ablkcipher_encrypt(nbytes, ret, alg); return ret; } @@ -1044,10 +1078,13 @@ static inline int crypto_ablkcipher_decrypt(struct ablkcipher_request *req) { struct ablkcipher_tfm *crt = crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req)); + struct crypto_alg *alg = crt->base->base.__crt_alg; + unsigned int nbytes = req->nbytes; int ret; + crypto_stats_get(alg); ret = crt->decrypt(req); - crypto_stat_ablkcipher_decrypt(req, ret); + crypto_stats_ablkcipher_decrypt(nbytes, ret, alg); return ret; } -- cgit v1.2.3-59-g8ed1b From b0af91c14109d6c9c0d73428dc0512b780f41d94 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 29 Nov 2018 14:42:22 +0000 Subject: crypto: user - Fix invalid stat reporting Some error count use the wrong name for getting this data. But this had not caused any reporting problem, since all error count are shared in the same union. Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- crypto/crypto_user_stat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c index 3c14be2f7a1b..838123758423 100644 --- a/crypto/crypto_user_stat.c +++ b/crypto/crypto_user_stat.c @@ -93,7 +93,7 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg) rcomp.stat_decompress_cnt = v64; v64 = atomic64_read(&alg->decompress_tlen); rcomp.stat_decompress_tlen = v64; - v64 = atomic64_read(&alg->cipher_err_cnt); + v64 = atomic64_read(&alg->compress_err_cnt); rcomp.stat_compress_err_cnt = v64; return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp); @@ -115,7 +115,7 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg) racomp.stat_decompress_cnt = v64; v64 = atomic64_read(&alg->decompress_tlen); racomp.stat_decompress_tlen = v64; - v64 = atomic64_read(&alg->cipher_err_cnt); + v64 = atomic64_read(&alg->compress_err_cnt); racomp.stat_compress_err_cnt = v64; return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp); @@ -222,7 +222,7 @@ static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg) rrng.stat_generate_tlen = v64; v64 = atomic64_read(&alg->seed_cnt); rrng.stat_seed_cnt = v64; - v64 = atomic64_read(&alg->hash_err_cnt); + v64 = atomic64_read(&alg->rng_err_cnt); rrng.stat_rng_err_cnt = v64; return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng); -- cgit v1.2.3-59-g8ed1b From 5fff81729f09f3d7d9be0ace50be112bd34f0bb9 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 29 Nov 2018 14:42:23 +0000 Subject: crypto: user - remove intermediate variable The use of the v64 intermediate variable is useless, and removing it bring to much readable code. Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- crypto/crypto_user_stat.c | 132 ++++++++++++++-------------------------------- 1 file changed, 41 insertions(+), 91 deletions(-) diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c index 838123758423..7b668c659122 100644 --- a/crypto/crypto_user_stat.c +++ b/crypto/crypto_user_stat.c @@ -34,22 +34,16 @@ struct crypto_dump_info { static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat_aead raead; - u64 v64; memset(&raead, 0, sizeof(raead)); strscpy(raead.type, "aead", sizeof(raead.type)); - v64 = atomic64_read(&alg->encrypt_cnt); - raead.stat_encrypt_cnt = v64; - v64 = atomic64_read(&alg->encrypt_tlen); - raead.stat_encrypt_tlen = v64; - v64 = atomic64_read(&alg->decrypt_cnt); - raead.stat_decrypt_cnt = v64; - v64 = atomic64_read(&alg->decrypt_tlen); - raead.stat_decrypt_tlen = v64; - v64 = atomic64_read(&alg->aead_err_cnt); - raead.stat_aead_err_cnt = v64; + raead.stat_encrypt_cnt = atomic64_read(&alg->encrypt_cnt); + raead.stat_encrypt_tlen = atomic64_read(&alg->encrypt_tlen); + raead.stat_decrypt_cnt = atomic64_read(&alg->decrypt_cnt); + raead.stat_decrypt_tlen = atomic64_read(&alg->decrypt_tlen); + raead.stat_aead_err_cnt = atomic64_read(&alg->aead_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_AEAD, sizeof(raead), &raead); } @@ -57,22 +51,16 @@ static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg) static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat_cipher rcipher; - u64 v64; memset(&rcipher, 0, sizeof(rcipher)); strscpy(rcipher.type, "cipher", sizeof(rcipher.type)); - v64 = atomic64_read(&alg->encrypt_cnt); - rcipher.stat_encrypt_cnt = v64; - v64 = atomic64_read(&alg->encrypt_tlen); - rcipher.stat_encrypt_tlen = v64; - v64 = atomic64_read(&alg->decrypt_cnt); - rcipher.stat_decrypt_cnt = v64; - v64 = atomic64_read(&alg->decrypt_tlen); - rcipher.stat_decrypt_tlen = v64; - v64 = atomic64_read(&alg->cipher_err_cnt); - rcipher.stat_cipher_err_cnt = v64; + rcipher.stat_encrypt_cnt = atomic64_read(&alg->encrypt_cnt); + rcipher.stat_encrypt_tlen = atomic64_read(&alg->encrypt_tlen); + rcipher.stat_decrypt_cnt = atomic64_read(&alg->decrypt_cnt); + rcipher.stat_decrypt_tlen = atomic64_read(&alg->decrypt_tlen); + rcipher.stat_cipher_err_cnt = atomic64_read(&alg->cipher_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher); } @@ -80,21 +68,15 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg) static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat_compress rcomp; - u64 v64; memset(&rcomp, 0, sizeof(rcomp)); strscpy(rcomp.type, "compression", sizeof(rcomp.type)); - v64 = atomic64_read(&alg->compress_cnt); - rcomp.stat_compress_cnt = v64; - v64 = atomic64_read(&alg->compress_tlen); - rcomp.stat_compress_tlen = v64; - v64 = atomic64_read(&alg->decompress_cnt); - rcomp.stat_decompress_cnt = v64; - v64 = atomic64_read(&alg->decompress_tlen); - rcomp.stat_decompress_tlen = v64; - v64 = atomic64_read(&alg->compress_err_cnt); - rcomp.stat_compress_err_cnt = v64; + rcomp.stat_compress_cnt = atomic64_read(&alg->compress_cnt); + rcomp.stat_compress_tlen = atomic64_read(&alg->compress_tlen); + rcomp.stat_decompress_cnt = atomic64_read(&alg->decompress_cnt); + rcomp.stat_decompress_tlen = atomic64_read(&alg->decompress_tlen); + rcomp.stat_compress_err_cnt = atomic64_read(&alg->compress_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp); } @@ -102,21 +84,15 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg) static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat_compress racomp; - u64 v64; memset(&racomp, 0, sizeof(racomp)); strscpy(racomp.type, "acomp", sizeof(racomp.type)); - v64 = atomic64_read(&alg->compress_cnt); - racomp.stat_compress_cnt = v64; - v64 = atomic64_read(&alg->compress_tlen); - racomp.stat_compress_tlen = v64; - v64 = atomic64_read(&alg->decompress_cnt); - racomp.stat_decompress_cnt = v64; - v64 = atomic64_read(&alg->decompress_tlen); - racomp.stat_decompress_tlen = v64; - v64 = atomic64_read(&alg->compress_err_cnt); - racomp.stat_compress_err_cnt = v64; + racomp.stat_compress_cnt = atomic64_read(&alg->compress_cnt); + racomp.stat_compress_tlen = atomic64_read(&alg->compress_tlen); + racomp.stat_decompress_cnt = atomic64_read(&alg->decompress_cnt); + racomp.stat_decompress_tlen = atomic64_read(&alg->decompress_tlen); + racomp.stat_compress_err_cnt = atomic64_read(&alg->compress_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp); } @@ -124,25 +100,17 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg) static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat_akcipher rakcipher; - u64 v64; memset(&rakcipher, 0, sizeof(rakcipher)); strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type)); - v64 = atomic64_read(&alg->encrypt_cnt); - rakcipher.stat_encrypt_cnt = v64; - v64 = atomic64_read(&alg->encrypt_tlen); - rakcipher.stat_encrypt_tlen = v64; - v64 = atomic64_read(&alg->decrypt_cnt); - rakcipher.stat_decrypt_cnt = v64; - v64 = atomic64_read(&alg->decrypt_tlen); - rakcipher.stat_decrypt_tlen = v64; - v64 = atomic64_read(&alg->sign_cnt); - rakcipher.stat_sign_cnt = v64; - v64 = atomic64_read(&alg->verify_cnt); - rakcipher.stat_verify_cnt = v64; - v64 = atomic64_read(&alg->akcipher_err_cnt); - rakcipher.stat_akcipher_err_cnt = v64; + rakcipher.stat_encrypt_cnt = atomic64_read(&alg->encrypt_cnt); + rakcipher.stat_encrypt_tlen = atomic64_read(&alg->encrypt_tlen); + rakcipher.stat_decrypt_cnt = atomic64_read(&alg->decrypt_cnt); + rakcipher.stat_decrypt_tlen = atomic64_read(&alg->decrypt_tlen); + rakcipher.stat_sign_cnt = atomic64_read(&alg->sign_cnt); + rakcipher.stat_verify_cnt = atomic64_read(&alg->verify_cnt); + rakcipher.stat_akcipher_err_cnt = atomic64_read(&alg->akcipher_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER, sizeof(rakcipher), &rakcipher); @@ -151,20 +119,15 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg) static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat_kpp rkpp; - u64 v; memset(&rkpp, 0, sizeof(rkpp)); strscpy(rkpp.type, "kpp", sizeof(rkpp.type)); - v = atomic64_read(&alg->setsecret_cnt); - rkpp.stat_setsecret_cnt = v; - v = atomic64_read(&alg->generate_public_key_cnt); - rkpp.stat_generate_public_key_cnt = v; - v = atomic64_read(&alg->compute_shared_secret_cnt); - rkpp.stat_compute_shared_secret_cnt = v; - v = atomic64_read(&alg->kpp_err_cnt); - rkpp.stat_kpp_err_cnt = v; + rkpp.stat_setsecret_cnt = atomic64_read(&alg->setsecret_cnt); + rkpp.stat_generate_public_key_cnt = atomic64_read(&alg->generate_public_key_cnt); + rkpp.stat_compute_shared_secret_cnt = atomic64_read(&alg->compute_shared_secret_cnt); + rkpp.stat_kpp_err_cnt = atomic64_read(&alg->kpp_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_KPP, sizeof(rkpp), &rkpp); } @@ -172,18 +135,14 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg) static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat_hash rhash; - u64 v64; memset(&rhash, 0, sizeof(rhash)); strscpy(rhash.type, "ahash", sizeof(rhash.type)); - v64 = atomic64_read(&alg->hash_cnt); - rhash.stat_hash_cnt = v64; - v64 = atomic64_read(&alg->hash_tlen); - rhash.stat_hash_tlen = v64; - v64 = atomic64_read(&alg->hash_err_cnt); - rhash.stat_hash_err_cnt = v64; + rhash.stat_hash_cnt = atomic64_read(&alg->hash_cnt); + rhash.stat_hash_tlen = atomic64_read(&alg->hash_tlen); + rhash.stat_hash_err_cnt = atomic64_read(&alg->hash_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash); } @@ -191,18 +150,14 @@ static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg) static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat_hash rhash; - u64 v64; memset(&rhash, 0, sizeof(rhash)); strscpy(rhash.type, "shash", sizeof(rhash.type)); - v64 = atomic64_read(&alg->hash_cnt); - rhash.stat_hash_cnt = v64; - v64 = atomic64_read(&alg->hash_tlen); - rhash.stat_hash_tlen = v64; - v64 = atomic64_read(&alg->hash_err_cnt); - rhash.stat_hash_err_cnt = v64; + rhash.stat_hash_cnt = atomic64_read(&alg->hash_cnt); + rhash.stat_hash_tlen = atomic64_read(&alg->hash_tlen); + rhash.stat_hash_err_cnt = atomic64_read(&alg->hash_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash); } @@ -210,20 +165,15 @@ static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg) static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_stat_rng rrng; - u64 v64; memset(&rrng, 0, sizeof(rrng)); strscpy(rrng.type, "rng", sizeof(rrng.type)); - v64 = atomic64_read(&alg->generate_cnt); - rrng.stat_generate_cnt = v64; - v64 = atomic64_read(&alg->generate_tlen); - rrng.stat_generate_tlen = v64; - v64 = atomic64_read(&alg->seed_cnt); - rrng.stat_seed_cnt = v64; - v64 = atomic64_read(&alg->rng_err_cnt); - rrng.stat_rng_err_cnt = v64; + rrng.stat_generate_cnt = atomic64_read(&alg->generate_cnt); + rrng.stat_generate_tlen = atomic64_read(&alg->generate_tlen); + rrng.stat_seed_cnt = atomic64_read(&alg->seed_cnt); + rrng.stat_rng_err_cnt = atomic64_read(&alg->rng_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng); } -- cgit v1.2.3-59-g8ed1b From 17c18f9e33282a170458cb5ea20759bfcb0da7d8 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 29 Nov 2018 14:42:24 +0000 Subject: crypto: user - Split stats in multiple structures Like for userspace, this patch splits stats into multiple structures, one for each algorithm class. Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- crypto/algapi.c | 108 +++++++++++++--------------- crypto/crypto_user_stat.c | 82 ++++++++++----------- include/linux/crypto.h | 180 ++++++++++++++++++++++++++++++---------------- 3 files changed, 210 insertions(+), 160 deletions(-) diff --git a/crypto/algapi.c b/crypto/algapi.c index 4c1e6079d271..a8cb5aed0069 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -259,13 +259,7 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg) list_add(&larval->alg.cra_list, &crypto_alg_list); #ifdef CONFIG_CRYPTO_STATS - atomic64_set(&alg->encrypt_cnt, 0); - atomic64_set(&alg->decrypt_cnt, 0); - atomic64_set(&alg->encrypt_tlen, 0); - atomic64_set(&alg->decrypt_tlen, 0); - atomic64_set(&alg->verify_cnt, 0); - atomic64_set(&alg->cipher_err_cnt, 0); - atomic64_set(&alg->sign_cnt, 0); + memset(&alg->stats, 0, sizeof(alg->stats)); #endif out: @@ -1089,10 +1083,10 @@ void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->cipher_err_cnt); + atomic64_inc(&alg->stats.cipher.cipher_err_cnt); } else { - atomic64_inc(&alg->encrypt_cnt); - atomic64_add(nbytes, &alg->encrypt_tlen); + atomic64_inc(&alg->stats.cipher.encrypt_cnt); + atomic64_add(nbytes, &alg->stats.cipher.encrypt_tlen); } crypto_alg_put(alg); } @@ -1102,10 +1096,10 @@ void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->cipher_err_cnt); + atomic64_inc(&alg->stats.cipher.cipher_err_cnt); } else { - atomic64_inc(&alg->decrypt_cnt); - atomic64_add(nbytes, &alg->decrypt_tlen); + atomic64_inc(&alg->stats.cipher.decrypt_cnt); + atomic64_add(nbytes, &alg->stats.cipher.decrypt_tlen); } crypto_alg_put(alg); } @@ -1115,10 +1109,10 @@ void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->aead_err_cnt); + atomic64_inc(&alg->stats.aead.aead_err_cnt); } else { - atomic64_inc(&alg->encrypt_cnt); - atomic64_add(cryptlen, &alg->encrypt_tlen); + atomic64_inc(&alg->stats.aead.encrypt_cnt); + atomic64_add(cryptlen, &alg->stats.aead.encrypt_tlen); } crypto_alg_put(alg); } @@ -1128,10 +1122,10 @@ void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->aead_err_cnt); + atomic64_inc(&alg->stats.aead.aead_err_cnt); } else { - atomic64_inc(&alg->decrypt_cnt); - atomic64_add(cryptlen, &alg->decrypt_tlen); + atomic64_inc(&alg->stats.aead.decrypt_cnt); + atomic64_add(cryptlen, &alg->stats.aead.decrypt_tlen); } crypto_alg_put(alg); } @@ -1141,10 +1135,10 @@ void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->akcipher_err_cnt); + atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt); } else { - atomic64_inc(&alg->encrypt_cnt); - atomic64_add(src_len, &alg->encrypt_tlen); + atomic64_inc(&alg->stats.akcipher.encrypt_cnt); + atomic64_add(src_len, &alg->stats.akcipher.encrypt_tlen); } crypto_alg_put(alg); } @@ -1154,10 +1148,10 @@ void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->akcipher_err_cnt); + atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt); } else { - atomic64_inc(&alg->decrypt_cnt); - atomic64_add(src_len, &alg->decrypt_tlen); + atomic64_inc(&alg->stats.akcipher.decrypt_cnt); + atomic64_add(src_len, &alg->stats.akcipher.decrypt_tlen); } crypto_alg_put(alg); } @@ -1166,9 +1160,9 @@ EXPORT_SYMBOL_GPL(crypto_stats_akcipher_decrypt); void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) - atomic64_inc(&alg->akcipher_err_cnt); + atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt); else - atomic64_inc(&alg->sign_cnt); + atomic64_inc(&alg->stats.akcipher.sign_cnt); crypto_alg_put(alg); } EXPORT_SYMBOL_GPL(crypto_stats_akcipher_sign); @@ -1176,9 +1170,9 @@ EXPORT_SYMBOL_GPL(crypto_stats_akcipher_sign); void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) - atomic64_inc(&alg->akcipher_err_cnt); + atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt); else - atomic64_inc(&alg->verify_cnt); + atomic64_inc(&alg->stats.akcipher.verify_cnt); crypto_alg_put(alg); } EXPORT_SYMBOL_GPL(crypto_stats_akcipher_verify); @@ -1186,10 +1180,10 @@ EXPORT_SYMBOL_GPL(crypto_stats_akcipher_verify); void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->compress_err_cnt); + atomic64_inc(&alg->stats.compress.compress_err_cnt); } else { - atomic64_inc(&alg->compress_cnt); - atomic64_add(slen, &alg->compress_tlen); + atomic64_inc(&alg->stats.compress.compress_cnt); + atomic64_add(slen, &alg->stats.compress.compress_tlen); } crypto_alg_put(alg); } @@ -1198,10 +1192,10 @@ EXPORT_SYMBOL_GPL(crypto_stats_compress); void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->compress_err_cnt); + atomic64_inc(&alg->stats.compress.compress_err_cnt); } else { - atomic64_inc(&alg->decompress_cnt); - atomic64_add(slen, &alg->decompress_tlen); + atomic64_inc(&alg->stats.compress.decompress_cnt); + atomic64_add(slen, &alg->stats.compress.decompress_tlen); } crypto_alg_put(alg); } @@ -1211,9 +1205,9 @@ void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) - atomic64_inc(&alg->hash_err_cnt); + atomic64_inc(&alg->stats.hash.hash_err_cnt); else - atomic64_add(nbytes, &alg->hash_tlen); + atomic64_add(nbytes, &alg->stats.hash.hash_tlen); crypto_alg_put(alg); } EXPORT_SYMBOL_GPL(crypto_stats_ahash_update); @@ -1222,10 +1216,10 @@ void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->hash_err_cnt); + atomic64_inc(&alg->stats.hash.hash_err_cnt); } else { - atomic64_inc(&alg->hash_cnt); - atomic64_add(nbytes, &alg->hash_tlen); + atomic64_inc(&alg->stats.hash.hash_cnt); + atomic64_add(nbytes, &alg->stats.hash.hash_tlen); } crypto_alg_put(alg); } @@ -1234,9 +1228,9 @@ EXPORT_SYMBOL_GPL(crypto_stats_ahash_final); void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret) { if (ret) - atomic64_inc(&alg->kpp_err_cnt); + atomic64_inc(&alg->stats.kpp.kpp_err_cnt); else - atomic64_inc(&alg->setsecret_cnt); + atomic64_inc(&alg->stats.kpp.setsecret_cnt); crypto_alg_put(alg); } EXPORT_SYMBOL_GPL(crypto_stats_kpp_set_secret); @@ -1244,9 +1238,9 @@ EXPORT_SYMBOL_GPL(crypto_stats_kpp_set_secret); void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret) { if (ret) - atomic64_inc(&alg->kpp_err_cnt); + atomic64_inc(&alg->stats.kpp.kpp_err_cnt); else - atomic64_inc(&alg->generate_public_key_cnt); + atomic64_inc(&alg->stats.kpp.generate_public_key_cnt); crypto_alg_put(alg); } EXPORT_SYMBOL_GPL(crypto_stats_kpp_generate_public_key); @@ -1254,9 +1248,9 @@ EXPORT_SYMBOL_GPL(crypto_stats_kpp_generate_public_key); void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret) { if (ret) - atomic64_inc(&alg->kpp_err_cnt); + atomic64_inc(&alg->stats.kpp.kpp_err_cnt); else - atomic64_inc(&alg->compute_shared_secret_cnt); + atomic64_inc(&alg->stats.kpp.compute_shared_secret_cnt); crypto_alg_put(alg); } EXPORT_SYMBOL_GPL(crypto_stats_kpp_compute_shared_secret); @@ -1264,9 +1258,9 @@ EXPORT_SYMBOL_GPL(crypto_stats_kpp_compute_shared_secret); void crypto_stats_rng_seed(struct crypto_alg *alg, int ret) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) - atomic64_inc(&alg->rng_err_cnt); + atomic64_inc(&alg->stats.rng.rng_err_cnt); else - atomic64_inc(&alg->seed_cnt); + atomic64_inc(&alg->stats.rng.seed_cnt); crypto_alg_put(alg); } EXPORT_SYMBOL_GPL(crypto_stats_rng_seed); @@ -1275,10 +1269,10 @@ void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->rng_err_cnt); + atomic64_inc(&alg->stats.rng.rng_err_cnt); } else { - atomic64_inc(&alg->generate_cnt); - atomic64_add(dlen, &alg->generate_tlen); + atomic64_inc(&alg->stats.rng.generate_cnt); + atomic64_add(dlen, &alg->stats.rng.generate_tlen); } crypto_alg_put(alg); } @@ -1288,10 +1282,10 @@ void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->cipher_err_cnt); + atomic64_inc(&alg->stats.cipher.cipher_err_cnt); } else { - atomic64_inc(&alg->encrypt_cnt); - atomic64_add(cryptlen, &alg->encrypt_tlen); + atomic64_inc(&alg->stats.cipher.encrypt_cnt); + atomic64_add(cryptlen, &alg->stats.cipher.encrypt_tlen); } crypto_alg_put(alg); } @@ -1301,10 +1295,10 @@ void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->cipher_err_cnt); + atomic64_inc(&alg->stats.cipher.cipher_err_cnt); } else { - atomic64_inc(&alg->decrypt_cnt); - atomic64_add(cryptlen, &alg->decrypt_tlen); + atomic64_inc(&alg->stats.cipher.decrypt_cnt); + atomic64_add(cryptlen, &alg->stats.cipher.decrypt_tlen); } crypto_alg_put(alg); } diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c index 7b668c659122..113bf1691560 100644 --- a/crypto/crypto_user_stat.c +++ b/crypto/crypto_user_stat.c @@ -39,11 +39,11 @@ static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg) strscpy(raead.type, "aead", sizeof(raead.type)); - raead.stat_encrypt_cnt = atomic64_read(&alg->encrypt_cnt); - raead.stat_encrypt_tlen = atomic64_read(&alg->encrypt_tlen); - raead.stat_decrypt_cnt = atomic64_read(&alg->decrypt_cnt); - raead.stat_decrypt_tlen = atomic64_read(&alg->decrypt_tlen); - raead.stat_aead_err_cnt = atomic64_read(&alg->aead_err_cnt); + raead.stat_encrypt_cnt = atomic64_read(&alg->stats.aead.encrypt_cnt); + raead.stat_encrypt_tlen = atomic64_read(&alg->stats.aead.encrypt_tlen); + raead.stat_decrypt_cnt = atomic64_read(&alg->stats.aead.decrypt_cnt); + raead.stat_decrypt_tlen = atomic64_read(&alg->stats.aead.decrypt_tlen); + raead.stat_aead_err_cnt = atomic64_read(&alg->stats.aead.aead_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_AEAD, sizeof(raead), &raead); } @@ -56,11 +56,11 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg) strscpy(rcipher.type, "cipher", sizeof(rcipher.type)); - rcipher.stat_encrypt_cnt = atomic64_read(&alg->encrypt_cnt); - rcipher.stat_encrypt_tlen = atomic64_read(&alg->encrypt_tlen); - rcipher.stat_decrypt_cnt = atomic64_read(&alg->decrypt_cnt); - rcipher.stat_decrypt_tlen = atomic64_read(&alg->decrypt_tlen); - rcipher.stat_cipher_err_cnt = atomic64_read(&alg->cipher_err_cnt); + rcipher.stat_encrypt_cnt = atomic64_read(&alg->stats.cipher.encrypt_cnt); + rcipher.stat_encrypt_tlen = atomic64_read(&alg->stats.cipher.encrypt_tlen); + rcipher.stat_decrypt_cnt = atomic64_read(&alg->stats.cipher.decrypt_cnt); + rcipher.stat_decrypt_tlen = atomic64_read(&alg->stats.cipher.decrypt_tlen); + rcipher.stat_cipher_err_cnt = atomic64_read(&alg->stats.cipher.cipher_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher); } @@ -72,11 +72,11 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg) memset(&rcomp, 0, sizeof(rcomp)); strscpy(rcomp.type, "compression", sizeof(rcomp.type)); - rcomp.stat_compress_cnt = atomic64_read(&alg->compress_cnt); - rcomp.stat_compress_tlen = atomic64_read(&alg->compress_tlen); - rcomp.stat_decompress_cnt = atomic64_read(&alg->decompress_cnt); - rcomp.stat_decompress_tlen = atomic64_read(&alg->decompress_tlen); - rcomp.stat_compress_err_cnt = atomic64_read(&alg->compress_err_cnt); + rcomp.stat_compress_cnt = atomic64_read(&alg->stats.compress.compress_cnt); + rcomp.stat_compress_tlen = atomic64_read(&alg->stats.compress.compress_tlen); + rcomp.stat_decompress_cnt = atomic64_read(&alg->stats.compress.decompress_cnt); + rcomp.stat_decompress_tlen = atomic64_read(&alg->stats.compress.decompress_tlen); + rcomp.stat_compress_err_cnt = atomic64_read(&alg->stats.compress.compress_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp); } @@ -88,11 +88,11 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg) memset(&racomp, 0, sizeof(racomp)); strscpy(racomp.type, "acomp", sizeof(racomp.type)); - racomp.stat_compress_cnt = atomic64_read(&alg->compress_cnt); - racomp.stat_compress_tlen = atomic64_read(&alg->compress_tlen); - racomp.stat_decompress_cnt = atomic64_read(&alg->decompress_cnt); - racomp.stat_decompress_tlen = atomic64_read(&alg->decompress_tlen); - racomp.stat_compress_err_cnt = atomic64_read(&alg->compress_err_cnt); + racomp.stat_compress_cnt = atomic64_read(&alg->stats.compress.compress_cnt); + racomp.stat_compress_tlen = atomic64_read(&alg->stats.compress.compress_tlen); + racomp.stat_decompress_cnt = atomic64_read(&alg->stats.compress.decompress_cnt); + racomp.stat_decompress_tlen = atomic64_read(&alg->stats.compress.decompress_tlen); + racomp.stat_compress_err_cnt = atomic64_read(&alg->stats.compress.compress_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp); } @@ -104,13 +104,13 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg) memset(&rakcipher, 0, sizeof(rakcipher)); strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type)); - rakcipher.stat_encrypt_cnt = atomic64_read(&alg->encrypt_cnt); - rakcipher.stat_encrypt_tlen = atomic64_read(&alg->encrypt_tlen); - rakcipher.stat_decrypt_cnt = atomic64_read(&alg->decrypt_cnt); - rakcipher.stat_decrypt_tlen = atomic64_read(&alg->decrypt_tlen); - rakcipher.stat_sign_cnt = atomic64_read(&alg->sign_cnt); - rakcipher.stat_verify_cnt = atomic64_read(&alg->verify_cnt); - rakcipher.stat_akcipher_err_cnt = atomic64_read(&alg->akcipher_err_cnt); + rakcipher.stat_encrypt_cnt = atomic64_read(&alg->stats.akcipher.encrypt_cnt); + rakcipher.stat_encrypt_tlen = atomic64_read(&alg->stats.akcipher.encrypt_tlen); + rakcipher.stat_decrypt_cnt = atomic64_read(&alg->stats.akcipher.decrypt_cnt); + rakcipher.stat_decrypt_tlen = atomic64_read(&alg->stats.akcipher.decrypt_tlen); + rakcipher.stat_sign_cnt = atomic64_read(&alg->stats.akcipher.sign_cnt); + rakcipher.stat_verify_cnt = atomic64_read(&alg->stats.akcipher.verify_cnt); + rakcipher.stat_akcipher_err_cnt = atomic64_read(&alg->stats.akcipher.akcipher_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER, sizeof(rakcipher), &rakcipher); @@ -124,10 +124,10 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg) strscpy(rkpp.type, "kpp", sizeof(rkpp.type)); - rkpp.stat_setsecret_cnt = atomic64_read(&alg->setsecret_cnt); - rkpp.stat_generate_public_key_cnt = atomic64_read(&alg->generate_public_key_cnt); - rkpp.stat_compute_shared_secret_cnt = atomic64_read(&alg->compute_shared_secret_cnt); - rkpp.stat_kpp_err_cnt = atomic64_read(&alg->kpp_err_cnt); + rkpp.stat_setsecret_cnt = atomic64_read(&alg->stats.kpp.setsecret_cnt); + rkpp.stat_generate_public_key_cnt = atomic64_read(&alg->stats.kpp.generate_public_key_cnt); + rkpp.stat_compute_shared_secret_cnt = atomic64_read(&alg->stats.kpp.compute_shared_secret_cnt); + rkpp.stat_kpp_err_cnt = atomic64_read(&alg->stats.kpp.kpp_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_KPP, sizeof(rkpp), &rkpp); } @@ -140,9 +140,9 @@ static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg) strscpy(rhash.type, "ahash", sizeof(rhash.type)); - rhash.stat_hash_cnt = atomic64_read(&alg->hash_cnt); - rhash.stat_hash_tlen = atomic64_read(&alg->hash_tlen); - rhash.stat_hash_err_cnt = atomic64_read(&alg->hash_err_cnt); + rhash.stat_hash_cnt = atomic64_read(&alg->stats.hash.hash_cnt); + rhash.stat_hash_tlen = atomic64_read(&alg->stats.hash.hash_tlen); + rhash.stat_hash_err_cnt = atomic64_read(&alg->stats.hash.hash_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash); } @@ -155,9 +155,9 @@ static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg) strscpy(rhash.type, "shash", sizeof(rhash.type)); - rhash.stat_hash_cnt = atomic64_read(&alg->hash_cnt); - rhash.stat_hash_tlen = atomic64_read(&alg->hash_tlen); - rhash.stat_hash_err_cnt = atomic64_read(&alg->hash_err_cnt); + rhash.stat_hash_cnt = atomic64_read(&alg->stats.hash.hash_cnt); + rhash.stat_hash_tlen = atomic64_read(&alg->stats.hash.hash_tlen); + rhash.stat_hash_err_cnt = atomic64_read(&alg->stats.hash.hash_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash); } @@ -170,10 +170,10 @@ static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg) strscpy(rrng.type, "rng", sizeof(rrng.type)); - rrng.stat_generate_cnt = atomic64_read(&alg->generate_cnt); - rrng.stat_generate_tlen = atomic64_read(&alg->generate_tlen); - rrng.stat_seed_cnt = atomic64_read(&alg->seed_cnt); - rrng.stat_rng_err_cnt = atomic64_read(&alg->rng_err_cnt); + rrng.stat_generate_cnt = atomic64_read(&alg->stats.rng.generate_cnt); + rrng.stat_generate_tlen = atomic64_read(&alg->stats.rng.generate_tlen); + rrng.stat_seed_cnt = atomic64_read(&alg->stats.rng.seed_cnt); + rrng.stat_rng_err_cnt = atomic64_read(&alg->stats.rng.rng_err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng); } diff --git a/include/linux/crypto.h b/include/linux/crypto.h index e2fd24714e00..8a46ab35479e 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -369,6 +369,115 @@ struct compress_alg { unsigned int slen, u8 *dst, unsigned int *dlen); }; +#ifdef CONFIG_CRYPTO_STATS +/* + * struct crypto_istat_aead - statistics for AEAD algorithm + * @encrypt_cnt: number of encrypt requests + * @encrypt_tlen: total data size handled by encrypt requests + * @decrypt_cnt: number of decrypt requests + * @decrypt_tlen: total data size handled by decrypt requests + * @aead_err_cnt: number of error for AEAD requests + */ +struct crypto_istat_aead { + atomic64_t encrypt_cnt; + atomic64_t encrypt_tlen; + atomic64_t decrypt_cnt; + atomic64_t decrypt_tlen; + atomic64_t aead_err_cnt; +}; + +/* + * struct crypto_istat_akcipher - statistics for akcipher algorithm + * @encrypt_cnt: number of encrypt requests + * @encrypt_tlen: total data size handled by encrypt requests + * @decrypt_cnt: number of decrypt requests + * @decrypt_tlen: total data size handled by decrypt requests + * @verify_cnt: number of verify operation + * @sign_cnt: number of sign requests + * @akcipher_err_cnt: number of error for akcipher requests + */ +struct crypto_istat_akcipher { + atomic64_t encrypt_cnt; + atomic64_t encrypt_tlen; + atomic64_t decrypt_cnt; + atomic64_t decrypt_tlen; + atomic64_t verify_cnt; + atomic64_t sign_cnt; + atomic64_t akcipher_err_cnt; +}; + +/* + * struct crypto_istat_cipher - statistics for cipher algorithm + * @encrypt_cnt: number of encrypt requests + * @encrypt_tlen: total data size handled by encrypt requests + * @decrypt_cnt: number of decrypt requests + * @decrypt_tlen: total data size handled by decrypt requests + * @cipher_err_cnt: number of error for cipher requests + */ +struct crypto_istat_cipher { + atomic64_t encrypt_cnt; + atomic64_t encrypt_tlen; + atomic64_t decrypt_cnt; + atomic64_t decrypt_tlen; + atomic64_t cipher_err_cnt; +}; + +/* + * struct crypto_istat_compress - statistics for compress algorithm + * @compress_cnt: number of compress requests + * @compress_tlen: total data size handled by compress requests + * @decompress_cnt: number of decompress requests + * @decompress_tlen: total data size handled by decompress requests + * @compress_err_cnt: number of error for compress requests + */ +struct crypto_istat_compress { + atomic64_t compress_cnt; + atomic64_t compress_tlen; + atomic64_t decompress_cnt; + atomic64_t decompress_tlen; + atomic64_t compress_err_cnt; +}; + +/* + * struct crypto_istat_hash - statistics for has algorithm + * @hash_cnt: number of hash requests + * @hash_tlen: total data size hashed + * @hash_err_cnt: number of error for hash requests + */ +struct crypto_istat_hash { + atomic64_t hash_cnt; + atomic64_t hash_tlen; + atomic64_t hash_err_cnt; +}; + +/* + * struct crypto_istat_kpp - statistics for KPP algorithm + * @setsecret_cnt: number of setsecrey operation + * @generate_public_key_cnt: number of generate_public_key operation + * @compute_shared_secret_cnt: number of compute_shared_secret operation + * @kpp_err_cnt: number of error for KPP requests + */ +struct crypto_istat_kpp { + atomic64_t setsecret_cnt; + atomic64_t generate_public_key_cnt; + atomic64_t compute_shared_secret_cnt; + atomic64_t kpp_err_cnt; +}; + +/* + * struct crypto_istat_rng: statistics for RNG algorithm + * @generate_cnt: number of RNG generate requests + * @generate_tlen: total data size of generated data by the RNG + * @seed_cnt: number of times the RNG was seeded + * @rng_err_cnt: number of error for RNG requests + */ +struct crypto_istat_rng { + atomic64_t generate_cnt; + atomic64_t generate_tlen; + atomic64_t seed_cnt; + atomic64_t rng_err_cnt; +}; +#endif /* CONFIG_CRYPTO_STATS */ #define cra_ablkcipher cra_u.ablkcipher #define cra_blkcipher cra_u.blkcipher @@ -454,32 +563,7 @@ struct compress_alg { * @cra_refcnt: internally used * @cra_destroy: internally used * - * All following statistics are for this crypto_alg - * @encrypt_cnt: number of encrypt requests - * @decrypt_cnt: number of decrypt requests - * @compress_cnt: number of compress requests - * @decompress_cnt: number of decompress requests - * @generate_cnt: number of RNG generate requests - * @seed_cnt: number of times the rng was seeded - * @hash_cnt: number of hash requests - * @sign_cnt: number of sign requests - * @setsecret_cnt: number of setsecrey operation - * @generate_public_key_cnt: number of generate_public_key operation - * @verify_cnt: number of verify operation - * @compute_shared_secret_cnt: number of compute_shared_secret operation - * @encrypt_tlen: total data size handled by encrypt requests - * @decrypt_tlen: total data size handled by decrypt requests - * @compress_tlen: total data size handled by compress requests - * @decompress_tlen: total data size handled by decompress requests - * @generate_tlen: total data size of generated data by the RNG - * @hash_tlen: total data size hashed - * @akcipher_err_cnt: number of error for akcipher requests - * @cipher_err_cnt: number of error for akcipher requests - * @compress_err_cnt: number of error for akcipher requests - * @aead_err_cnt: number of error for akcipher requests - * @hash_err_cnt: number of error for akcipher requests - * @rng_err_cnt: number of error for akcipher requests - * @kpp_err_cnt: number of error for akcipher requests + * @stats: union of all possible crypto_istat_xxx structures * * The struct crypto_alg describes a generic Crypto API algorithm and is common * for all of the transformations. Any variable not documented here shall not @@ -517,42 +601,14 @@ struct crypto_alg { #ifdef CONFIG_CRYPTO_STATS union { - atomic64_t encrypt_cnt; - atomic64_t compress_cnt; - atomic64_t generate_cnt; - atomic64_t hash_cnt; - atomic64_t setsecret_cnt; - }; - union { - atomic64_t encrypt_tlen; - atomic64_t compress_tlen; - atomic64_t generate_tlen; - atomic64_t hash_tlen; - }; - union { - atomic64_t akcipher_err_cnt; - atomic64_t cipher_err_cnt; - atomic64_t compress_err_cnt; - atomic64_t aead_err_cnt; - atomic64_t hash_err_cnt; - atomic64_t rng_err_cnt; - atomic64_t kpp_err_cnt; - }; - union { - atomic64_t decrypt_cnt; - atomic64_t decompress_cnt; - atomic64_t seed_cnt; - atomic64_t generate_public_key_cnt; - }; - union { - atomic64_t decrypt_tlen; - atomic64_t decompress_tlen; - }; - union { - atomic64_t verify_cnt; - atomic64_t compute_shared_secret_cnt; - }; - atomic64_t sign_cnt; + struct crypto_istat_aead aead; + struct crypto_istat_akcipher akcipher; + struct crypto_istat_cipher cipher; + struct crypto_istat_compress compress; + struct crypto_istat_hash hash; + struct crypto_istat_rng rng; + struct crypto_istat_kpp kpp; + } stats; #endif /* CONFIG_CRYPTO_STATS */ } CRYPTO_MINALIGN_ATTR; -- cgit v1.2.3-59-g8ed1b From 44f13133cb03ec32fc88a533673248ef5c0617e3 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 29 Nov 2018 14:42:25 +0000 Subject: crypto: user - rename err_cnt parameter Since now all crypto stats are on their own structures, it is now useless to have the algorithm name in the err_cnt member. Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- crypto/algapi.c | 38 +++++++++++++++++++------------------- crypto/crypto_user_stat.c | 18 +++++++++--------- include/linux/crypto.h | 28 ++++++++++++++-------------- include/uapi/linux/cryptouser.h | 14 +++++++------- tools/crypto/getstat.c | 18 +++++++++--------- 5 files changed, 58 insertions(+), 58 deletions(-) diff --git a/crypto/algapi.c b/crypto/algapi.c index a8cb5aed0069..c0d4f9ef6b0f 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -1083,7 +1083,7 @@ void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->stats.cipher.cipher_err_cnt); + atomic64_inc(&alg->stats.cipher.err_cnt); } else { atomic64_inc(&alg->stats.cipher.encrypt_cnt); atomic64_add(nbytes, &alg->stats.cipher.encrypt_tlen); @@ -1096,7 +1096,7 @@ void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->stats.cipher.cipher_err_cnt); + atomic64_inc(&alg->stats.cipher.err_cnt); } else { atomic64_inc(&alg->stats.cipher.decrypt_cnt); atomic64_add(nbytes, &alg->stats.cipher.decrypt_tlen); @@ -1109,7 +1109,7 @@ void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->stats.aead.aead_err_cnt); + atomic64_inc(&alg->stats.aead.err_cnt); } else { atomic64_inc(&alg->stats.aead.encrypt_cnt); atomic64_add(cryptlen, &alg->stats.aead.encrypt_tlen); @@ -1122,7 +1122,7 @@ void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->stats.aead.aead_err_cnt); + atomic64_inc(&alg->stats.aead.err_cnt); } else { atomic64_inc(&alg->stats.aead.decrypt_cnt); atomic64_add(cryptlen, &alg->stats.aead.decrypt_tlen); @@ -1135,7 +1135,7 @@ void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt); + atomic64_inc(&alg->stats.akcipher.err_cnt); } else { atomic64_inc(&alg->stats.akcipher.encrypt_cnt); atomic64_add(src_len, &alg->stats.akcipher.encrypt_tlen); @@ -1148,7 +1148,7 @@ void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt); + atomic64_inc(&alg->stats.akcipher.err_cnt); } else { atomic64_inc(&alg->stats.akcipher.decrypt_cnt); atomic64_add(src_len, &alg->stats.akcipher.decrypt_tlen); @@ -1160,7 +1160,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_akcipher_decrypt); void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) - atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt); + atomic64_inc(&alg->stats.akcipher.err_cnt); else atomic64_inc(&alg->stats.akcipher.sign_cnt); crypto_alg_put(alg); @@ -1170,7 +1170,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_akcipher_sign); void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) - atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt); + atomic64_inc(&alg->stats.akcipher.err_cnt); else atomic64_inc(&alg->stats.akcipher.verify_cnt); crypto_alg_put(alg); @@ -1180,7 +1180,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_akcipher_verify); void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->stats.compress.compress_err_cnt); + atomic64_inc(&alg->stats.compress.err_cnt); } else { atomic64_inc(&alg->stats.compress.compress_cnt); atomic64_add(slen, &alg->stats.compress.compress_tlen); @@ -1192,7 +1192,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_compress); void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->stats.compress.compress_err_cnt); + atomic64_inc(&alg->stats.compress.err_cnt); } else { atomic64_inc(&alg->stats.compress.decompress_cnt); atomic64_add(slen, &alg->stats.compress.decompress_tlen); @@ -1205,7 +1205,7 @@ void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) - atomic64_inc(&alg->stats.hash.hash_err_cnt); + atomic64_inc(&alg->stats.hash.err_cnt); else atomic64_add(nbytes, &alg->stats.hash.hash_tlen); crypto_alg_put(alg); @@ -1216,7 +1216,7 @@ void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->stats.hash.hash_err_cnt); + atomic64_inc(&alg->stats.hash.err_cnt); } else { atomic64_inc(&alg->stats.hash.hash_cnt); atomic64_add(nbytes, &alg->stats.hash.hash_tlen); @@ -1228,7 +1228,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_ahash_final); void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret) { if (ret) - atomic64_inc(&alg->stats.kpp.kpp_err_cnt); + atomic64_inc(&alg->stats.kpp.err_cnt); else atomic64_inc(&alg->stats.kpp.setsecret_cnt); crypto_alg_put(alg); @@ -1238,7 +1238,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_kpp_set_secret); void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret) { if (ret) - atomic64_inc(&alg->stats.kpp.kpp_err_cnt); + atomic64_inc(&alg->stats.kpp.err_cnt); else atomic64_inc(&alg->stats.kpp.generate_public_key_cnt); crypto_alg_put(alg); @@ -1248,7 +1248,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_kpp_generate_public_key); void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret) { if (ret) - atomic64_inc(&alg->stats.kpp.kpp_err_cnt); + atomic64_inc(&alg->stats.kpp.err_cnt); else atomic64_inc(&alg->stats.kpp.compute_shared_secret_cnt); crypto_alg_put(alg); @@ -1258,7 +1258,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_kpp_compute_shared_secret); void crypto_stats_rng_seed(struct crypto_alg *alg, int ret) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) - atomic64_inc(&alg->stats.rng.rng_err_cnt); + atomic64_inc(&alg->stats.rng.err_cnt); else atomic64_inc(&alg->stats.rng.seed_cnt); crypto_alg_put(alg); @@ -1269,7 +1269,7 @@ void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->stats.rng.rng_err_cnt); + atomic64_inc(&alg->stats.rng.err_cnt); } else { atomic64_inc(&alg->stats.rng.generate_cnt); atomic64_add(dlen, &alg->stats.rng.generate_tlen); @@ -1282,7 +1282,7 @@ void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->stats.cipher.cipher_err_cnt); + atomic64_inc(&alg->stats.cipher.err_cnt); } else { atomic64_inc(&alg->stats.cipher.encrypt_cnt); atomic64_add(cryptlen, &alg->stats.cipher.encrypt_tlen); @@ -1295,7 +1295,7 @@ void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg) { if (ret && ret != -EINPROGRESS && ret != -EBUSY) { - atomic64_inc(&alg->stats.cipher.cipher_err_cnt); + atomic64_inc(&alg->stats.cipher.err_cnt); } else { atomic64_inc(&alg->stats.cipher.decrypt_cnt); atomic64_add(cryptlen, &alg->stats.cipher.decrypt_tlen); diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c index 113bf1691560..0ba00aaeb810 100644 --- a/crypto/crypto_user_stat.c +++ b/crypto/crypto_user_stat.c @@ -43,7 +43,7 @@ static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg) raead.stat_encrypt_tlen = atomic64_read(&alg->stats.aead.encrypt_tlen); raead.stat_decrypt_cnt = atomic64_read(&alg->stats.aead.decrypt_cnt); raead.stat_decrypt_tlen = atomic64_read(&alg->stats.aead.decrypt_tlen); - raead.stat_aead_err_cnt = atomic64_read(&alg->stats.aead.aead_err_cnt); + raead.stat_err_cnt = atomic64_read(&alg->stats.aead.err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_AEAD, sizeof(raead), &raead); } @@ -60,7 +60,7 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg) rcipher.stat_encrypt_tlen = atomic64_read(&alg->stats.cipher.encrypt_tlen); rcipher.stat_decrypt_cnt = atomic64_read(&alg->stats.cipher.decrypt_cnt); rcipher.stat_decrypt_tlen = atomic64_read(&alg->stats.cipher.decrypt_tlen); - rcipher.stat_cipher_err_cnt = atomic64_read(&alg->stats.cipher.cipher_err_cnt); + rcipher.stat_err_cnt = atomic64_read(&alg->stats.cipher.err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher); } @@ -76,7 +76,7 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg) rcomp.stat_compress_tlen = atomic64_read(&alg->stats.compress.compress_tlen); rcomp.stat_decompress_cnt = atomic64_read(&alg->stats.compress.decompress_cnt); rcomp.stat_decompress_tlen = atomic64_read(&alg->stats.compress.decompress_tlen); - rcomp.stat_compress_err_cnt = atomic64_read(&alg->stats.compress.compress_err_cnt); + rcomp.stat_err_cnt = atomic64_read(&alg->stats.compress.err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp); } @@ -92,7 +92,7 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg) racomp.stat_compress_tlen = atomic64_read(&alg->stats.compress.compress_tlen); racomp.stat_decompress_cnt = atomic64_read(&alg->stats.compress.decompress_cnt); racomp.stat_decompress_tlen = atomic64_read(&alg->stats.compress.decompress_tlen); - racomp.stat_compress_err_cnt = atomic64_read(&alg->stats.compress.compress_err_cnt); + racomp.stat_err_cnt = atomic64_read(&alg->stats.compress.err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp); } @@ -110,7 +110,7 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg) rakcipher.stat_decrypt_tlen = atomic64_read(&alg->stats.akcipher.decrypt_tlen); rakcipher.stat_sign_cnt = atomic64_read(&alg->stats.akcipher.sign_cnt); rakcipher.stat_verify_cnt = atomic64_read(&alg->stats.akcipher.verify_cnt); - rakcipher.stat_akcipher_err_cnt = atomic64_read(&alg->stats.akcipher.akcipher_err_cnt); + rakcipher.stat_err_cnt = atomic64_read(&alg->stats.akcipher.err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER, sizeof(rakcipher), &rakcipher); @@ -127,7 +127,7 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg) rkpp.stat_setsecret_cnt = atomic64_read(&alg->stats.kpp.setsecret_cnt); rkpp.stat_generate_public_key_cnt = atomic64_read(&alg->stats.kpp.generate_public_key_cnt); rkpp.stat_compute_shared_secret_cnt = atomic64_read(&alg->stats.kpp.compute_shared_secret_cnt); - rkpp.stat_kpp_err_cnt = atomic64_read(&alg->stats.kpp.kpp_err_cnt); + rkpp.stat_err_cnt = atomic64_read(&alg->stats.kpp.err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_KPP, sizeof(rkpp), &rkpp); } @@ -142,7 +142,7 @@ static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg) rhash.stat_hash_cnt = atomic64_read(&alg->stats.hash.hash_cnt); rhash.stat_hash_tlen = atomic64_read(&alg->stats.hash.hash_tlen); - rhash.stat_hash_err_cnt = atomic64_read(&alg->stats.hash.hash_err_cnt); + rhash.stat_err_cnt = atomic64_read(&alg->stats.hash.err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash); } @@ -157,7 +157,7 @@ static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg) rhash.stat_hash_cnt = atomic64_read(&alg->stats.hash.hash_cnt); rhash.stat_hash_tlen = atomic64_read(&alg->stats.hash.hash_tlen); - rhash.stat_hash_err_cnt = atomic64_read(&alg->stats.hash.hash_err_cnt); + rhash.stat_err_cnt = atomic64_read(&alg->stats.hash.err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash); } @@ -173,7 +173,7 @@ static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg) rrng.stat_generate_cnt = atomic64_read(&alg->stats.rng.generate_cnt); rrng.stat_generate_tlen = atomic64_read(&alg->stats.rng.generate_tlen); rrng.stat_seed_cnt = atomic64_read(&alg->stats.rng.seed_cnt); - rrng.stat_rng_err_cnt = atomic64_read(&alg->stats.rng.rng_err_cnt); + rrng.stat_err_cnt = atomic64_read(&alg->stats.rng.err_cnt); return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng); } diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 8a46ab35479e..a2967c1a08b1 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -376,14 +376,14 @@ struct compress_alg { * @encrypt_tlen: total data size handled by encrypt requests * @decrypt_cnt: number of decrypt requests * @decrypt_tlen: total data size handled by decrypt requests - * @aead_err_cnt: number of error for AEAD requests + * @err_cnt: number of error for AEAD requests */ struct crypto_istat_aead { atomic64_t encrypt_cnt; atomic64_t encrypt_tlen; atomic64_t decrypt_cnt; atomic64_t decrypt_tlen; - atomic64_t aead_err_cnt; + atomic64_t err_cnt; }; /* @@ -394,7 +394,7 @@ struct crypto_istat_aead { * @decrypt_tlen: total data size handled by decrypt requests * @verify_cnt: number of verify operation * @sign_cnt: number of sign requests - * @akcipher_err_cnt: number of error for akcipher requests + * @err_cnt: number of error for akcipher requests */ struct crypto_istat_akcipher { atomic64_t encrypt_cnt; @@ -403,7 +403,7 @@ struct crypto_istat_akcipher { atomic64_t decrypt_tlen; atomic64_t verify_cnt; atomic64_t sign_cnt; - atomic64_t akcipher_err_cnt; + atomic64_t err_cnt; }; /* @@ -412,14 +412,14 @@ struct crypto_istat_akcipher { * @encrypt_tlen: total data size handled by encrypt requests * @decrypt_cnt: number of decrypt requests * @decrypt_tlen: total data size handled by decrypt requests - * @cipher_err_cnt: number of error for cipher requests + * @err_cnt: number of error for cipher requests */ struct crypto_istat_cipher { atomic64_t encrypt_cnt; atomic64_t encrypt_tlen; atomic64_t decrypt_cnt; atomic64_t decrypt_tlen; - atomic64_t cipher_err_cnt; + atomic64_t err_cnt; }; /* @@ -428,26 +428,26 @@ struct crypto_istat_cipher { * @compress_tlen: total data size handled by compress requests * @decompress_cnt: number of decompress requests * @decompress_tlen: total data size handled by decompress requests - * @compress_err_cnt: number of error for compress requests + * @err_cnt: number of error for compress requests */ struct crypto_istat_compress { atomic64_t compress_cnt; atomic64_t compress_tlen; atomic64_t decompress_cnt; atomic64_t decompress_tlen; - atomic64_t compress_err_cnt; + atomic64_t err_cnt; }; /* * struct crypto_istat_hash - statistics for has algorithm * @hash_cnt: number of hash requests * @hash_tlen: total data size hashed - * @hash_err_cnt: number of error for hash requests + * @err_cnt: number of error for hash requests */ struct crypto_istat_hash { atomic64_t hash_cnt; atomic64_t hash_tlen; - atomic64_t hash_err_cnt; + atomic64_t err_cnt; }; /* @@ -455,13 +455,13 @@ struct crypto_istat_hash { * @setsecret_cnt: number of setsecrey operation * @generate_public_key_cnt: number of generate_public_key operation * @compute_shared_secret_cnt: number of compute_shared_secret operation - * @kpp_err_cnt: number of error for KPP requests + * @err_cnt: number of error for KPP requests */ struct crypto_istat_kpp { atomic64_t setsecret_cnt; atomic64_t generate_public_key_cnt; atomic64_t compute_shared_secret_cnt; - atomic64_t kpp_err_cnt; + atomic64_t err_cnt; }; /* @@ -469,13 +469,13 @@ struct crypto_istat_kpp { * @generate_cnt: number of RNG generate requests * @generate_tlen: total data size of generated data by the RNG * @seed_cnt: number of times the RNG was seeded - * @rng_err_cnt: number of error for RNG requests + * @err_cnt: number of error for RNG requests */ struct crypto_istat_rng { atomic64_t generate_cnt; atomic64_t generate_tlen; atomic64_t seed_cnt; - atomic64_t rng_err_cnt; + atomic64_t err_cnt; }; #endif /* CONFIG_CRYPTO_STATS */ diff --git a/include/uapi/linux/cryptouser.h b/include/uapi/linux/cryptouser.h index 3a70f025e27d..4dc1603919ce 100644 --- a/include/uapi/linux/cryptouser.h +++ b/include/uapi/linux/cryptouser.h @@ -82,7 +82,7 @@ struct crypto_stat_aead { __u64 stat_encrypt_tlen; __u64 stat_decrypt_cnt; __u64 stat_decrypt_tlen; - __u64 stat_aead_err_cnt; + __u64 stat_err_cnt; }; struct crypto_stat_akcipher { @@ -93,7 +93,7 @@ struct crypto_stat_akcipher { __u64 stat_decrypt_tlen; __u64 stat_verify_cnt; __u64 stat_sign_cnt; - __u64 stat_akcipher_err_cnt; + __u64 stat_err_cnt; }; struct crypto_stat_cipher { @@ -102,7 +102,7 @@ struct crypto_stat_cipher { __u64 stat_encrypt_tlen; __u64 stat_decrypt_cnt; __u64 stat_decrypt_tlen; - __u64 stat_cipher_err_cnt; + __u64 stat_err_cnt; }; struct crypto_stat_compress { @@ -111,14 +111,14 @@ struct crypto_stat_compress { __u64 stat_compress_tlen; __u64 stat_decompress_cnt; __u64 stat_decompress_tlen; - __u64 stat_compress_err_cnt; + __u64 stat_err_cnt; }; struct crypto_stat_hash { char type[CRYPTO_MAX_NAME]; __u64 stat_hash_cnt; __u64 stat_hash_tlen; - __u64 stat_hash_err_cnt; + __u64 stat_err_cnt; }; struct crypto_stat_kpp { @@ -126,7 +126,7 @@ struct crypto_stat_kpp { __u64 stat_setsecret_cnt; __u64 stat_generate_public_key_cnt; __u64 stat_compute_shared_secret_cnt; - __u64 stat_kpp_err_cnt; + __u64 stat_err_cnt; }; struct crypto_stat_rng { @@ -134,7 +134,7 @@ struct crypto_stat_rng { __u64 stat_generate_cnt; __u64 stat_generate_tlen; __u64 stat_seed_cnt; - __u64 stat_rng_err_cnt; + __u64 stat_err_cnt; }; struct crypto_stat_larval { diff --git a/tools/crypto/getstat.c b/tools/crypto/getstat.c index 57fbb94608d4..9e8ff76420fa 100644 --- a/tools/crypto/getstat.c +++ b/tools/crypto/getstat.c @@ -157,7 +157,7 @@ static int get_stat(const char *drivername) printf("%s\tHash\n\tHash: %llu bytes: %llu\n\tErrors: %llu\n", drivername, rhash->stat_hash_cnt, rhash->stat_hash_tlen, - rhash->stat_hash_err_cnt); + rhash->stat_err_cnt); } else if (tb[CRYPTOCFGA_STAT_COMPRESS]) { struct rtattr *rta = tb[CRYPTOCFGA_STAT_COMPRESS]; struct crypto_stat_compress *rblk = @@ -166,7 +166,7 @@ static int get_stat(const char *drivername) drivername, rblk->stat_compress_cnt, rblk->stat_compress_tlen, rblk->stat_decompress_cnt, rblk->stat_decompress_tlen, - rblk->stat_compress_err_cnt); + rblk->stat_err_cnt); } else if (tb[CRYPTOCFGA_STAT_ACOMP]) { struct rtattr *rta = tb[CRYPTOCFGA_STAT_ACOMP]; struct crypto_stat_compress *rcomp = @@ -175,7 +175,7 @@ static int get_stat(const char *drivername) drivername, rcomp->stat_compress_cnt, rcomp->stat_compress_tlen, rcomp->stat_decompress_cnt, rcomp->stat_decompress_tlen, - rcomp->stat_compress_err_cnt); + rcomp->stat_err_cnt); } else if (tb[CRYPTOCFGA_STAT_AEAD]) { struct rtattr *rta = tb[CRYPTOCFGA_STAT_AEAD]; struct crypto_stat_aead *raead = @@ -184,7 +184,7 @@ static int get_stat(const char *drivername) drivername, raead->stat_encrypt_cnt, raead->stat_encrypt_tlen, raead->stat_decrypt_cnt, raead->stat_decrypt_tlen, - raead->stat_aead_err_cnt); + raead->stat_err_cnt); } else if (tb[CRYPTOCFGA_STAT_BLKCIPHER]) { struct rtattr *rta = tb[CRYPTOCFGA_STAT_BLKCIPHER]; struct crypto_stat_cipher *rblk = @@ -193,7 +193,7 @@ static int get_stat(const char *drivername) drivername, rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen, rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen, - rblk->stat_cipher_err_cnt); + rblk->stat_err_cnt); } else if (tb[CRYPTOCFGA_STAT_AKCIPHER]) { struct rtattr *rta = tb[CRYPTOCFGA_STAT_AKCIPHER]; struct crypto_stat_akcipher *rblk = @@ -203,7 +203,7 @@ static int get_stat(const char *drivername) rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen, rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen, rblk->stat_sign_cnt, rblk->stat_verify_cnt, - rblk->stat_akcipher_err_cnt); + rblk->stat_err_cnt); } else if (tb[CRYPTOCFGA_STAT_CIPHER]) { struct rtattr *rta = tb[CRYPTOCFGA_STAT_CIPHER]; struct crypto_stat_cipher *rblk = @@ -212,7 +212,7 @@ static int get_stat(const char *drivername) drivername, rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen, rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen, - rblk->stat_cipher_err_cnt); + rblk->stat_err_cnt); } else if (tb[CRYPTOCFGA_STAT_RNG]) { struct rtattr *rta = tb[CRYPTOCFGA_STAT_RNG]; struct crypto_stat_rng *rrng = @@ -221,7 +221,7 @@ static int get_stat(const char *drivername) drivername, rrng->stat_seed_cnt, rrng->stat_generate_cnt, rrng->stat_generate_tlen, - rrng->stat_rng_err_cnt); + rrng->stat_err_cnt); } else if (tb[CRYPTOCFGA_STAT_KPP]) { struct rtattr *rta = tb[CRYPTOCFGA_STAT_KPP]; struct crypto_stat_kpp *rkpp = @@ -231,7 +231,7 @@ static int get_stat(const char *drivername) rkpp->stat_setsecret_cnt, rkpp->stat_generate_public_key_cnt, rkpp->stat_compute_shared_secret_cnt, - rkpp->stat_kpp_err_cnt); + rkpp->stat_err_cnt); } else { fprintf(stderr, "%s is of an unknown algorithm\n", drivername); } -- cgit v1.2.3-59-g8ed1b From 1f6669b9716c6c98391b0f756e060892b32b8ca7 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 29 Nov 2018 14:42:26 +0000 Subject: crypto: user - Add crypto_stats_init This patch add the crypto_stats_init() function. This will permit to remove some ifdef from __crypto_register_alg(). Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- crypto/algapi.c | 10 +++++++--- include/linux/crypto.h | 3 +++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/crypto/algapi.c b/crypto/algapi.c index c0d4f9ef6b0f..8b65ada33e5d 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -258,9 +258,7 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg) list_add(&alg->cra_list, &crypto_alg_list); list_add(&larval->alg.cra_list, &crypto_alg_list); -#ifdef CONFIG_CRYPTO_STATS - memset(&alg->stats, 0, sizeof(alg->stats)); -#endif + crypto_stats_init(alg); out: return larval; @@ -1073,6 +1071,12 @@ int crypto_type_has_alg(const char *name, const struct crypto_type *frontend, EXPORT_SYMBOL_GPL(crypto_type_has_alg); #ifdef CONFIG_CRYPTO_STATS +void crypto_stats_init(struct crypto_alg *alg) +{ + memset(&alg->stats, 0, sizeof(alg->stats)); +} +EXPORT_SYMBOL_GPL(crypto_stats_init); + void crypto_stats_get(struct crypto_alg *alg) { crypto_alg_get(alg); diff --git a/include/linux/crypto.h b/include/linux/crypto.h index a2967c1a08b1..9850b41e38ae 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -614,6 +614,7 @@ struct crypto_alg { } CRYPTO_MINALIGN_ATTR; #ifdef CONFIG_CRYPTO_STATS +void crypto_stats_init(struct crypto_alg *alg); void crypto_stats_get(struct crypto_alg *alg); void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret, struct crypto_alg *alg); void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret, struct crypto_alg *alg); @@ -635,6 +636,8 @@ void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int re void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg); void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg); #else +static inline void crypto_stats_init(struct crypto_alg *alg) +{} static inline void crypto_stats_get(struct crypto_alg *alg) {} static inline void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret, struct crypto_alg *alg) -- cgit v1.2.3-59-g8ed1b From c35828ea906a7c76632a0211e59c392903cd4615 Mon Sep 17 00:00:00 2001 From: Atul Gupta Date: Fri, 30 Nov 2018 14:31:48 +0530 Subject: crypto: chcr - small packet Tx stalls the queue Immediate packets sent to hardware should include the work request length in calculating the flits. WR occupy one flit and if not accounted result in invalid request which stalls the HW queue. Cc: stable@vger.kernel.org Signed-off-by: Atul Gupta Signed-off-by: Herbert Xu --- drivers/crypto/chelsio/chcr_ipsec.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/chelsio/chcr_ipsec.c b/drivers/crypto/chelsio/chcr_ipsec.c index 461b97e2f1fd..1ff8738631a3 100644 --- a/drivers/crypto/chelsio/chcr_ipsec.c +++ b/drivers/crypto/chelsio/chcr_ipsec.c @@ -303,7 +303,10 @@ static bool chcr_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x) static inline int is_eth_imm(const struct sk_buff *skb, unsigned int kctx_len) { - int hdrlen = sizeof(struct chcr_ipsec_req) + kctx_len; + int hdrlen; + + hdrlen = sizeof(struct fw_ulptx_wr) + + sizeof(struct chcr_ipsec_req) + kctx_len; hdrlen += sizeof(struct cpl_tx_pkt); if (skb->len <= MAX_IMM_TX_PKT_LEN - hdrlen) -- cgit v1.2.3-59-g8ed1b From 8362ea16f69fe59c4d012f0748e586ad09391f41 Mon Sep 17 00:00:00 2001 From: Atul Gupta Date: Fri, 30 Nov 2018 14:32:09 +0530 Subject: crypto: chcr - ESN for Inline IPSec Tx Send SPI, 64b seq nos and 64b IV with aadiv drop for inline crypto. This information is added in outgoing packet after the CPL TX PKT XT and removed by hardware. The aad, auth and cipher offsets are then adjusted for ESN enabled tunnel. Signed-off-by: Atul Gupta Signed-off-by: Herbert Xu --- drivers/crypto/chelsio/chcr_core.h | 9 ++ drivers/crypto/chelsio/chcr_ipsec.c | 175 ++++++++++++++++++++++++++++-------- 2 files changed, 148 insertions(+), 36 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_core.h b/drivers/crypto/chelsio/chcr_core.h index de3a9c085daf..4616663e25d7 100644 --- a/drivers/crypto/chelsio/chcr_core.h +++ b/drivers/crypto/chelsio/chcr_core.h @@ -159,8 +159,17 @@ struct chcr_ipsec_wr { struct chcr_ipsec_req req; }; +#define ESN_IV_INSERT_OFFSET 12 +struct chcr_ipsec_aadiv { + __be32 spi; + u8 seq_no[8]; + u8 iv[8]; +}; + struct ipsec_sa_entry { int hmac_ctrl; + u16 esn; + u16 imm; unsigned int enckey_len; unsigned int kctx_len; unsigned int authsize; diff --git a/drivers/crypto/chelsio/chcr_ipsec.c b/drivers/crypto/chelsio/chcr_ipsec.c index 1ff8738631a3..9321d2b1a5ab 100644 --- a/drivers/crypto/chelsio/chcr_ipsec.c +++ b/drivers/crypto/chelsio/chcr_ipsec.c @@ -76,12 +76,14 @@ static int chcr_xfrm_add_state(struct xfrm_state *x); static void chcr_xfrm_del_state(struct xfrm_state *x); static void chcr_xfrm_free_state(struct xfrm_state *x); static bool chcr_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x); +static void chcr_advance_esn_state(struct xfrm_state *x); static const struct xfrmdev_ops chcr_xfrmdev_ops = { .xdo_dev_state_add = chcr_xfrm_add_state, .xdo_dev_state_delete = chcr_xfrm_del_state, .xdo_dev_state_free = chcr_xfrm_free_state, .xdo_dev_offload_ok = chcr_ipsec_offload_ok, + .xdo_dev_state_advance_esn = chcr_advance_esn_state, }; /* Add offload xfrms to Chelsio Interface */ @@ -210,10 +212,6 @@ static int chcr_xfrm_add_state(struct xfrm_state *x) pr_debug("CHCR: Cannot offload compressed xfrm states\n"); return -EINVAL; } - if (x->props.flags & XFRM_STATE_ESN) { - pr_debug("CHCR: Cannot offload ESN xfrm states\n"); - return -EINVAL; - } if (x->props.family != AF_INET && x->props.family != AF_INET6) { pr_debug("CHCR: Only IPv4/6 xfrm state offloaded\n"); @@ -266,6 +264,8 @@ static int chcr_xfrm_add_state(struct xfrm_state *x) } sa_entry->hmac_ctrl = chcr_ipsec_setauthsize(x, sa_entry); + if (x->props.flags & XFRM_STATE_ESN) + sa_entry->esn = 1; chcr_ipsec_setkey(x, sa_entry); x->xso.offload_handle = (unsigned long)sa_entry; try_module_get(THIS_MODULE); @@ -294,31 +294,57 @@ static void chcr_xfrm_free_state(struct xfrm_state *x) static bool chcr_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { - /* Offload with IP options is not supported yet */ - if (ip_hdr(skb)->ihl > 5) - return false; - + if (x->props.family == AF_INET) { + /* Offload with IP options is not supported yet */ + if (ip_hdr(skb)->ihl > 5) + return false; + } else { + /* Offload with IPv6 extension headers is not support yet */ + if (ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr)) + return false; + } return true; } -static inline int is_eth_imm(const struct sk_buff *skb, unsigned int kctx_len) +static void chcr_advance_esn_state(struct xfrm_state *x) +{ + /* do nothing */ + if (!x->xso.offload_handle) + return; +} + +static inline int is_eth_imm(const struct sk_buff *skb, + struct ipsec_sa_entry *sa_entry) { + unsigned int kctx_len; int hdrlen; + kctx_len = sa_entry->kctx_len; hdrlen = sizeof(struct fw_ulptx_wr) + sizeof(struct chcr_ipsec_req) + kctx_len; hdrlen += sizeof(struct cpl_tx_pkt); + if (sa_entry->esn) + hdrlen += (DIV_ROUND_UP(sizeof(struct chcr_ipsec_aadiv), 16) + << 4); if (skb->len <= MAX_IMM_TX_PKT_LEN - hdrlen) return hdrlen; return 0; } static inline unsigned int calc_tx_sec_flits(const struct sk_buff *skb, - unsigned int kctx_len) + struct ipsec_sa_entry *sa_entry) { + unsigned int kctx_len; unsigned int flits; - int hdrlen = is_eth_imm(skb, kctx_len); + int aadivlen; + int hdrlen; + + kctx_len = sa_entry->kctx_len; + hdrlen = is_eth_imm(skb, sa_entry); + aadivlen = sa_entry->esn ? DIV_ROUND_UP(sizeof(struct chcr_ipsec_aadiv), + 16) : 0; + aadivlen <<= 4; /* If the skb is small enough, we can pump it out as a work request * with only immediate data. In that case we just have to have the @@ -341,13 +367,69 @@ static inline unsigned int calc_tx_sec_flits(const struct sk_buff *skb, flits += (sizeof(struct fw_ulptx_wr) + sizeof(struct chcr_ipsec_req) + kctx_len + - sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64); + sizeof(struct cpl_tx_pkt_core) + + aadivlen) / sizeof(__be64); return flits; } +inline void *copy_esn_pktxt(struct sk_buff *skb, + struct net_device *dev, + void *pos, + struct ipsec_sa_entry *sa_entry) +{ + struct chcr_ipsec_aadiv *aadiv; + struct ulptx_idata *sc_imm; + struct ip_esp_hdr *esphdr; + struct xfrm_offload *xo; + struct sge_eth_txq *q; + struct adapter *adap; + struct port_info *pi; + __be64 seqno; + u32 qidx; + u32 seqlo; + u8 *iv; + int eoq; + int len; + + pi = netdev_priv(dev); + adap = pi->adapter; + qidx = skb->queue_mapping; + q = &adap->sge.ethtxq[qidx + pi->first_qset]; + + /* end of queue, reset pos to start of queue */ + eoq = (void *)q->q.stat - pos; + if (!eoq) + pos = q->q.desc; + + len = DIV_ROUND_UP(sizeof(struct chcr_ipsec_aadiv), 16) << 4; + memset(pos, 0, len); + aadiv = (struct chcr_ipsec_aadiv *)pos; + esphdr = (struct ip_esp_hdr *)skb_transport_header(skb); + iv = skb_transport_header(skb) + sizeof(struct ip_esp_hdr); + xo = xfrm_offload(skb); + + aadiv->spi = (esphdr->spi); + seqlo = htonl(esphdr->seq_no); + seqno = cpu_to_be64(seqlo + ((u64)xo->seq.hi << 32)); + memcpy(aadiv->seq_no, &seqno, 8); + iv = skb_transport_header(skb) + sizeof(struct ip_esp_hdr); + memcpy(aadiv->iv, iv, 8); + + if (sa_entry->imm) { + sc_imm = (struct ulptx_idata *)(pos + + (DIV_ROUND_UP(sizeof(struct chcr_ipsec_aadiv), + sizeof(__be64)) << 3)); + sc_imm->cmd_more = FILL_CMD_MORE(!sa_entry->imm); + sc_imm->len = cpu_to_be32(sa_entry->imm); + } + pos += len; + return pos; +} + inline void *copy_cpltx_pktxt(struct sk_buff *skb, - struct net_device *dev, - void *pos) + struct net_device *dev, + void *pos, + struct ipsec_sa_entry *sa_entry) { struct cpl_tx_pkt_core *cpl; struct sge_eth_txq *q; @@ -382,6 +464,9 @@ inline void *copy_cpltx_pktxt(struct sk_buff *skb, cpl->ctrl1 = cpu_to_be64(cntrl); pos += sizeof(struct cpl_tx_pkt_core); + /* Copy ESN info for HW */ + if (sa_entry->esn) + pos = copy_esn_pktxt(skb, dev, pos, sa_entry); return pos; } @@ -428,7 +513,7 @@ inline void *copy_key_cpltx_pktxt(struct sk_buff *skb, pos = (u8 *)q->q.desc + (key_len - left); } /* Copy CPL TX PKT XT */ - pos = copy_cpltx_pktxt(skb, dev, pos); + pos = copy_cpltx_pktxt(skb, dev, pos, sa_entry); return pos; } @@ -441,10 +526,16 @@ inline void *chcr_crypto_wreq(struct sk_buff *skb, { struct port_info *pi = netdev_priv(dev); struct adapter *adap = pi->adapter; - unsigned int immdatalen = 0; unsigned int ivsize = GCM_ESP_IV_SIZE; struct chcr_ipsec_wr *wr; + u16 immdatalen = 0; unsigned int flits; + u32 ivinoffset; + u32 aadstart; + u32 aadstop; + u32 ciphstart; + u32 ivdrop = 0; + u32 esnlen = 0; u32 wr_mid; int qidx = skb_get_queue_mapping(skb); struct sge_eth_txq *q = &adap->sge.ethtxq[qidx + pi->first_qset]; @@ -453,10 +544,17 @@ inline void *chcr_crypto_wreq(struct sk_buff *skb, atomic_inc(&adap->chcr_stats.ipsec_cnt); - flits = calc_tx_sec_flits(skb, kctx_len); + flits = calc_tx_sec_flits(skb, sa_entry); + if (sa_entry->esn) + ivdrop = 1; - if (is_eth_imm(skb, kctx_len)) + if (is_eth_imm(skb, sa_entry)) { immdatalen = skb->len; + sa_entry->imm = immdatalen; + } + + if (sa_entry->esn) + esnlen = sizeof(struct chcr_ipsec_aadiv); /* WR Header */ wr = (struct chcr_ipsec_wr *)pos; @@ -481,33 +579,38 @@ inline void *chcr_crypto_wreq(struct sk_buff *skb, sizeof(wr->req.key_ctx) + kctx_len + sizeof(struct cpl_tx_pkt_core) + - immdatalen); + esnlen + + (esnlen ? 0 : immdatalen)); /* CPL_SEC_PDU */ + ivinoffset = sa_entry->esn ? (ESN_IV_INSERT_OFFSET + 1) : + (skb_transport_offset(skb) + + sizeof(struct ip_esp_hdr) + 1); wr->req.sec_cpl.op_ivinsrtofst = htonl( CPL_TX_SEC_PDU_OPCODE_V(CPL_TX_SEC_PDU) | CPL_TX_SEC_PDU_CPLLEN_V(2) | CPL_TX_SEC_PDU_PLACEHOLDER_V(1) | CPL_TX_SEC_PDU_IVINSRTOFST_V( - (skb_transport_offset(skb) + - sizeof(struct ip_esp_hdr) + 1))); + ivinoffset)); - wr->req.sec_cpl.pldlen = htonl(skb->len); + wr->req.sec_cpl.pldlen = htonl(skb->len + esnlen); + aadstart = sa_entry->esn ? 1 : (skb_transport_offset(skb) + 1); + aadstop = sa_entry->esn ? ESN_IV_INSERT_OFFSET : + (skb_transport_offset(skb) + + sizeof(struct ip_esp_hdr)); + ciphstart = skb_transport_offset(skb) + sizeof(struct ip_esp_hdr) + + GCM_ESP_IV_SIZE + 1; + ciphstart += sa_entry->esn ? esnlen : 0; wr->req.sec_cpl.aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI( - (skb_transport_offset(skb) + 1), - (skb_transport_offset(skb) + - sizeof(struct ip_esp_hdr)), - (skb_transport_offset(skb) + - sizeof(struct ip_esp_hdr) + - GCM_ESP_IV_SIZE + 1), 0); + aadstart, + aadstop, + ciphstart, 0); wr->req.sec_cpl.cipherstop_lo_authinsert = - FILL_SEC_CPL_AUTHINSERT(0, skb_transport_offset(skb) + - sizeof(struct ip_esp_hdr) + - GCM_ESP_IV_SIZE + 1, - sa_entry->authsize, - sa_entry->authsize); + FILL_SEC_CPL_AUTHINSERT(0, ciphstart, + sa_entry->authsize, + sa_entry->authsize); wr->req.sec_cpl.seqno_numivs = FILL_SEC_CPL_SCMD0_SEQNO(CHCR_ENCRYPT_OP, 1, CHCR_SCMD_CIPHER_MODE_AES_GCM, @@ -515,7 +618,7 @@ inline void *chcr_crypto_wreq(struct sk_buff *skb, sa_entry->hmac_ctrl, ivsize >> 1); wr->req.sec_cpl.ivgen_hdrlen = FILL_SEC_CPL_IVGEN_HDRLEN(0, 0, 1, - 0, 0, 0); + 0, ivdrop, 0); pos += sizeof(struct fw_ulptx_wr) + sizeof(struct ulp_txpkt) + @@ -593,7 +696,7 @@ out_free: dev_kfree_skb_any(skb); cxgb4_reclaim_completed_tx(adap, &q->q, true); - flits = calc_tx_sec_flits(skb, sa_entry->kctx_len); + flits = calc_tx_sec_flits(skb, sa_entry); ndesc = flits_to_desc(flits); credits = txq_avail(&q->q) - ndesc; @@ -606,7 +709,7 @@ out_free: dev_kfree_skb_any(skb); return NETDEV_TX_BUSY; } - if (is_eth_imm(skb, kctx_len)) + if (is_eth_imm(skb, sa_entry)) immediate = true; if (!immediate && -- cgit v1.2.3-59-g8ed1b From 88d905e20b11f7ad841e3afddaf1d59b6693c4a1 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 1 Dec 2018 04:56:30 -0500 Subject: crypto: cavium/nitrox - convert to DEFINE_SHOW_ATTRIBUTE Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code. Signed-off-by: Yangtao Li Signed-off-by: Herbert Xu --- drivers/crypto/cavium/nitrox/nitrox_debugfs.c | 48 +++++---------------------- 1 file changed, 9 insertions(+), 39 deletions(-) diff --git a/drivers/crypto/cavium/nitrox/nitrox_debugfs.c b/drivers/crypto/cavium/nitrox/nitrox_debugfs.c index 5f3cd5fafe04..0196b992280f 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_debugfs.c +++ b/drivers/crypto/cavium/nitrox/nitrox_debugfs.c @@ -13,18 +13,7 @@ static int firmware_show(struct seq_file *s, void *v) return 0; } -static int firmware_open(struct inode *inode, struct file *file) -{ - return single_open(file, firmware_show, inode->i_private); -} - -static const struct file_operations firmware_fops = { - .owner = THIS_MODULE, - .open = firmware_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(firmware); static int device_show(struct seq_file *s, void *v) { @@ -41,18 +30,7 @@ static int device_show(struct seq_file *s, void *v) return 0; } -static int nitrox_open(struct inode *inode, struct file *file) -{ - return single_open(file, device_show, inode->i_private); -} - -static const struct file_operations nitrox_fops = { - .owner = THIS_MODULE, - .open = nitrox_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(device); static int stats_show(struct seq_file *s, void *v) { @@ -69,18 +47,7 @@ static int stats_show(struct seq_file *s, void *v) return 0; } -static int nitrox_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, stats_show, inode->i_private); -} - -static const struct file_operations nitrox_stats_fops = { - .owner = THIS_MODULE, - .open = nitrox_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(stats); void nitrox_debugfs_exit(struct nitrox_device *ndev) { @@ -97,13 +64,16 @@ int nitrox_debugfs_init(struct nitrox_device *ndev) return -ENOMEM; ndev->debugfs_dir = dir; - f = debugfs_create_file("firmware", 0400, dir, ndev, &firmware_fops); + f = debugfs_create_file("firmware", 0400, dir, ndev, + &firmware_fops); if (!f) goto err; - f = debugfs_create_file("device", 0400, dir, ndev, &nitrox_fops); + f = debugfs_create_file("device", 0400, dir, ndev, + &device_fops); if (!f) goto err; - f = debugfs_create_file("stats", 0400, dir, ndev, &nitrox_stats_fops); + f = debugfs_create_file("stats", 0400, dir, ndev, + &stats_fops); if (!f) goto err; -- cgit v1.2.3-59-g8ed1b From a00fa0c88774bea9a102fc616598d9ee52765451 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 3 Dec 2018 19:52:49 -0800 Subject: crypto: arm64/nhpoly1305 - add NEON-accelerated NHPoly1305 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an ARM64 NEON implementation of NHPoly1305, an ε-almost-∆-universal hash function used in the Adiantum encryption mode. For now, only the NH portion is actually NEON-accelerated; the Poly1305 part is less performance-critical so is just implemented in C. Reviewed-by: Ard Biesheuvel Tested-by: Ard Biesheuvel # big-endian Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 5 ++ arch/arm64/crypto/Makefile | 3 + arch/arm64/crypto/nh-neon-core.S | 103 +++++++++++++++++++++++++++++++ arch/arm64/crypto/nhpoly1305-neon-glue.c | 77 +++++++++++++++++++++++ 4 files changed, 188 insertions(+) create mode 100644 arch/arm64/crypto/nh-neon-core.S create mode 100644 arch/arm64/crypto/nhpoly1305-neon-glue.c diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index a5606823ed4d..3f5aeb786192 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -106,6 +106,11 @@ config CRYPTO_CHACHA20_NEON select CRYPTO_BLKCIPHER select CRYPTO_CHACHA20 +config CRYPTO_NHPOLY1305_NEON + tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)" + depends on KERNEL_MODE_NEON + select CRYPTO_NHPOLY1305 + config CRYPTO_AES_ARM64_BS tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm" depends on KERNEL_MODE_NEON diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index f476fede09ba..125dbb10a93e 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -53,6 +53,9 @@ sha512-arm64-y := sha512-glue.o sha512-core.o obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o +obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o +nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o + obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o diff --git a/arch/arm64/crypto/nh-neon-core.S b/arch/arm64/crypto/nh-neon-core.S new file mode 100644 index 000000000000..e05570c38de7 --- /dev/null +++ b/arch/arm64/crypto/nh-neon-core.S @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * NH - ε-almost-universal hash function, ARM64 NEON accelerated version + * + * Copyright 2018 Google LLC + * + * Author: Eric Biggers + */ + +#include + + KEY .req x0 + MESSAGE .req x1 + MESSAGE_LEN .req x2 + HASH .req x3 + + PASS0_SUMS .req v0 + PASS1_SUMS .req v1 + PASS2_SUMS .req v2 + PASS3_SUMS .req v3 + K0 .req v4 + K1 .req v5 + K2 .req v6 + K3 .req v7 + T0 .req v8 + T1 .req v9 + T2 .req v10 + T3 .req v11 + T4 .req v12 + T5 .req v13 + T6 .req v14 + T7 .req v15 + +.macro _nh_stride k0, k1, k2, k3 + + // Load next message stride + ld1 {T3.16b}, [MESSAGE], #16 + + // Load next key stride + ld1 {\k3\().4s}, [KEY], #16 + + // Add message words to key words + add T0.4s, T3.4s, \k0\().4s + add T1.4s, T3.4s, \k1\().4s + add T2.4s, T3.4s, \k2\().4s + add T3.4s, T3.4s, \k3\().4s + + // Multiply 32x32 => 64 and accumulate + mov T4.d[0], T0.d[1] + mov T5.d[0], T1.d[1] + mov T6.d[0], T2.d[1] + mov T7.d[0], T3.d[1] + umlal PASS0_SUMS.2d, T0.2s, T4.2s + umlal PASS1_SUMS.2d, T1.2s, T5.2s + umlal PASS2_SUMS.2d, T2.2s, T6.2s + umlal PASS3_SUMS.2d, T3.2s, T7.2s +.endm + +/* + * void nh_neon(const u32 *key, const u8 *message, size_t message_len, + * u8 hash[NH_HASH_BYTES]) + * + * It's guaranteed that message_len % 16 == 0. + */ +ENTRY(nh_neon) + + ld1 {K0.4s,K1.4s}, [KEY], #32 + movi PASS0_SUMS.2d, #0 + movi PASS1_SUMS.2d, #0 + ld1 {K2.4s}, [KEY], #16 + movi PASS2_SUMS.2d, #0 + movi PASS3_SUMS.2d, #0 + + subs MESSAGE_LEN, MESSAGE_LEN, #64 + blt .Lloop4_done +.Lloop4: + _nh_stride K0, K1, K2, K3 + _nh_stride K1, K2, K3, K0 + _nh_stride K2, K3, K0, K1 + _nh_stride K3, K0, K1, K2 + subs MESSAGE_LEN, MESSAGE_LEN, #64 + bge .Lloop4 + +.Lloop4_done: + ands MESSAGE_LEN, MESSAGE_LEN, #63 + beq .Ldone + _nh_stride K0, K1, K2, K3 + + subs MESSAGE_LEN, MESSAGE_LEN, #16 + beq .Ldone + _nh_stride K1, K2, K3, K0 + + subs MESSAGE_LEN, MESSAGE_LEN, #16 + beq .Ldone + _nh_stride K2, K3, K0, K1 + +.Ldone: + // Sum the accumulators for each pass, then store the sums to 'hash' + addp T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d + addp T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d + st1 {T0.16b,T1.16b}, [HASH] + ret +ENDPROC(nh_neon) diff --git a/arch/arm64/crypto/nhpoly1305-neon-glue.c b/arch/arm64/crypto/nhpoly1305-neon-glue.c new file mode 100644 index 000000000000..22cc32ac9448 --- /dev/null +++ b/arch/arm64/crypto/nhpoly1305-neon-glue.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum + * (ARM64 NEON accelerated version) + * + * Copyright 2018 Google LLC + */ + +#include +#include +#include +#include +#include + +asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len, + u8 hash[NH_HASH_BYTES]); + +/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */ +static void _nh_neon(const u32 *key, const u8 *message, size_t message_len, + __le64 hash[NH_NUM_PASSES]) +{ + nh_neon(key, message, message_len, (u8 *)hash); +} + +static int nhpoly1305_neon_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + if (srclen < 64 || !may_use_simd()) + return crypto_nhpoly1305_update(desc, src, srclen); + + do { + unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE); + + kernel_neon_begin(); + crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon); + kernel_neon_end(); + src += n; + srclen -= n; + } while (srclen); + return 0; +} + +static struct shash_alg nhpoly1305_alg = { + .base.cra_name = "nhpoly1305", + .base.cra_driver_name = "nhpoly1305-neon", + .base.cra_priority = 200, + .base.cra_ctxsize = sizeof(struct nhpoly1305_key), + .base.cra_module = THIS_MODULE, + .digestsize = POLY1305_DIGEST_SIZE, + .init = crypto_nhpoly1305_init, + .update = nhpoly1305_neon_update, + .final = crypto_nhpoly1305_final, + .setkey = crypto_nhpoly1305_setkey, + .descsize = sizeof(struct nhpoly1305_state), +}; + +static int __init nhpoly1305_mod_init(void) +{ + if (!(elf_hwcap & HWCAP_ASIMD)) + return -ENODEV; + + return crypto_register_shash(&nhpoly1305_alg); +} + +static void __exit nhpoly1305_mod_exit(void) +{ + crypto_unregister_shash(&nhpoly1305_alg); +} + +module_init(nhpoly1305_mod_init); +module_exit(nhpoly1305_mod_exit); + +MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (NEON-accelerated)"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Eric Biggers "); +MODULE_ALIAS_CRYPTO("nhpoly1305"); +MODULE_ALIAS_CRYPTO("nhpoly1305-neon"); -- cgit v1.2.3-59-g8ed1b From cc7cf991e9eb54cac7733dc7d8f3a8591ba6e1c3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 3 Dec 2018 19:52:50 -0800 Subject: crypto: arm64/chacha20 - add XChaCha20 support Add an XChaCha20 implementation that is hooked up to the ARM64 NEON implementation of ChaCha20. This can be used by Adiantum. A NEON implementation of single-block HChaCha20 is also added so that XChaCha20 can use it rather than the generic implementation. This required refactoring the ChaCha20 permutation into its own function. Signed-off-by: Eric Biggers Reviewed-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 2 +- arch/arm64/crypto/chacha20-neon-core.S | 65 +++++++++++++++------ arch/arm64/crypto/chacha20-neon-glue.c | 101 +++++++++++++++++++++++++-------- 3 files changed, 125 insertions(+), 43 deletions(-) diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 3f5aeb786192..d54ddb8468ef 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -101,7 +101,7 @@ config CRYPTO_AES_ARM64_NEON_BLK select CRYPTO_SIMD config CRYPTO_CHACHA20_NEON - tristate "NEON accelerated ChaCha20 symmetric cipher" + tristate "ChaCha20 and XChaCha20 stream ciphers using NEON instructions" depends on KERNEL_MODE_NEON select CRYPTO_BLKCIPHER select CRYPTO_CHACHA20 diff --git a/arch/arm64/crypto/chacha20-neon-core.S b/arch/arm64/crypto/chacha20-neon-core.S index 13c85e272c2a..0571e45a1a0a 100644 --- a/arch/arm64/crypto/chacha20-neon-core.S +++ b/arch/arm64/crypto/chacha20-neon-core.S @@ -23,25 +23,20 @@ .text .align 6 -ENTRY(chacha20_block_xor_neon) - // x0: Input state matrix, s - // x1: 1 data block output, o - // x2: 1 data block input, i - - // - // This function encrypts one ChaCha20 block by loading the state matrix - // in four NEON registers. It performs matrix operation on four words in - // parallel, but requires shuffling to rearrange the words after each - // round. - // - - // x0..3 = s0..3 - adr x3, ROT8 - ld1 {v0.4s-v3.4s}, [x0] - ld1 {v8.4s-v11.4s}, [x0] - ld1 {v12.4s}, [x3] +/* + * chacha20_permute - permute one block + * + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers v0-v3. It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + * + * Clobbers: x3, x10, v4, v12 + */ +chacha20_permute: mov x3, #10 + adr x10, ROT8 + ld1 {v12.4s}, [x10] .Ldoubleround: // x0 += x1, x3 = rotl32(x3 ^ x0, 16) @@ -105,6 +100,23 @@ ENTRY(chacha20_block_xor_neon) subs x3, x3, #1 b.ne .Ldoubleround + ret +ENDPROC(chacha20_permute) + +ENTRY(chacha20_block_xor_neon) + // x0: Input state matrix, s + // x1: 1 data block output, o + // x2: 1 data block input, i + + stp x29, x30, [sp, #-16]! + mov x29, sp + + // x0..3 = s0..3 + ld1 {v0.4s-v3.4s}, [x0] + ld1 {v8.4s-v11.4s}, [x0] + + bl chacha20_permute + ld1 {v4.16b-v7.16b}, [x2] // o0 = i0 ^ (x0 + s0) @@ -125,9 +137,28 @@ ENTRY(chacha20_block_xor_neon) st1 {v0.16b-v3.16b}, [x1] + ldp x29, x30, [sp], #16 ret ENDPROC(chacha20_block_xor_neon) +ENTRY(hchacha20_block_neon) + // x0: Input state matrix, s + // x1: output (8 32-bit words) + + stp x29, x30, [sp, #-16]! + mov x29, sp + + ld1 {v0.4s-v3.4s}, [x0] + + bl chacha20_permute + + st1 {v0.16b}, [x1], #16 + st1 {v3.16b}, [x1] + + ldp x29, x30, [sp], #16 + ret +ENDPROC(hchacha20_block_neon) + .align 6 ENTRY(chacha20_4block_xor_neon) // x0: Input state matrix, s diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c index 96e0cfb8c3f5..a5b9cbc0c4de 100644 --- a/arch/arm64/crypto/chacha20-neon-glue.c +++ b/arch/arm64/crypto/chacha20-neon-glue.c @@ -30,6 +30,7 @@ asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); +asmlinkage void hchacha20_block_neon(const u32 *state, u32 *out); static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src, unsigned int bytes) @@ -65,20 +66,16 @@ static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src, kernel_neon_end(); } -static int chacha20_neon(struct skcipher_request *req) +static int chacha20_neon_stream_xor(struct skcipher_request *req, + struct chacha_ctx *ctx, u8 *iv) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; u32 state[16]; int err; - if (!may_use_simd() || req->cryptlen <= CHACHA_BLOCK_SIZE) - return crypto_chacha_crypt(req); - err = skcipher_walk_virt(&walk, req, false); - crypto_chacha_init(state, ctx, walk.iv); + crypto_chacha_init(state, ctx, iv); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; @@ -94,22 +91,73 @@ static int chacha20_neon(struct skcipher_request *req) return err; } -static struct skcipher_alg alg = { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 4 * CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = chacha20_neon, - .decrypt = chacha20_neon, +static int chacha20_neon(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + return crypto_chacha_crypt(req); + + return chacha20_neon_stream_xor(req, ctx, req->iv); +} + +static int xchacha20_neon(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 state[16]; + u8 real_iv[16]; + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + return crypto_xchacha_crypt(req); + + crypto_chacha_init(state, ctx, req->iv); + + kernel_neon_begin(); + hchacha20_block_neon(state, subctx.key); + kernel_neon_end(); + + memcpy(&real_iv[0], req->iv + 24, 8); + memcpy(&real_iv[8], req->iv + 16, 8); + return chacha20_neon_stream_xor(req, &subctx, real_iv); +} + +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = chacha20_neon, + .decrypt = chacha20_neon, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = xchacha20_neon, + .decrypt = xchacha20_neon, + } }; static int __init chacha20_simd_mod_init(void) @@ -117,12 +165,12 @@ static int __init chacha20_simd_mod_init(void) if (!(elf_hwcap & HWCAP_ASIMD)) return -ENODEV; - return crypto_register_skcipher(&alg); + return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); } static void __exit chacha20_simd_mod_fini(void) { - crypto_unregister_skcipher(&alg); + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } module_init(chacha20_simd_mod_init); @@ -131,3 +179,6 @@ module_exit(chacha20_simd_mod_fini); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-neon"); +MODULE_ALIAS_CRYPTO("xchacha20"); +MODULE_ALIAS_CRYPTO("xchacha20-neon"); -- cgit v1.2.3-59-g8ed1b From 95a34b779e2a45b14e73cee1e7eec11870efb2ea Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 3 Dec 2018 19:52:51 -0800 Subject: crypto: arm64/chacha20 - refactor to allow varying number of rounds In preparation for adding XChaCha12 support, rename/refactor the ARM64 NEON implementation of ChaCha20 to support different numbers of rounds. Reviewed-by: Ard Biesheuvel Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm64/crypto/Makefile | 4 +- arch/arm64/crypto/chacha-neon-core.S | 484 +++++++++++++++++++++++++++++++++ arch/arm64/crypto/chacha-neon-glue.c | 189 +++++++++++++ arch/arm64/crypto/chacha20-neon-core.S | 481 -------------------------------- arch/arm64/crypto/chacha20-neon-glue.c | 184 ------------- 5 files changed, 675 insertions(+), 667 deletions(-) create mode 100644 arch/arm64/crypto/chacha-neon-core.S create mode 100644 arch/arm64/crypto/chacha-neon-glue.c delete mode 100644 arch/arm64/crypto/chacha20-neon-core.S delete mode 100644 arch/arm64/crypto/chacha20-neon-glue.c diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index 125dbb10a93e..a4ffd9fe3265 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -50,8 +50,8 @@ sha256-arm64-y := sha256-glue.o sha256-core.o obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o sha512-arm64-y := sha512-glue.o sha512-core.o -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o -chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o +obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o +chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S new file mode 100644 index 000000000000..3d3a12db5204 --- /dev/null +++ b/arch/arm64/crypto/chacha-neon-core.S @@ -0,0 +1,484 @@ +/* + * ChaCha/XChaCha NEON helper functions + * + * Copyright (C) 2016 Linaro, Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include + + .text + .align 6 + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers v0-v3. It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + * + * The round count is given in w3. + * + * Clobbers: w3, x10, v4, v12 + */ +chacha_permute: + + adr x10, ROT8 + ld1 {v12.4s}, [x10] + +.Ldoubleround: + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + tbl v3.16b, {v3.16b}, v12.16b + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + ext v1.16b, v1.16b, v1.16b, #4 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + ext v3.16b, v3.16b, v3.16b, #12 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + tbl v3.16b, {v3.16b}, v12.16b + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + ext v1.16b, v1.16b, v1.16b, #12 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + ext v3.16b, v3.16b, v3.16b, #4 + + subs w3, w3, #2 + b.ne .Ldoubleround + + ret +ENDPROC(chacha_permute) + +ENTRY(chacha_block_xor_neon) + // x0: Input state matrix, s + // x1: 1 data block output, o + // x2: 1 data block input, i + // w3: nrounds + + stp x29, x30, [sp, #-16]! + mov x29, sp + + // x0..3 = s0..3 + ld1 {v0.4s-v3.4s}, [x0] + ld1 {v8.4s-v11.4s}, [x0] + + bl chacha_permute + + ld1 {v4.16b-v7.16b}, [x2] + + // o0 = i0 ^ (x0 + s0) + add v0.4s, v0.4s, v8.4s + eor v0.16b, v0.16b, v4.16b + + // o1 = i1 ^ (x1 + s1) + add v1.4s, v1.4s, v9.4s + eor v1.16b, v1.16b, v5.16b + + // o2 = i2 ^ (x2 + s2) + add v2.4s, v2.4s, v10.4s + eor v2.16b, v2.16b, v6.16b + + // o3 = i3 ^ (x3 + s3) + add v3.4s, v3.4s, v11.4s + eor v3.16b, v3.16b, v7.16b + + st1 {v0.16b-v3.16b}, [x1] + + ldp x29, x30, [sp], #16 + ret +ENDPROC(chacha_block_xor_neon) + +ENTRY(hchacha_block_neon) + // x0: Input state matrix, s + // x1: output (8 32-bit words) + // w2: nrounds + + stp x29, x30, [sp, #-16]! + mov x29, sp + + ld1 {v0.4s-v3.4s}, [x0] + + mov w3, w2 + bl chacha_permute + + st1 {v0.16b}, [x1], #16 + st1 {v3.16b}, [x1] + + ldp x29, x30, [sp], #16 + ret +ENDPROC(hchacha_block_neon) + + .align 6 +ENTRY(chacha_4block_xor_neon) + // x0: Input state matrix, s + // x1: 4 data blocks output, o + // x2: 4 data blocks input, i + // w3: nrounds + + // + // This function encrypts four consecutive ChaCha blocks by loading + // the state matrix in NEON registers four times. The algorithm performs + // each operation on the corresponding word of each state matrix, hence + // requires no word shuffling. For final XORing step we transpose the + // matrix by interleaving 32- and then 64-bit words, which allows us to + // do XOR in NEON registers. + // + adr x9, CTRINC // ... and ROT8 + ld1 {v30.4s-v31.4s}, [x9] + + // x0..15[0-3] = s0..3[0..3] + mov x4, x0 + ld4r { v0.4s- v3.4s}, [x4], #16 + ld4r { v4.4s- v7.4s}, [x4], #16 + ld4r { v8.4s-v11.4s}, [x4], #16 + ld4r {v12.4s-v15.4s}, [x4] + + // x12 += counter values 0-3 + add v12.4s, v12.4s, v30.4s + +.Ldoubleround4: + // x0 += x4, x12 = rotl32(x12 ^ x0, 16) + // x1 += x5, x13 = rotl32(x13 ^ x1, 16) + // x2 += x6, x14 = rotl32(x14 ^ x2, 16) + // x3 += x7, x15 = rotl32(x15 ^ x3, 16) + add v0.4s, v0.4s, v4.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + + eor v12.16b, v12.16b, v0.16b + eor v13.16b, v13.16b, v1.16b + eor v14.16b, v14.16b, v2.16b + eor v15.16b, v15.16b, v3.16b + + rev32 v12.8h, v12.8h + rev32 v13.8h, v13.8h + rev32 v14.8h, v14.8h + rev32 v15.8h, v15.8h + + // x8 += x12, x4 = rotl32(x4 ^ x8, 12) + // x9 += x13, x5 = rotl32(x5 ^ x9, 12) + // x10 += x14, x6 = rotl32(x6 ^ x10, 12) + // x11 += x15, x7 = rotl32(x7 ^ x11, 12) + add v8.4s, v8.4s, v12.4s + add v9.4s, v9.4s, v13.4s + add v10.4s, v10.4s, v14.4s + add v11.4s, v11.4s, v15.4s + + eor v16.16b, v4.16b, v8.16b + eor v17.16b, v5.16b, v9.16b + eor v18.16b, v6.16b, v10.16b + eor v19.16b, v7.16b, v11.16b + + shl v4.4s, v16.4s, #12 + shl v5.4s, v17.4s, #12 + shl v6.4s, v18.4s, #12 + shl v7.4s, v19.4s, #12 + + sri v4.4s, v16.4s, #20 + sri v5.4s, v17.4s, #20 + sri v6.4s, v18.4s, #20 + sri v7.4s, v19.4s, #20 + + // x0 += x4, x12 = rotl32(x12 ^ x0, 8) + // x1 += x5, x13 = rotl32(x13 ^ x1, 8) + // x2 += x6, x14 = rotl32(x14 ^ x2, 8) + // x3 += x7, x15 = rotl32(x15 ^ x3, 8) + add v0.4s, v0.4s, v4.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + + eor v12.16b, v12.16b, v0.16b + eor v13.16b, v13.16b, v1.16b + eor v14.16b, v14.16b, v2.16b + eor v15.16b, v15.16b, v3.16b + + tbl v12.16b, {v12.16b}, v31.16b + tbl v13.16b, {v13.16b}, v31.16b + tbl v14.16b, {v14.16b}, v31.16b + tbl v15.16b, {v15.16b}, v31.16b + + // x8 += x12, x4 = rotl32(x4 ^ x8, 7) + // x9 += x13, x5 = rotl32(x5 ^ x9, 7) + // x10 += x14, x6 = rotl32(x6 ^ x10, 7) + // x11 += x15, x7 = rotl32(x7 ^ x11, 7) + add v8.4s, v8.4s, v12.4s + add v9.4s, v9.4s, v13.4s + add v10.4s, v10.4s, v14.4s + add v11.4s, v11.4s, v15.4s + + eor v16.16b, v4.16b, v8.16b + eor v17.16b, v5.16b, v9.16b + eor v18.16b, v6.16b, v10.16b + eor v19.16b, v7.16b, v11.16b + + shl v4.4s, v16.4s, #7 + shl v5.4s, v17.4s, #7 + shl v6.4s, v18.4s, #7 + shl v7.4s, v19.4s, #7 + + sri v4.4s, v16.4s, #25 + sri v5.4s, v17.4s, #25 + sri v6.4s, v18.4s, #25 + sri v7.4s, v19.4s, #25 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 16) + // x1 += x6, x12 = rotl32(x12 ^ x1, 16) + // x2 += x7, x13 = rotl32(x13 ^ x2, 16) + // x3 += x4, x14 = rotl32(x14 ^ x3, 16) + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v4.4s + + eor v15.16b, v15.16b, v0.16b + eor v12.16b, v12.16b, v1.16b + eor v13.16b, v13.16b, v2.16b + eor v14.16b, v14.16b, v3.16b + + rev32 v15.8h, v15.8h + rev32 v12.8h, v12.8h + rev32 v13.8h, v13.8h + rev32 v14.8h, v14.8h + + // x10 += x15, x5 = rotl32(x5 ^ x10, 12) + // x11 += x12, x6 = rotl32(x6 ^ x11, 12) + // x8 += x13, x7 = rotl32(x7 ^ x8, 12) + // x9 += x14, x4 = rotl32(x4 ^ x9, 12) + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v12.4s + add v8.4s, v8.4s, v13.4s + add v9.4s, v9.4s, v14.4s + + eor v16.16b, v5.16b, v10.16b + eor v17.16b, v6.16b, v11.16b + eor v18.16b, v7.16b, v8.16b + eor v19.16b, v4.16b, v9.16b + + shl v5.4s, v16.4s, #12 + shl v6.4s, v17.4s, #12 + shl v7.4s, v18.4s, #12 + shl v4.4s, v19.4s, #12 + + sri v5.4s, v16.4s, #20 + sri v6.4s, v17.4s, #20 + sri v7.4s, v18.4s, #20 + sri v4.4s, v19.4s, #20 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 8) + // x1 += x6, x12 = rotl32(x12 ^ x1, 8) + // x2 += x7, x13 = rotl32(x13 ^ x2, 8) + // x3 += x4, x14 = rotl32(x14 ^ x3, 8) + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v4.4s + + eor v15.16b, v15.16b, v0.16b + eor v12.16b, v12.16b, v1.16b + eor v13.16b, v13.16b, v2.16b + eor v14.16b, v14.16b, v3.16b + + tbl v15.16b, {v15.16b}, v31.16b + tbl v12.16b, {v12.16b}, v31.16b + tbl v13.16b, {v13.16b}, v31.16b + tbl v14.16b, {v14.16b}, v31.16b + + // x10 += x15, x5 = rotl32(x5 ^ x10, 7) + // x11 += x12, x6 = rotl32(x6 ^ x11, 7) + // x8 += x13, x7 = rotl32(x7 ^ x8, 7) + // x9 += x14, x4 = rotl32(x4 ^ x9, 7) + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v12.4s + add v8.4s, v8.4s, v13.4s + add v9.4s, v9.4s, v14.4s + + eor v16.16b, v5.16b, v10.16b + eor v17.16b, v6.16b, v11.16b + eor v18.16b, v7.16b, v8.16b + eor v19.16b, v4.16b, v9.16b + + shl v5.4s, v16.4s, #7 + shl v6.4s, v17.4s, #7 + shl v7.4s, v18.4s, #7 + shl v4.4s, v19.4s, #7 + + sri v5.4s, v16.4s, #25 + sri v6.4s, v17.4s, #25 + sri v7.4s, v18.4s, #25 + sri v4.4s, v19.4s, #25 + + subs w3, w3, #2 + b.ne .Ldoubleround4 + + ld4r {v16.4s-v19.4s}, [x0], #16 + ld4r {v20.4s-v23.4s}, [x0], #16 + + // x12 += counter values 0-3 + add v12.4s, v12.4s, v30.4s + + // x0[0-3] += s0[0] + // x1[0-3] += s0[1] + // x2[0-3] += s0[2] + // x3[0-3] += s0[3] + add v0.4s, v0.4s, v16.4s + add v1.4s, v1.4s, v17.4s + add v2.4s, v2.4s, v18.4s + add v3.4s, v3.4s, v19.4s + + ld4r {v24.4s-v27.4s}, [x0], #16 + ld4r {v28.4s-v31.4s}, [x0] + + // x4[0-3] += s1[0] + // x5[0-3] += s1[1] + // x6[0-3] += s1[2] + // x7[0-3] += s1[3] + add v4.4s, v4.4s, v20.4s + add v5.4s, v5.4s, v21.4s + add v6.4s, v6.4s, v22.4s + add v7.4s, v7.4s, v23.4s + + // x8[0-3] += s2[0] + // x9[0-3] += s2[1] + // x10[0-3] += s2[2] + // x11[0-3] += s2[3] + add v8.4s, v8.4s, v24.4s + add v9.4s, v9.4s, v25.4s + add v10.4s, v10.4s, v26.4s + add v11.4s, v11.4s, v27.4s + + // x12[0-3] += s3[0] + // x13[0-3] += s3[1] + // x14[0-3] += s3[2] + // x15[0-3] += s3[3] + add v12.4s, v12.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v14.4s, v14.4s, v30.4s + add v15.4s, v15.4s, v31.4s + + // interleave 32-bit words in state n, n+1 + zip1 v16.4s, v0.4s, v1.4s + zip2 v17.4s, v0.4s, v1.4s + zip1 v18.4s, v2.4s, v3.4s + zip2 v19.4s, v2.4s, v3.4s + zip1 v20.4s, v4.4s, v5.4s + zip2 v21.4s, v4.4s, v5.4s + zip1 v22.4s, v6.4s, v7.4s + zip2 v23.4s, v6.4s, v7.4s + zip1 v24.4s, v8.4s, v9.4s + zip2 v25.4s, v8.4s, v9.4s + zip1 v26.4s, v10.4s, v11.4s + zip2 v27.4s, v10.4s, v11.4s + zip1 v28.4s, v12.4s, v13.4s + zip2 v29.4s, v12.4s, v13.4s + zip1 v30.4s, v14.4s, v15.4s + zip2 v31.4s, v14.4s, v15.4s + + // interleave 64-bit words in state n, n+2 + zip1 v0.2d, v16.2d, v18.2d + zip2 v4.2d, v16.2d, v18.2d + zip1 v8.2d, v17.2d, v19.2d + zip2 v12.2d, v17.2d, v19.2d + ld1 {v16.16b-v19.16b}, [x2], #64 + + zip1 v1.2d, v20.2d, v22.2d + zip2 v5.2d, v20.2d, v22.2d + zip1 v9.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + ld1 {v20.16b-v23.16b}, [x2], #64 + + zip1 v2.2d, v24.2d, v26.2d + zip2 v6.2d, v24.2d, v26.2d + zip1 v10.2d, v25.2d, v27.2d + zip2 v14.2d, v25.2d, v27.2d + ld1 {v24.16b-v27.16b}, [x2], #64 + + zip1 v3.2d, v28.2d, v30.2d + zip2 v7.2d, v28.2d, v30.2d + zip1 v11.2d, v29.2d, v31.2d + zip2 v15.2d, v29.2d, v31.2d + ld1 {v28.16b-v31.16b}, [x2] + + // xor with corresponding input, write to output + eor v16.16b, v16.16b, v0.16b + eor v17.16b, v17.16b, v1.16b + eor v18.16b, v18.16b, v2.16b + eor v19.16b, v19.16b, v3.16b + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v5.16b + st1 {v16.16b-v19.16b}, [x1], #64 + eor v22.16b, v22.16b, v6.16b + eor v23.16b, v23.16b, v7.16b + eor v24.16b, v24.16b, v8.16b + eor v25.16b, v25.16b, v9.16b + st1 {v20.16b-v23.16b}, [x1], #64 + eor v26.16b, v26.16b, v10.16b + eor v27.16b, v27.16b, v11.16b + eor v28.16b, v28.16b, v12.16b + st1 {v24.16b-v27.16b}, [x1], #64 + eor v29.16b, v29.16b, v13.16b + eor v30.16b, v30.16b, v14.16b + eor v31.16b, v31.16b, v15.16b + st1 {v28.16b-v31.16b}, [x1] + + ret +ENDPROC(chacha_4block_xor_neon) + +CTRINC: .word 0, 1, 2, 3 +ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c new file mode 100644 index 000000000000..4d992029b912 --- /dev/null +++ b/arch/arm64/crypto/chacha-neon-glue.c @@ -0,0 +1,189 @@ +/* + * ARM NEON accelerated ChaCha and XChaCha stream ciphers, + * including ChaCha20 (RFC7539) + * + * Copyright (C) 2016 - 2017 Linaro, Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src, + int nrounds); +asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src, + int nrounds); +asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); + +static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + u8 buf[CHACHA_BLOCK_SIZE]; + + while (bytes >= CHACHA_BLOCK_SIZE * 4) { + kernel_neon_begin(); + chacha_4block_xor_neon(state, dst, src, nrounds); + kernel_neon_end(); + bytes -= CHACHA_BLOCK_SIZE * 4; + src += CHACHA_BLOCK_SIZE * 4; + dst += CHACHA_BLOCK_SIZE * 4; + state[12] += 4; + } + + if (!bytes) + return; + + kernel_neon_begin(); + while (bytes >= CHACHA_BLOCK_SIZE) { + chacha_block_xor_neon(state, dst, src, nrounds); + bytes -= CHACHA_BLOCK_SIZE; + src += CHACHA_BLOCK_SIZE; + dst += CHACHA_BLOCK_SIZE; + state[12]++; + } + if (bytes) { + memcpy(buf, src, bytes); + chacha_block_xor_neon(state, buf, buf, nrounds); + memcpy(dst, buf, bytes); + } + kernel_neon_end(); +} + +static int chacha_neon_stream_xor(struct skcipher_request *req, + struct chacha_ctx *ctx, u8 *iv) +{ + struct skcipher_walk walk; + u32 state[16]; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + crypto_chacha_init(state, ctx, iv); + + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); + + chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr, + nbytes, ctx->nrounds); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + return err; +} + +static int chacha_neon(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + return crypto_chacha_crypt(req); + + return chacha_neon_stream_xor(req, ctx, req->iv); +} + +static int xchacha_neon(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 state[16]; + u8 real_iv[16]; + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + return crypto_xchacha_crypt(req); + + crypto_chacha_init(state, ctx, req->iv); + + kernel_neon_begin(); + hchacha_block_neon(state, subctx.key, ctx->nrounds); + kernel_neon_end(); + subctx.nrounds = ctx->nrounds; + + memcpy(&real_iv[0], req->iv + 24, 8); + memcpy(&real_iv[8], req->iv + 16, 8); + return chacha_neon_stream_xor(req, &subctx, real_iv); +} + +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = chacha_neon, + .decrypt = chacha_neon, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = xchacha_neon, + .decrypt = xchacha_neon, + } +}; + +static int __init chacha_simd_mod_init(void) +{ + if (!(elf_hwcap & HWCAP_ASIMD)) + return -ENODEV; + + return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); +} + +static void __exit chacha_simd_mod_fini(void) +{ + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); +} + +module_init(chacha_simd_mod_init); +module_exit(chacha_simd_mod_fini); + +MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)"); +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-neon"); +MODULE_ALIAS_CRYPTO("xchacha20"); +MODULE_ALIAS_CRYPTO("xchacha20-neon"); diff --git a/arch/arm64/crypto/chacha20-neon-core.S b/arch/arm64/crypto/chacha20-neon-core.S deleted file mode 100644 index 0571e45a1a0a..000000000000 --- a/arch/arm64/crypto/chacha20-neon-core.S +++ /dev/null @@ -1,481 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions - * - * Copyright (C) 2016 Linaro, Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include - - .text - .align 6 - -/* - * chacha20_permute - permute one block - * - * Permute one 64-byte block where the state matrix is stored in the four NEON - * registers v0-v3. It performs matrix operations on four words in parallel, - * but requires shuffling to rearrange the words after each round. - * - * Clobbers: x3, x10, v4, v12 - */ -chacha20_permute: - - mov x3, #10 - adr x10, ROT8 - ld1 {v12.4s}, [x10] - -.Ldoubleround: - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - rev32 v3.8h, v3.8h - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #12 - sri v1.4s, v4.4s, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - tbl v3.16b, {v3.16b}, v12.16b - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #7 - sri v1.4s, v4.4s, #25 - - // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - ext v1.16b, v1.16b, v1.16b, #4 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - ext v2.16b, v2.16b, v2.16b, #8 - // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - ext v3.16b, v3.16b, v3.16b, #12 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - rev32 v3.8h, v3.8h - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #12 - sri v1.4s, v4.4s, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - tbl v3.16b, {v3.16b}, v12.16b - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #7 - sri v1.4s, v4.4s, #25 - - // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - ext v1.16b, v1.16b, v1.16b, #12 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - ext v2.16b, v2.16b, v2.16b, #8 - // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - ext v3.16b, v3.16b, v3.16b, #4 - - subs x3, x3, #1 - b.ne .Ldoubleround - - ret -ENDPROC(chacha20_permute) - -ENTRY(chacha20_block_xor_neon) - // x0: Input state matrix, s - // x1: 1 data block output, o - // x2: 1 data block input, i - - stp x29, x30, [sp, #-16]! - mov x29, sp - - // x0..3 = s0..3 - ld1 {v0.4s-v3.4s}, [x0] - ld1 {v8.4s-v11.4s}, [x0] - - bl chacha20_permute - - ld1 {v4.16b-v7.16b}, [x2] - - // o0 = i0 ^ (x0 + s0) - add v0.4s, v0.4s, v8.4s - eor v0.16b, v0.16b, v4.16b - - // o1 = i1 ^ (x1 + s1) - add v1.4s, v1.4s, v9.4s - eor v1.16b, v1.16b, v5.16b - - // o2 = i2 ^ (x2 + s2) - add v2.4s, v2.4s, v10.4s - eor v2.16b, v2.16b, v6.16b - - // o3 = i3 ^ (x3 + s3) - add v3.4s, v3.4s, v11.4s - eor v3.16b, v3.16b, v7.16b - - st1 {v0.16b-v3.16b}, [x1] - - ldp x29, x30, [sp], #16 - ret -ENDPROC(chacha20_block_xor_neon) - -ENTRY(hchacha20_block_neon) - // x0: Input state matrix, s - // x1: output (8 32-bit words) - - stp x29, x30, [sp, #-16]! - mov x29, sp - - ld1 {v0.4s-v3.4s}, [x0] - - bl chacha20_permute - - st1 {v0.16b}, [x1], #16 - st1 {v3.16b}, [x1] - - ldp x29, x30, [sp], #16 - ret -ENDPROC(hchacha20_block_neon) - - .align 6 -ENTRY(chacha20_4block_xor_neon) - // x0: Input state matrix, s - // x1: 4 data blocks output, o - // x2: 4 data blocks input, i - - // - // This function encrypts four consecutive ChaCha20 blocks by loading - // the state matrix in NEON registers four times. The algorithm performs - // each operation on the corresponding word of each state matrix, hence - // requires no word shuffling. For final XORing step we transpose the - // matrix by interleaving 32- and then 64-bit words, which allows us to - // do XOR in NEON registers. - // - adr x3, CTRINC // ... and ROT8 - ld1 {v30.4s-v31.4s}, [x3] - - // x0..15[0-3] = s0..3[0..3] - mov x4, x0 - ld4r { v0.4s- v3.4s}, [x4], #16 - ld4r { v4.4s- v7.4s}, [x4], #16 - ld4r { v8.4s-v11.4s}, [x4], #16 - ld4r {v12.4s-v15.4s}, [x4] - - // x12 += counter values 0-3 - add v12.4s, v12.4s, v30.4s - - mov x3, #10 - -.Ldoubleround4: - // x0 += x4, x12 = rotl32(x12 ^ x0, 16) - // x1 += x5, x13 = rotl32(x13 ^ x1, 16) - // x2 += x6, x14 = rotl32(x14 ^ x2, 16) - // x3 += x7, x15 = rotl32(x15 ^ x3, 16) - add v0.4s, v0.4s, v4.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - add v3.4s, v3.4s, v7.4s - - eor v12.16b, v12.16b, v0.16b - eor v13.16b, v13.16b, v1.16b - eor v14.16b, v14.16b, v2.16b - eor v15.16b, v15.16b, v3.16b - - rev32 v12.8h, v12.8h - rev32 v13.8h, v13.8h - rev32 v14.8h, v14.8h - rev32 v15.8h, v15.8h - - // x8 += x12, x4 = rotl32(x4 ^ x8, 12) - // x9 += x13, x5 = rotl32(x5 ^ x9, 12) - // x10 += x14, x6 = rotl32(x6 ^ x10, 12) - // x11 += x15, x7 = rotl32(x7 ^ x11, 12) - add v8.4s, v8.4s, v12.4s - add v9.4s, v9.4s, v13.4s - add v10.4s, v10.4s, v14.4s - add v11.4s, v11.4s, v15.4s - - eor v16.16b, v4.16b, v8.16b - eor v17.16b, v5.16b, v9.16b - eor v18.16b, v6.16b, v10.16b - eor v19.16b, v7.16b, v11.16b - - shl v4.4s, v16.4s, #12 - shl v5.4s, v17.4s, #12 - shl v6.4s, v18.4s, #12 - shl v7.4s, v19.4s, #12 - - sri v4.4s, v16.4s, #20 - sri v5.4s, v17.4s, #20 - sri v6.4s, v18.4s, #20 - sri v7.4s, v19.4s, #20 - - // x0 += x4, x12 = rotl32(x12 ^ x0, 8) - // x1 += x5, x13 = rotl32(x13 ^ x1, 8) - // x2 += x6, x14 = rotl32(x14 ^ x2, 8) - // x3 += x7, x15 = rotl32(x15 ^ x3, 8) - add v0.4s, v0.4s, v4.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - add v3.4s, v3.4s, v7.4s - - eor v12.16b, v12.16b, v0.16b - eor v13.16b, v13.16b, v1.16b - eor v14.16b, v14.16b, v2.16b - eor v15.16b, v15.16b, v3.16b - - tbl v12.16b, {v12.16b}, v31.16b - tbl v13.16b, {v13.16b}, v31.16b - tbl v14.16b, {v14.16b}, v31.16b - tbl v15.16b, {v15.16b}, v31.16b - - // x8 += x12, x4 = rotl32(x4 ^ x8, 7) - // x9 += x13, x5 = rotl32(x5 ^ x9, 7) - // x10 += x14, x6 = rotl32(x6 ^ x10, 7) - // x11 += x15, x7 = rotl32(x7 ^ x11, 7) - add v8.4s, v8.4s, v12.4s - add v9.4s, v9.4s, v13.4s - add v10.4s, v10.4s, v14.4s - add v11.4s, v11.4s, v15.4s - - eor v16.16b, v4.16b, v8.16b - eor v17.16b, v5.16b, v9.16b - eor v18.16b, v6.16b, v10.16b - eor v19.16b, v7.16b, v11.16b - - shl v4.4s, v16.4s, #7 - shl v5.4s, v17.4s, #7 - shl v6.4s, v18.4s, #7 - shl v7.4s, v19.4s, #7 - - sri v4.4s, v16.4s, #25 - sri v5.4s, v17.4s, #25 - sri v6.4s, v18.4s, #25 - sri v7.4s, v19.4s, #25 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 16) - // x1 += x6, x12 = rotl32(x12 ^ x1, 16) - // x2 += x7, x13 = rotl32(x13 ^ x2, 16) - // x3 += x4, x14 = rotl32(x14 ^ x3, 16) - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v4.4s - - eor v15.16b, v15.16b, v0.16b - eor v12.16b, v12.16b, v1.16b - eor v13.16b, v13.16b, v2.16b - eor v14.16b, v14.16b, v3.16b - - rev32 v15.8h, v15.8h - rev32 v12.8h, v12.8h - rev32 v13.8h, v13.8h - rev32 v14.8h, v14.8h - - // x10 += x15, x5 = rotl32(x5 ^ x10, 12) - // x11 += x12, x6 = rotl32(x6 ^ x11, 12) - // x8 += x13, x7 = rotl32(x7 ^ x8, 12) - // x9 += x14, x4 = rotl32(x4 ^ x9, 12) - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v12.4s - add v8.4s, v8.4s, v13.4s - add v9.4s, v9.4s, v14.4s - - eor v16.16b, v5.16b, v10.16b - eor v17.16b, v6.16b, v11.16b - eor v18.16b, v7.16b, v8.16b - eor v19.16b, v4.16b, v9.16b - - shl v5.4s, v16.4s, #12 - shl v6.4s, v17.4s, #12 - shl v7.4s, v18.4s, #12 - shl v4.4s, v19.4s, #12 - - sri v5.4s, v16.4s, #20 - sri v6.4s, v17.4s, #20 - sri v7.4s, v18.4s, #20 - sri v4.4s, v19.4s, #20 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 8) - // x1 += x6, x12 = rotl32(x12 ^ x1, 8) - // x2 += x7, x13 = rotl32(x13 ^ x2, 8) - // x3 += x4, x14 = rotl32(x14 ^ x3, 8) - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v4.4s - - eor v15.16b, v15.16b, v0.16b - eor v12.16b, v12.16b, v1.16b - eor v13.16b, v13.16b, v2.16b - eor v14.16b, v14.16b, v3.16b - - tbl v15.16b, {v15.16b}, v31.16b - tbl v12.16b, {v12.16b}, v31.16b - tbl v13.16b, {v13.16b}, v31.16b - tbl v14.16b, {v14.16b}, v31.16b - - // x10 += x15, x5 = rotl32(x5 ^ x10, 7) - // x11 += x12, x6 = rotl32(x6 ^ x11, 7) - // x8 += x13, x7 = rotl32(x7 ^ x8, 7) - // x9 += x14, x4 = rotl32(x4 ^ x9, 7) - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v12.4s - add v8.4s, v8.4s, v13.4s - add v9.4s, v9.4s, v14.4s - - eor v16.16b, v5.16b, v10.16b - eor v17.16b, v6.16b, v11.16b - eor v18.16b, v7.16b, v8.16b - eor v19.16b, v4.16b, v9.16b - - shl v5.4s, v16.4s, #7 - shl v6.4s, v17.4s, #7 - shl v7.4s, v18.4s, #7 - shl v4.4s, v19.4s, #7 - - sri v5.4s, v16.4s, #25 - sri v6.4s, v17.4s, #25 - sri v7.4s, v18.4s, #25 - sri v4.4s, v19.4s, #25 - - subs x3, x3, #1 - b.ne .Ldoubleround4 - - ld4r {v16.4s-v19.4s}, [x0], #16 - ld4r {v20.4s-v23.4s}, [x0], #16 - - // x12 += counter values 0-3 - add v12.4s, v12.4s, v30.4s - - // x0[0-3] += s0[0] - // x1[0-3] += s0[1] - // x2[0-3] += s0[2] - // x3[0-3] += s0[3] - add v0.4s, v0.4s, v16.4s - add v1.4s, v1.4s, v17.4s - add v2.4s, v2.4s, v18.4s - add v3.4s, v3.4s, v19.4s - - ld4r {v24.4s-v27.4s}, [x0], #16 - ld4r {v28.4s-v31.4s}, [x0] - - // x4[0-3] += s1[0] - // x5[0-3] += s1[1] - // x6[0-3] += s1[2] - // x7[0-3] += s1[3] - add v4.4s, v4.4s, v20.4s - add v5.4s, v5.4s, v21.4s - add v6.4s, v6.4s, v22.4s - add v7.4s, v7.4s, v23.4s - - // x8[0-3] += s2[0] - // x9[0-3] += s2[1] - // x10[0-3] += s2[2] - // x11[0-3] += s2[3] - add v8.4s, v8.4s, v24.4s - add v9.4s, v9.4s, v25.4s - add v10.4s, v10.4s, v26.4s - add v11.4s, v11.4s, v27.4s - - // x12[0-3] += s3[0] - // x13[0-3] += s3[1] - // x14[0-3] += s3[2] - // x15[0-3] += s3[3] - add v12.4s, v12.4s, v28.4s - add v13.4s, v13.4s, v29.4s - add v14.4s, v14.4s, v30.4s - add v15.4s, v15.4s, v31.4s - - // interleave 32-bit words in state n, n+1 - zip1 v16.4s, v0.4s, v1.4s - zip2 v17.4s, v0.4s, v1.4s - zip1 v18.4s, v2.4s, v3.4s - zip2 v19.4s, v2.4s, v3.4s - zip1 v20.4s, v4.4s, v5.4s - zip2 v21.4s, v4.4s, v5.4s - zip1 v22.4s, v6.4s, v7.4s - zip2 v23.4s, v6.4s, v7.4s - zip1 v24.4s, v8.4s, v9.4s - zip2 v25.4s, v8.4s, v9.4s - zip1 v26.4s, v10.4s, v11.4s - zip2 v27.4s, v10.4s, v11.4s - zip1 v28.4s, v12.4s, v13.4s - zip2 v29.4s, v12.4s, v13.4s - zip1 v30.4s, v14.4s, v15.4s - zip2 v31.4s, v14.4s, v15.4s - - // interleave 64-bit words in state n, n+2 - zip1 v0.2d, v16.2d, v18.2d - zip2 v4.2d, v16.2d, v18.2d - zip1 v8.2d, v17.2d, v19.2d - zip2 v12.2d, v17.2d, v19.2d - ld1 {v16.16b-v19.16b}, [x2], #64 - - zip1 v1.2d, v20.2d, v22.2d - zip2 v5.2d, v20.2d, v22.2d - zip1 v9.2d, v21.2d, v23.2d - zip2 v13.2d, v21.2d, v23.2d - ld1 {v20.16b-v23.16b}, [x2], #64 - - zip1 v2.2d, v24.2d, v26.2d - zip2 v6.2d, v24.2d, v26.2d - zip1 v10.2d, v25.2d, v27.2d - zip2 v14.2d, v25.2d, v27.2d - ld1 {v24.16b-v27.16b}, [x2], #64 - - zip1 v3.2d, v28.2d, v30.2d - zip2 v7.2d, v28.2d, v30.2d - zip1 v11.2d, v29.2d, v31.2d - zip2 v15.2d, v29.2d, v31.2d - ld1 {v28.16b-v31.16b}, [x2] - - // xor with corresponding input, write to output - eor v16.16b, v16.16b, v0.16b - eor v17.16b, v17.16b, v1.16b - eor v18.16b, v18.16b, v2.16b - eor v19.16b, v19.16b, v3.16b - eor v20.16b, v20.16b, v4.16b - eor v21.16b, v21.16b, v5.16b - st1 {v16.16b-v19.16b}, [x1], #64 - eor v22.16b, v22.16b, v6.16b - eor v23.16b, v23.16b, v7.16b - eor v24.16b, v24.16b, v8.16b - eor v25.16b, v25.16b, v9.16b - st1 {v20.16b-v23.16b}, [x1], #64 - eor v26.16b, v26.16b, v10.16b - eor v27.16b, v27.16b, v11.16b - eor v28.16b, v28.16b, v12.16b - st1 {v24.16b-v27.16b}, [x1], #64 - eor v29.16b, v29.16b, v13.16b - eor v30.16b, v30.16b, v14.16b - eor v31.16b, v31.16b, v15.16b - st1 {v28.16b-v31.16b}, [x1] - - ret -ENDPROC(chacha20_4block_xor_neon) - -CTRINC: .word 0, 1, 2, 3 -ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c deleted file mode 100644 index a5b9cbc0c4de..000000000000 --- a/arch/arm64/crypto/chacha20-neon-glue.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions - * - * Copyright (C) 2016 - 2017 Linaro, Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); -asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); -asmlinkage void hchacha20_block_neon(const u32 *state, u32 *out); - -static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes) -{ - u8 buf[CHACHA_BLOCK_SIZE]; - - while (bytes >= CHACHA_BLOCK_SIZE * 4) { - kernel_neon_begin(); - chacha20_4block_xor_neon(state, dst, src); - kernel_neon_end(); - bytes -= CHACHA_BLOCK_SIZE * 4; - src += CHACHA_BLOCK_SIZE * 4; - dst += CHACHA_BLOCK_SIZE * 4; - state[12] += 4; - } - - if (!bytes) - return; - - kernel_neon_begin(); - while (bytes >= CHACHA_BLOCK_SIZE) { - chacha20_block_xor_neon(state, dst, src); - bytes -= CHACHA_BLOCK_SIZE; - src += CHACHA_BLOCK_SIZE; - dst += CHACHA_BLOCK_SIZE; - state[12]++; - } - if (bytes) { - memcpy(buf, src, bytes); - chacha20_block_xor_neon(state, buf, buf); - memcpy(dst, buf, bytes); - } - kernel_neon_end(); -} - -static int chacha20_neon_stream_xor(struct skcipher_request *req, - struct chacha_ctx *ctx, u8 *iv) -{ - struct skcipher_walk walk; - u32 state[16]; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - crypto_chacha_init(state, ctx, iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr, - nbytes); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - - return err; -} - -static int chacha20_neon(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) - return crypto_chacha_crypt(req); - - return chacha20_neon_stream_xor(req, ctx, req->iv); -} - -static int xchacha20_neon(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - struct chacha_ctx subctx; - u32 state[16]; - u8 real_iv[16]; - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) - return crypto_xchacha_crypt(req); - - crypto_chacha_init(state, ctx, req->iv); - - kernel_neon_begin(); - hchacha20_block_neon(state, subctx.key); - kernel_neon_end(); - - memcpy(&real_iv[0], req->iv + 24, 8); - memcpy(&real_iv[8], req->iv + 16, 8); - return chacha20_neon_stream_xor(req, &subctx, real_iv); -} - -static struct skcipher_alg algs[] = { - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 4 * CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = chacha20_neon, - .decrypt = chacha20_neon, - }, { - .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 4 * CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = xchacha20_neon, - .decrypt = xchacha20_neon, - } -}; - -static int __init chacha20_simd_mod_init(void) -{ - if (!(elf_hwcap & HWCAP_ASIMD)) - return -ENODEV; - - return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); -} - -static void __exit chacha20_simd_mod_fini(void) -{ - crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); -} - -module_init(chacha20_simd_mod_init); -module_exit(chacha20_simd_mod_fini); - -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("chacha20"); -MODULE_ALIAS_CRYPTO("chacha20-neon"); -MODULE_ALIAS_CRYPTO("xchacha20"); -MODULE_ALIAS_CRYPTO("xchacha20-neon"); -- cgit v1.2.3-59-g8ed1b From 19c11c97c39f5c6280b4d523ea170ef9a8f7ed12 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 3 Dec 2018 19:52:52 -0800 Subject: crypto: arm64/chacha - add XChaCha12 support Now that the ARM64 NEON implementation of ChaCha20 and XChaCha20 has been refactored to support varying the number of rounds, add support for XChaCha12. This is identical to XChaCha20 except for the number of rounds, which is 12 instead of 20. This can be used by Adiantum. Reviewed-by: Ard Biesheuvel Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 2 +- arch/arm64/crypto/chacha-neon-glue.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index d54ddb8468ef..d9a523ecdd83 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -101,7 +101,7 @@ config CRYPTO_AES_ARM64_NEON_BLK select CRYPTO_SIMD config CRYPTO_CHACHA20_NEON - tristate "ChaCha20 and XChaCha20 stream ciphers using NEON instructions" + tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions" depends on KERNEL_MODE_NEON select CRYPTO_BLKCIPHER select CRYPTO_CHACHA20 diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c index 4d992029b912..346eb85498a1 100644 --- a/arch/arm64/crypto/chacha-neon-glue.c +++ b/arch/arm64/crypto/chacha-neon-glue.c @@ -161,6 +161,22 @@ static struct skcipher_alg algs[] = { .setkey = crypto_chacha20_setkey, .encrypt = xchacha_neon, .decrypt = xchacha_neon, + }, { + .base.cra_name = "xchacha12", + .base.cra_driver_name = "xchacha12-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha12_setkey, + .encrypt = xchacha_neon, + .decrypt = xchacha_neon, } }; @@ -187,3 +203,5 @@ MODULE_ALIAS_CRYPTO("chacha20"); MODULE_ALIAS_CRYPTO("chacha20-neon"); MODULE_ALIAS_CRYPTO("xchacha20"); MODULE_ALIAS_CRYPTO("xchacha20-neon"); +MODULE_ALIAS_CRYPTO("xchacha12"); +MODULE_ALIAS_CRYPTO("xchacha12-neon"); -- cgit v1.2.3-59-g8ed1b From cf718eaa8f9b2cb8a372dcfd5ef701188e233558 Mon Sep 17 00:00:00 2001 From: "Srikanth, Jampala" Date: Tue, 4 Dec 2018 12:55:54 +0000 Subject: crypto: cavium/nitrox - Enabled Mailbox support Enabled the PF->VF Mailbox support. Mailbox message are interpreted as {type, opcode, data}. Supported message types are REQ, ACK and NACK. Signed-off-by: Srikanth Jampala Signed-off-by: Herbert Xu --- drivers/crypto/cavium/nitrox/Makefile | 3 +- drivers/crypto/cavium/nitrox/nitrox_csr.h | 12 +- drivers/crypto/cavium/nitrox/nitrox_debugfs.h | 22 +++ drivers/crypto/cavium/nitrox/nitrox_dev.h | 61 ++++++-- drivers/crypto/cavium/nitrox/nitrox_hal.c | 114 ++++++++++---- drivers/crypto/cavium/nitrox/nitrox_hal.h | 2 + drivers/crypto/cavium/nitrox/nitrox_isr.c | 8 +- drivers/crypto/cavium/nitrox/nitrox_main.c | 3 +- drivers/crypto/cavium/nitrox/nitrox_mbx.c | 204 ++++++++++++++++++++++++++ drivers/crypto/cavium/nitrox/nitrox_mbx.h | 9 ++ drivers/crypto/cavium/nitrox/nitrox_sriov.c | 57 ++++++- 11 files changed, 441 insertions(+), 54 deletions(-) create mode 100644 drivers/crypto/cavium/nitrox/nitrox_debugfs.h create mode 100644 drivers/crypto/cavium/nitrox/nitrox_mbx.c create mode 100644 drivers/crypto/cavium/nitrox/nitrox_mbx.h diff --git a/drivers/crypto/cavium/nitrox/Makefile b/drivers/crypto/cavium/nitrox/Makefile index e12954791673..ad0546630ad8 100644 --- a/drivers/crypto/cavium/nitrox/Makefile +++ b/drivers/crypto/cavium/nitrox/Makefile @@ -6,7 +6,8 @@ n5pf-objs := nitrox_main.o \ nitrox_lib.o \ nitrox_hal.o \ nitrox_reqmgr.o \ - nitrox_algs.o + nitrox_algs.o \ + nitrox_mbx.o n5pf-$(CONFIG_PCI_IOV) += nitrox_sriov.o n5pf-$(CONFIG_DEBUG_FS) += nitrox_debugfs.o diff --git a/drivers/crypto/cavium/nitrox/nitrox_csr.h b/drivers/crypto/cavium/nitrox/nitrox_csr.h index 1ad27b1a87c5..a2a452642b38 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_csr.h +++ b/drivers/crypto/cavium/nitrox/nitrox_csr.h @@ -54,7 +54,13 @@ #define NPS_STATS_PKT_DMA_WR_CNT 0x1000190 /* NPS packet registers */ -#define NPS_PKT_INT 0x1040018 +#define NPS_PKT_INT 0x1040018 +#define NPS_PKT_MBOX_INT_LO 0x1040020 +#define NPS_PKT_MBOX_INT_LO_ENA_W1C 0x1040030 +#define NPS_PKT_MBOX_INT_LO_ENA_W1S 0x1040038 +#define NPS_PKT_MBOX_INT_HI 0x1040040 +#define NPS_PKT_MBOX_INT_HI_ENA_W1C 0x1040050 +#define NPS_PKT_MBOX_INT_HI_ENA_W1S 0x1040058 #define NPS_PKT_IN_RERR_HI 0x1040108 #define NPS_PKT_IN_RERR_HI_ENA_W1S 0x1040120 #define NPS_PKT_IN_RERR_LO 0x1040128 @@ -74,6 +80,10 @@ #define NPS_PKT_SLC_RERR_LO_ENA_W1S 0x1040240 #define NPS_PKT_SLC_ERR_TYPE 0x1040248 #define NPS_PKT_SLC_ERR_TYPE_ENA_W1S 0x1040260 +/* Mailbox PF->VF PF Accessible Data registers */ +#define NPS_PKT_MBOX_PF_VF_PFDATAX(_i) (0x1040800 + ((_i) * 0x8)) +#define NPS_PKT_MBOX_VF_PF_PFDATAX(_i) (0x1040C00 + ((_i) * 0x8)) + #define NPS_PKT_SLC_CTLX(_i) (0x10000 + ((_i) * 0x40000)) #define NPS_PKT_SLC_CNTSX(_i) (0x10008 + ((_i) * 0x40000)) #define NPS_PKT_SLC_INT_LEVELSX(_i) (0x10010 + ((_i) * 0x40000)) diff --git a/drivers/crypto/cavium/nitrox/nitrox_debugfs.h b/drivers/crypto/cavium/nitrox/nitrox_debugfs.h new file mode 100644 index 000000000000..7b701ea6227a --- /dev/null +++ b/drivers/crypto/cavium/nitrox/nitrox_debugfs.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __NITROX_DEBUGFS_H +#define __NITROX_DEBUGFS_H + +#include "nitrox_dev.h" + +#ifdef CONFIG_DEBUG_FS +int nitrox_debugfs_init(struct nitrox_device *ndev); +void nitrox_debugfs_exit(struct nitrox_device *ndev); +#else +static inline int nitrox_debugfs_init(struct nitrox_device *ndev) +{ + return 0; +} + +static inline int nitrox_sriov_debugfs_init(struct nitrox_device *ndev) +{ + return 0; +} +#endif /* !CONFIG_DEBUG_FS */ + +#endif /* __NITROX_DEBUGFS_H */ diff --git a/drivers/crypto/cavium/nitrox/nitrox_dev.h b/drivers/crypto/cavium/nitrox/nitrox_dev.h index 247df32f687c..0338877b828f 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_dev.h +++ b/drivers/crypto/cavium/nitrox/nitrox_dev.h @@ -8,6 +8,8 @@ #include #define VERSION_LEN 32 +/* Maximum queues in PF mode */ +#define MAX_PF_QUEUES 64 /** * struct nitrox_cmdq - NITROX command queue @@ -103,13 +105,58 @@ struct nitrox_q_vector { }; }; +/** + * mbox_msg - Mailbox message data + * @type: message type + * @opcode: message opcode + * @data: message data + */ +union mbox_msg { + u64 value; + struct { + u64 type: 2; + u64 opcode: 6; + u64 data: 58; + }; + struct { + u64 type: 2; + u64 opcode: 6; + u64 chipid: 8; + u64 vfid: 8; + } id; +}; + +/** + * nitrox_vfdev - NITROX VF device instance in PF + * @state: VF device state + * @vfno: VF number + * @nr_queues: number of queues enabled in VF + * @ring: ring to communicate with VF + * @msg: Mailbox message data from VF + * @mbx_resp: Mailbox counters + */ +struct nitrox_vfdev { + atomic_t state; + int vfno; + int nr_queues; + int ring; + union mbox_msg msg; + atomic64_t mbx_resp; +}; + /** * struct nitrox_iov - SR-IOV information * @num_vfs: number of VF(s) enabled - * @msix: MSI-X for PF in SR-IOV case + * @max_vf_queues: Maximum number of queues allowed for VF + * @vfdev: VF(s) devices + * @pf2vf_wq: workqueue for PF2VF communication + * @msix: MSI-X entry for PF in SR-IOV case */ struct nitrox_iov { int num_vfs; + int max_vf_queues; + struct nitrox_vfdev *vfdev; + struct workqueue_struct *pf2vf_wq; struct msix_entry msix; }; @@ -226,17 +273,9 @@ static inline bool nitrox_ready(struct nitrox_device *ndev) return atomic_read(&ndev->state) == __NDEV_READY; } -#ifdef CONFIG_DEBUG_FS -int nitrox_debugfs_init(struct nitrox_device *ndev); -void nitrox_debugfs_exit(struct nitrox_device *ndev); -#else -static inline int nitrox_debugfs_init(struct nitrox_device *ndev) +static inline bool nitrox_vfdev_ready(struct nitrox_vfdev *vfdev) { - return 0; + return atomic_read(&vfdev->state) == __NDEV_READY; } -static inline void nitrox_debugfs_exit(struct nitrox_device *ndev) -{ } -#endif - #endif /* __NITROX_DEV_H */ diff --git a/drivers/crypto/cavium/nitrox/nitrox_hal.c b/drivers/crypto/cavium/nitrox/nitrox_hal.c index a9b82387cf53..c08d9f33a3b1 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_hal.c +++ b/drivers/crypto/cavium/nitrox/nitrox_hal.c @@ -5,10 +5,11 @@ #include "nitrox_csr.h" #define PLL_REF_CLK 50 +#define MAX_CSR_RETRIES 10 /** * emu_enable_cores - Enable EMU cluster cores. - * @ndev: N5 device + * @ndev: NITROX device */ static void emu_enable_cores(struct nitrox_device *ndev) { @@ -33,7 +34,7 @@ static void emu_enable_cores(struct nitrox_device *ndev) /** * nitrox_config_emu_unit - configure EMU unit. - * @ndev: N5 device + * @ndev: NITROX device */ void nitrox_config_emu_unit(struct nitrox_device *ndev) { @@ -63,29 +64,26 @@ void nitrox_config_emu_unit(struct nitrox_device *ndev) static void reset_pkt_input_ring(struct nitrox_device *ndev, int ring) { union nps_pkt_in_instr_ctl pkt_in_ctl; - union nps_pkt_in_instr_baoff_dbell pkt_in_dbell; union nps_pkt_in_done_cnts pkt_in_cnts; + int max_retries = MAX_CSR_RETRIES; u64 offset; + /* step 1: disable the ring, clear enable bit */ offset = NPS_PKT_IN_INSTR_CTLX(ring); - /* disable the ring */ pkt_in_ctl.value = nitrox_read_csr(ndev, offset); pkt_in_ctl.s.enb = 0; nitrox_write_csr(ndev, offset, pkt_in_ctl.value); - usleep_range(100, 150); - /* wait to clear [ENB] */ + /* step 2: wait to clear [ENB] */ + usleep_range(100, 150); do { pkt_in_ctl.value = nitrox_read_csr(ndev, offset); - } while (pkt_in_ctl.s.enb); - - /* clear off door bell counts */ - offset = NPS_PKT_IN_INSTR_BAOFF_DBELLX(ring); - pkt_in_dbell.value = 0; - pkt_in_dbell.s.dbell = 0xffffffff; - nitrox_write_csr(ndev, offset, pkt_in_dbell.value); + if (!pkt_in_ctl.s.enb) + break; + udelay(50); + } while (max_retries--); - /* clear done counts */ + /* step 3: clear done counts */ offset = NPS_PKT_IN_DONE_CNTSX(ring); pkt_in_cnts.value = nitrox_read_csr(ndev, offset); nitrox_write_csr(ndev, offset, pkt_in_cnts.value); @@ -95,6 +93,7 @@ static void reset_pkt_input_ring(struct nitrox_device *ndev, int ring) void enable_pkt_input_ring(struct nitrox_device *ndev, int ring) { union nps_pkt_in_instr_ctl pkt_in_ctl; + int max_retries = MAX_CSR_RETRIES; u64 offset; /* 64-byte instruction size */ @@ -107,12 +106,15 @@ void enable_pkt_input_ring(struct nitrox_device *ndev, int ring) /* wait for set [ENB] */ do { pkt_in_ctl.value = nitrox_read_csr(ndev, offset); - } while (!pkt_in_ctl.s.enb); + if (pkt_in_ctl.s.enb) + break; + udelay(50); + } while (max_retries--); } /** * nitrox_config_pkt_input_rings - configure Packet Input Rings - * @ndev: N5 device + * @ndev: NITROX device */ void nitrox_config_pkt_input_rings(struct nitrox_device *ndev) { @@ -121,11 +123,14 @@ void nitrox_config_pkt_input_rings(struct nitrox_device *ndev) for (i = 0; i < ndev->nr_queues; i++) { struct nitrox_cmdq *cmdq = &ndev->pkt_inq[i]; union nps_pkt_in_instr_rsize pkt_in_rsize; + union nps_pkt_in_instr_baoff_dbell pkt_in_dbell; u64 offset; reset_pkt_input_ring(ndev, i); - /* configure ring base address 16-byte aligned, + /** + * step 4: + * configure ring base address 16-byte aligned, * size and interrupt threshold. */ offset = NPS_PKT_IN_INSTR_BADDRX(i); @@ -141,6 +146,13 @@ void nitrox_config_pkt_input_rings(struct nitrox_device *ndev) offset = NPS_PKT_IN_INT_LEVELSX(i); nitrox_write_csr(ndev, offset, 0xffffffff); + /* step 5: clear off door bell counts */ + offset = NPS_PKT_IN_INSTR_BAOFF_DBELLX(i); + pkt_in_dbell.value = 0; + pkt_in_dbell.s.dbell = 0xffffffff; + nitrox_write_csr(ndev, offset, pkt_in_dbell.value); + + /* enable the ring */ enable_pkt_input_ring(ndev, i); } } @@ -149,21 +161,26 @@ static void reset_pkt_solicit_port(struct nitrox_device *ndev, int port) { union nps_pkt_slc_ctl pkt_slc_ctl; union nps_pkt_slc_cnts pkt_slc_cnts; + int max_retries = MAX_CSR_RETRIES; u64 offset; - /* disable slc port */ + /* step 1: disable slc port */ offset = NPS_PKT_SLC_CTLX(port); pkt_slc_ctl.value = nitrox_read_csr(ndev, offset); pkt_slc_ctl.s.enb = 0; nitrox_write_csr(ndev, offset, pkt_slc_ctl.value); - usleep_range(100, 150); + /* step 2 */ + usleep_range(100, 150); /* wait to clear [ENB] */ do { pkt_slc_ctl.value = nitrox_read_csr(ndev, offset); - } while (pkt_slc_ctl.s.enb); + if (!pkt_slc_ctl.s.enb) + break; + udelay(50); + } while (max_retries--); - /* clear slc counters */ + /* step 3: clear slc counters */ offset = NPS_PKT_SLC_CNTSX(port); pkt_slc_cnts.value = nitrox_read_csr(ndev, offset); nitrox_write_csr(ndev, offset, pkt_slc_cnts.value); @@ -173,12 +190,12 @@ static void reset_pkt_solicit_port(struct nitrox_device *ndev, int port) void enable_pkt_solicit_port(struct nitrox_device *ndev, int port) { union nps_pkt_slc_ctl pkt_slc_ctl; + int max_retries = MAX_CSR_RETRIES; u64 offset; offset = NPS_PKT_SLC_CTLX(port); pkt_slc_ctl.value = 0; pkt_slc_ctl.s.enb = 1; - /* * 8 trailing 0x00 bytes will be added * to the end of the outgoing packet. @@ -191,23 +208,27 @@ void enable_pkt_solicit_port(struct nitrox_device *ndev, int port) /* wait to set [ENB] */ do { pkt_slc_ctl.value = nitrox_read_csr(ndev, offset); - } while (!pkt_slc_ctl.s.enb); + if (pkt_slc_ctl.s.enb) + break; + udelay(50); + } while (max_retries--); } -static void config_single_pkt_solicit_port(struct nitrox_device *ndev, - int port) +static void config_pkt_solicit_port(struct nitrox_device *ndev, int port) { union nps_pkt_slc_int_levels pkt_slc_int; u64 offset; reset_pkt_solicit_port(ndev, port); + /* step 4: configure interrupt levels */ offset = NPS_PKT_SLC_INT_LEVELSX(port); pkt_slc_int.value = 0; /* time interrupt threshold */ pkt_slc_int.s.timet = 0x3fffff; nitrox_write_csr(ndev, offset, pkt_slc_int.value); + /* enable the solicit port */ enable_pkt_solicit_port(ndev, port); } @@ -216,12 +237,12 @@ void nitrox_config_pkt_solicit_ports(struct nitrox_device *ndev) int i; for (i = 0; i < ndev->nr_queues; i++) - config_single_pkt_solicit_port(ndev, i); + config_pkt_solicit_port(ndev, i); } /** * enable_nps_interrupts - enable NPS interrutps - * @ndev: N5 device. + * @ndev: NITROX device. * * This includes NPS core, packet in and slc interrupts. */ @@ -284,8 +305,8 @@ void nitrox_config_pom_unit(struct nitrox_device *ndev) } /** - * nitrox_config_rand_unit - enable N5 random number unit - * @ndev: N5 device + * nitrox_config_rand_unit - enable NITROX random number unit + * @ndev: NITROX device */ void nitrox_config_rand_unit(struct nitrox_device *ndev) { @@ -361,6 +382,7 @@ void invalidate_lbc(struct nitrox_device *ndev) { union lbc_inval_ctl lbc_ctl; union lbc_inval_status lbc_stat; + int max_retries = MAX_CSR_RETRIES; u64 offset; /* invalidate LBC */ @@ -370,10 +392,12 @@ void invalidate_lbc(struct nitrox_device *ndev) nitrox_write_csr(ndev, offset, lbc_ctl.value); offset = LBC_INVAL_STATUS; - do { lbc_stat.value = nitrox_read_csr(ndev, offset); - } while (!lbc_stat.s.done); + if (lbc_stat.s.done) + break; + udelay(50); + } while (max_retries--); } void nitrox_config_lbc_unit(struct nitrox_device *ndev) @@ -467,3 +491,31 @@ void nitrox_get_hwinfo(struct nitrox_device *ndev) /* copy partname */ strncpy(ndev->hw.partname, name, sizeof(ndev->hw.partname)); } + +void enable_pf2vf_mbox_interrupts(struct nitrox_device *ndev) +{ + u64 value = ~0ULL; + u64 reg_addr; + + /* Mailbox interrupt low enable set register */ + reg_addr = NPS_PKT_MBOX_INT_LO_ENA_W1S; + nitrox_write_csr(ndev, reg_addr, value); + + /* Mailbox interrupt high enable set register */ + reg_addr = NPS_PKT_MBOX_INT_HI_ENA_W1S; + nitrox_write_csr(ndev, reg_addr, value); +} + +void disable_pf2vf_mbox_interrupts(struct nitrox_device *ndev) +{ + u64 value = ~0ULL; + u64 reg_addr; + + /* Mailbox interrupt low enable clear register */ + reg_addr = NPS_PKT_MBOX_INT_LO_ENA_W1C; + nitrox_write_csr(ndev, reg_addr, value); + + /* Mailbox interrupt high enable clear register */ + reg_addr = NPS_PKT_MBOX_INT_HI_ENA_W1C; + nitrox_write_csr(ndev, reg_addr, value); +} diff --git a/drivers/crypto/cavium/nitrox/nitrox_hal.h b/drivers/crypto/cavium/nitrox/nitrox_hal.h index 489ee64c119e..d6606418ba38 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_hal.h +++ b/drivers/crypto/cavium/nitrox/nitrox_hal.h @@ -19,5 +19,7 @@ void enable_pkt_input_ring(struct nitrox_device *ndev, int ring); void enable_pkt_solicit_port(struct nitrox_device *ndev, int port); void config_nps_core_vfcfg_mode(struct nitrox_device *ndev, enum vf_mode mode); void nitrox_get_hwinfo(struct nitrox_device *ndev); +void enable_pf2vf_mbox_interrupts(struct nitrox_device *ndev); +void disable_pf2vf_mbox_interrupts(struct nitrox_device *ndev); #endif /* __NITROX_HAL_H */ diff --git a/drivers/crypto/cavium/nitrox/nitrox_isr.c b/drivers/crypto/cavium/nitrox/nitrox_isr.c index c5b797808680..3dec570a190a 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_isr.c +++ b/drivers/crypto/cavium/nitrox/nitrox_isr.c @@ -7,6 +7,7 @@ #include "nitrox_csr.h" #include "nitrox_common.h" #include "nitrox_hal.h" +#include "nitrox_mbx.h" /** * One vector for each type of ring @@ -220,7 +221,8 @@ static void nps_core_int_tasklet(unsigned long data) */ static irqreturn_t nps_core_int_isr(int irq, void *data) { - struct nitrox_device *ndev = data; + struct nitrox_q_vector *qvec = data; + struct nitrox_device *ndev = qvec->ndev; union nps_core_int_active core_int; core_int.value = nitrox_read_csr(ndev, NPS_CORE_INT_ACTIVE); @@ -246,6 +248,10 @@ static irqreturn_t nps_core_int_isr(int irq, void *data) if (core_int.s.bmi) clear_bmi_err_intr(ndev); + /* Mailbox interrupt */ + if (core_int.s.mbox) + nitrox_pf2vf_mbox_handler(ndev); + /* If more work callback the ISR, set resend */ core_int.s.resend = 1; nitrox_write_csr(ndev, NPS_CORE_INT_ACTIVE, core_int.value); diff --git a/drivers/crypto/cavium/nitrox/nitrox_main.c b/drivers/crypto/cavium/nitrox/nitrox_main.c index 6595c95af9f1..014e9863c20e 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_main.c +++ b/drivers/crypto/cavium/nitrox/nitrox_main.c @@ -1,6 +1,5 @@ #include #include -#include #include #include #include @@ -13,9 +12,9 @@ #include "nitrox_csr.h" #include "nitrox_hal.h" #include "nitrox_isr.h" +#include "nitrox_debugfs.h" #define CNN55XX_DEV_ID 0x12 -#define MAX_PF_QUEUES 64 #define UCODE_HLEN 48 #define SE_GROUP 0 diff --git a/drivers/crypto/cavium/nitrox/nitrox_mbx.c b/drivers/crypto/cavium/nitrox/nitrox_mbx.c new file mode 100644 index 000000000000..02ee95064841 --- /dev/null +++ b/drivers/crypto/cavium/nitrox/nitrox_mbx.c @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "nitrox_csr.h" +#include "nitrox_hal.h" +#include "nitrox_dev.h" + +#define RING_TO_VFNO(_x, _y) ((_x) / (_y)) + +/** + * mbx_msg_type - Mailbox message types + */ +enum mbx_msg_type { + MBX_MSG_TYPE_NOP, + MBX_MSG_TYPE_REQ, + MBX_MSG_TYPE_ACK, + MBX_MSG_TYPE_NACK, +}; + +/** + * mbx_msg_opcode - Mailbox message opcodes + */ +enum mbx_msg_opcode { + MSG_OP_VF_MODE = 1, + MSG_OP_VF_UP, + MSG_OP_VF_DOWN, + MSG_OP_CHIPID_VFID, +}; + +struct pf2vf_work { + struct nitrox_vfdev *vfdev; + struct nitrox_device *ndev; + struct work_struct pf2vf_resp; +}; + +static inline u64 pf2vf_read_mbox(struct nitrox_device *ndev, int ring) +{ + u64 reg_addr; + + reg_addr = NPS_PKT_MBOX_VF_PF_PFDATAX(ring); + return nitrox_read_csr(ndev, reg_addr); +} + +static inline void pf2vf_write_mbox(struct nitrox_device *ndev, u64 value, + int ring) +{ + u64 reg_addr; + + reg_addr = NPS_PKT_MBOX_PF_VF_PFDATAX(ring); + nitrox_write_csr(ndev, reg_addr, value); +} + +static void pf2vf_send_response(struct nitrox_device *ndev, + struct nitrox_vfdev *vfdev) +{ + union mbox_msg msg; + + msg.value = vfdev->msg.value; + + switch (vfdev->msg.opcode) { + case MSG_OP_VF_MODE: + msg.data = ndev->mode; + break; + case MSG_OP_VF_UP: + vfdev->nr_queues = vfdev->msg.data; + atomic_set(&vfdev->state, __NDEV_READY); + break; + case MSG_OP_CHIPID_VFID: + msg.id.chipid = ndev->idx; + msg.id.vfid = vfdev->vfno; + break; + case MSG_OP_VF_DOWN: + vfdev->nr_queues = 0; + atomic_set(&vfdev->state, __NDEV_NOT_READY); + break; + default: + msg.type = MBX_MSG_TYPE_NOP; + break; + } + + if (msg.type == MBX_MSG_TYPE_NOP) + return; + + /* send ACK to VF */ + msg.type = MBX_MSG_TYPE_ACK; + pf2vf_write_mbox(ndev, msg.value, vfdev->ring); + + vfdev->msg.value = 0; + atomic64_inc(&vfdev->mbx_resp); +} + +static void pf2vf_resp_handler(struct work_struct *work) +{ + struct pf2vf_work *pf2vf_resp = container_of(work, struct pf2vf_work, + pf2vf_resp); + struct nitrox_vfdev *vfdev = pf2vf_resp->vfdev; + struct nitrox_device *ndev = pf2vf_resp->ndev; + + switch (vfdev->msg.type) { + case MBX_MSG_TYPE_REQ: + /* process the request from VF */ + pf2vf_send_response(ndev, vfdev); + break; + case MBX_MSG_TYPE_ACK: + case MBX_MSG_TYPE_NACK: + break; + }; + + kfree(pf2vf_resp); +} + +void nitrox_pf2vf_mbox_handler(struct nitrox_device *ndev) +{ + struct nitrox_vfdev *vfdev; + struct pf2vf_work *pfwork; + u64 value, reg_addr; + u32 i; + int vfno; + + /* loop for VF(0..63) */ + reg_addr = NPS_PKT_MBOX_INT_LO; + value = nitrox_read_csr(ndev, reg_addr); + for_each_set_bit(i, (const unsigned long *)&value, BITS_PER_LONG) { + /* get the vfno from ring */ + vfno = RING_TO_VFNO(i, ndev->iov.max_vf_queues); + vfdev = ndev->iov.vfdev + vfno; + vfdev->ring = i; + /* fill the vf mailbox data */ + vfdev->msg.value = pf2vf_read_mbox(ndev, vfdev->ring); + pfwork = kzalloc(sizeof(*pfwork), GFP_ATOMIC); + if (!pfwork) + continue; + + pfwork->vfdev = vfdev; + pfwork->ndev = ndev; + INIT_WORK(&pfwork->pf2vf_resp, pf2vf_resp_handler); + queue_work(ndev->iov.pf2vf_wq, &pfwork->pf2vf_resp); + /* clear the corresponding vf bit */ + nitrox_write_csr(ndev, reg_addr, BIT_ULL(i)); + } + + /* loop for VF(64..127) */ + reg_addr = NPS_PKT_MBOX_INT_HI; + value = nitrox_read_csr(ndev, reg_addr); + for_each_set_bit(i, (const unsigned long *)&value, BITS_PER_LONG) { + /* get the vfno from ring */ + vfno = RING_TO_VFNO(i + 64, ndev->iov.max_vf_queues); + vfdev = ndev->iov.vfdev + vfno; + vfdev->ring = (i + 64); + /* fill the vf mailbox data */ + vfdev->msg.value = pf2vf_read_mbox(ndev, vfdev->ring); + + pfwork = kzalloc(sizeof(*pfwork), GFP_ATOMIC); + if (!pfwork) + continue; + + pfwork->vfdev = vfdev; + pfwork->ndev = ndev; + INIT_WORK(&pfwork->pf2vf_resp, pf2vf_resp_handler); + queue_work(ndev->iov.pf2vf_wq, &pfwork->pf2vf_resp); + /* clear the corresponding vf bit */ + nitrox_write_csr(ndev, reg_addr, BIT_ULL(i)); + } +} + +int nitrox_mbox_init(struct nitrox_device *ndev) +{ + struct nitrox_vfdev *vfdev; + int i; + + ndev->iov.vfdev = kcalloc(ndev->iov.num_vfs, + sizeof(struct nitrox_vfdev), GFP_KERNEL); + if (!ndev->iov.vfdev) + return -ENOMEM; + + for (i = 0; i < ndev->iov.num_vfs; i++) { + vfdev = ndev->iov.vfdev + i; + vfdev->vfno = i; + } + + /* allocate pf2vf response workqueue */ + ndev->iov.pf2vf_wq = alloc_workqueue("nitrox_pf2vf", 0, 0); + if (!ndev->iov.pf2vf_wq) { + kfree(ndev->iov.vfdev); + return -ENOMEM; + } + /* enable pf2vf mailbox interrupts */ + enable_pf2vf_mbox_interrupts(ndev); + + return 0; +} + +void nitrox_mbox_cleanup(struct nitrox_device *ndev) +{ + /* disable pf2vf mailbox interrupts */ + disable_pf2vf_mbox_interrupts(ndev); + /* destroy workqueue */ + if (ndev->iov.pf2vf_wq) + destroy_workqueue(ndev->iov.pf2vf_wq); + + kfree(ndev->iov.vfdev); + ndev->iov.pf2vf_wq = NULL; + ndev->iov.vfdev = NULL; +} diff --git a/drivers/crypto/cavium/nitrox/nitrox_mbx.h b/drivers/crypto/cavium/nitrox/nitrox_mbx.h new file mode 100644 index 000000000000..5008399775a9 --- /dev/null +++ b/drivers/crypto/cavium/nitrox/nitrox_mbx.h @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __NITROX_MBX_H +#define __NITROX_MBX_H + +int nitrox_mbox_init(struct nitrox_device *ndev); +void nitrox_mbox_cleanup(struct nitrox_device *ndev); +void nitrox_pf2vf_mbox_handler(struct nitrox_device *ndev); + +#endif /* __NITROX_MBX_H */ diff --git a/drivers/crypto/cavium/nitrox/nitrox_sriov.c b/drivers/crypto/cavium/nitrox/nitrox_sriov.c index 7ba0cc5d6d02..bf439d8256ba 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_sriov.c +++ b/drivers/crypto/cavium/nitrox/nitrox_sriov.c @@ -6,6 +6,7 @@ #include "nitrox_hal.h" #include "nitrox_common.h" #include "nitrox_isr.h" +#include "nitrox_mbx.h" /** * num_vfs_valid - validate VF count @@ -52,6 +53,31 @@ static inline enum vf_mode num_vfs_to_mode(int num_vfs) return mode; } +static inline int vf_mode_to_nr_queues(enum vf_mode mode) +{ + int nr_queues = 0; + + switch (mode) { + case __NDEV_MODE_PF: + nr_queues = MAX_PF_QUEUES; + break; + case __NDEV_MODE_VF16: + nr_queues = 8; + break; + case __NDEV_MODE_VF32: + nr_queues = 4; + break; + case __NDEV_MODE_VF64: + nr_queues = 2; + break; + case __NDEV_MODE_VF128: + nr_queues = 1; + break; + } + + return nr_queues; +} + static void nitrox_pf_cleanup(struct nitrox_device *ndev) { /* PF has no queues in SR-IOV mode */ @@ -94,16 +120,31 @@ static int nitrox_pf_reinit(struct nitrox_device *ndev) return nitrox_crypto_register(); } -static int nitrox_sriov_init(struct nitrox_device *ndev) -{ - /* register interrupts for PF in SR-IOV */ - return nitrox_sriov_register_interupts(ndev); -} - static void nitrox_sriov_cleanup(struct nitrox_device *ndev) { /* unregister interrupts for PF in SR-IOV */ nitrox_sriov_unregister_interrupts(ndev); + nitrox_mbox_cleanup(ndev); +} + +static int nitrox_sriov_init(struct nitrox_device *ndev) +{ + int ret; + + /* register interrupts for PF in SR-IOV */ + ret = nitrox_sriov_register_interupts(ndev); + if (ret) + return ret; + + ret = nitrox_mbox_init(ndev); + if (ret) + goto sriov_init_fail; + + return 0; + +sriov_init_fail: + nitrox_sriov_cleanup(ndev); + return ret; } static int nitrox_sriov_enable(struct pci_dev *pdev, int num_vfs) @@ -126,8 +167,9 @@ static int nitrox_sriov_enable(struct pci_dev *pdev, int num_vfs) } dev_info(DEV(ndev), "Enabled VF(s) %d\n", num_vfs); - ndev->iov.num_vfs = num_vfs; ndev->mode = num_vfs_to_mode(num_vfs); + ndev->iov.num_vfs = num_vfs; + ndev->iov.max_vf_queues = vf_mode_to_nr_queues(ndev->mode); /* set bit in flags */ set_bit(__NDEV_SRIOV_BIT, &ndev->flags); @@ -169,6 +211,7 @@ static int nitrox_sriov_disable(struct pci_dev *pdev) clear_bit(__NDEV_SRIOV_BIT, &ndev->flags); ndev->iov.num_vfs = 0; + ndev->iov.max_vf_queues = 0; ndev->mode = __NDEV_MODE_PF; /* cleanup PF SR-IOV resources */ -- cgit v1.2.3-59-g8ed1b From ee5bbc9fd3a1fb81e9f6103d6c52ab88926a9603 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 4 Dec 2018 14:13:31 +0100 Subject: crypto: tcrypt - add block size of 1472 to skcipher template In order to have better coverage of algorithms operating on block sizes that are in the ballpark of a VPN packet, add 1472 to the block_sizes array. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 0590a9204562..e7fb87e114a5 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -81,7 +81,7 @@ static char *check[] = { NULL }; -static u32 block_sizes[] = { 16, 64, 256, 1024, 8192, 0 }; +static u32 block_sizes[] = { 16, 64, 256, 1024, 1472, 8192, 0 }; static u32 aead_sizes[] = { 16, 64, 256, 512, 1024, 2048, 4096, 8192, 0 }; #define XBUFSIZE 8 -- cgit v1.2.3-59-g8ed1b From f2ca1cbd0fb584b5b5e0dbd9bda819f49cf9cdb6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 4 Dec 2018 14:13:32 +0100 Subject: crypto: arm64/chacha - optimize for arbitrary length inputs Update the 4-way NEON ChaCha routine so it can handle input of any length >64 bytes in its entirety, rather than having to call into the 1-way routine and/or memcpy()s via temp buffers to handle the tail of a ChaCha invocation that is not a multiple of 256 bytes. On inputs that are a multiple of 256 bytes (and thus in tcrypt benchmarks), performance drops by around 1% on Cortex-A57, while performance for inputs drawn randomly from the range [64, 1024) increases by around 30%. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/chacha-neon-core.S | 183 ++++++++++++++++++++++++++++++++--- arch/arm64/crypto/chacha-neon-glue.c | 38 +++----- 2 files changed, 184 insertions(+), 37 deletions(-) diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S index 3d3a12db5204..8f9c2e83f6f0 100644 --- a/arch/arm64/crypto/chacha-neon-core.S +++ b/arch/arm64/crypto/chacha-neon-core.S @@ -19,6 +19,8 @@ */ #include +#include +#include .text .align 6 @@ -36,7 +38,7 @@ */ chacha_permute: - adr x10, ROT8 + adr_l x10, ROT8 ld1 {v12.4s}, [x10] .Ldoubleround: @@ -169,6 +171,12 @@ ENTRY(chacha_4block_xor_neon) // x1: 4 data blocks output, o // x2: 4 data blocks input, i // w3: nrounds + // x4: byte count + + adr_l x10, .Lpermute + and x5, x4, #63 + add x10, x10, x5 + add x11, x10, #64 // // This function encrypts four consecutive ChaCha blocks by loading @@ -178,15 +186,15 @@ ENTRY(chacha_4block_xor_neon) // matrix by interleaving 32- and then 64-bit words, which allows us to // do XOR in NEON registers. // - adr x9, CTRINC // ... and ROT8 + adr_l x9, CTRINC // ... and ROT8 ld1 {v30.4s-v31.4s}, [x9] // x0..15[0-3] = s0..3[0..3] - mov x4, x0 - ld4r { v0.4s- v3.4s}, [x4], #16 - ld4r { v4.4s- v7.4s}, [x4], #16 - ld4r { v8.4s-v11.4s}, [x4], #16 - ld4r {v12.4s-v15.4s}, [x4] + add x8, x0, #16 + ld4r { v0.4s- v3.4s}, [x0] + ld4r { v4.4s- v7.4s}, [x8], #16 + ld4r { v8.4s-v11.4s}, [x8], #16 + ld4r {v12.4s-v15.4s}, [x8] // x12 += counter values 0-3 add v12.4s, v12.4s, v30.4s @@ -430,24 +438,47 @@ ENTRY(chacha_4block_xor_neon) zip1 v30.4s, v14.4s, v15.4s zip2 v31.4s, v14.4s, v15.4s + mov x3, #64 + subs x5, x4, #64 + add x6, x5, x2 + csel x3, x3, xzr, ge + csel x2, x2, x6, ge + // interleave 64-bit words in state n, n+2 zip1 v0.2d, v16.2d, v18.2d zip2 v4.2d, v16.2d, v18.2d zip1 v8.2d, v17.2d, v19.2d zip2 v12.2d, v17.2d, v19.2d - ld1 {v16.16b-v19.16b}, [x2], #64 + ld1 {v16.16b-v19.16b}, [x2], x3 + + subs x6, x4, #128 + ccmp x3, xzr, #4, lt + add x7, x6, x2 + csel x3, x3, xzr, eq + csel x2, x2, x7, eq zip1 v1.2d, v20.2d, v22.2d zip2 v5.2d, v20.2d, v22.2d zip1 v9.2d, v21.2d, v23.2d zip2 v13.2d, v21.2d, v23.2d - ld1 {v20.16b-v23.16b}, [x2], #64 + ld1 {v20.16b-v23.16b}, [x2], x3 + + subs x7, x4, #192 + ccmp x3, xzr, #4, lt + add x8, x7, x2 + csel x3, x3, xzr, eq + csel x2, x2, x8, eq zip1 v2.2d, v24.2d, v26.2d zip2 v6.2d, v24.2d, v26.2d zip1 v10.2d, v25.2d, v27.2d zip2 v14.2d, v25.2d, v27.2d - ld1 {v24.16b-v27.16b}, [x2], #64 + ld1 {v24.16b-v27.16b}, [x2], x3 + + subs x8, x4, #256 + ccmp x3, xzr, #4, lt + add x9, x8, x2 + csel x2, x2, x9, eq zip1 v3.2d, v28.2d, v30.2d zip2 v7.2d, v28.2d, v30.2d @@ -456,29 +487,155 @@ ENTRY(chacha_4block_xor_neon) ld1 {v28.16b-v31.16b}, [x2] // xor with corresponding input, write to output + tbnz x5, #63, 0f eor v16.16b, v16.16b, v0.16b eor v17.16b, v17.16b, v1.16b eor v18.16b, v18.16b, v2.16b eor v19.16b, v19.16b, v3.16b + st1 {v16.16b-v19.16b}, [x1], #64 + + tbnz x6, #63, 1f eor v20.16b, v20.16b, v4.16b eor v21.16b, v21.16b, v5.16b - st1 {v16.16b-v19.16b}, [x1], #64 eor v22.16b, v22.16b, v6.16b eor v23.16b, v23.16b, v7.16b + st1 {v20.16b-v23.16b}, [x1], #64 + + tbnz x7, #63, 2f eor v24.16b, v24.16b, v8.16b eor v25.16b, v25.16b, v9.16b - st1 {v20.16b-v23.16b}, [x1], #64 eor v26.16b, v26.16b, v10.16b eor v27.16b, v27.16b, v11.16b - eor v28.16b, v28.16b, v12.16b st1 {v24.16b-v27.16b}, [x1], #64 + + tbnz x8, #63, 3f + eor v28.16b, v28.16b, v12.16b eor v29.16b, v29.16b, v13.16b eor v30.16b, v30.16b, v14.16b eor v31.16b, v31.16b, v15.16b st1 {v28.16b-v31.16b}, [x1] ret + + // fewer than 64 bytes of in/output +0: ld1 {v8.16b}, [x10] + ld1 {v9.16b}, [x11] + movi v10.16b, #16 + sub x2, x1, #64 + add x1, x1, x5 + ld1 {v16.16b-v19.16b}, [x2] + tbl v4.16b, {v0.16b-v3.16b}, v8.16b + tbx v20.16b, {v16.16b-v19.16b}, v9.16b + add v8.16b, v8.16b, v10.16b + add v9.16b, v9.16b, v10.16b + tbl v5.16b, {v0.16b-v3.16b}, v8.16b + tbx v21.16b, {v16.16b-v19.16b}, v9.16b + add v8.16b, v8.16b, v10.16b + add v9.16b, v9.16b, v10.16b + tbl v6.16b, {v0.16b-v3.16b}, v8.16b + tbx v22.16b, {v16.16b-v19.16b}, v9.16b + add v8.16b, v8.16b, v10.16b + add v9.16b, v9.16b, v10.16b + tbl v7.16b, {v0.16b-v3.16b}, v8.16b + tbx v23.16b, {v16.16b-v19.16b}, v9.16b + + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v6.16b + eor v23.16b, v23.16b, v7.16b + st1 {v20.16b-v23.16b}, [x1] + ret + + // fewer than 128 bytes of in/output +1: ld1 {v8.16b}, [x10] + ld1 {v9.16b}, [x11] + movi v10.16b, #16 + add x1, x1, x6 + tbl v0.16b, {v4.16b-v7.16b}, v8.16b + tbx v20.16b, {v16.16b-v19.16b}, v9.16b + add v8.16b, v8.16b, v10.16b + add v9.16b, v9.16b, v10.16b + tbl v1.16b, {v4.16b-v7.16b}, v8.16b + tbx v21.16b, {v16.16b-v19.16b}, v9.16b + add v8.16b, v8.16b, v10.16b + add v9.16b, v9.16b, v10.16b + tbl v2.16b, {v4.16b-v7.16b}, v8.16b + tbx v22.16b, {v16.16b-v19.16b}, v9.16b + add v8.16b, v8.16b, v10.16b + add v9.16b, v9.16b, v10.16b + tbl v3.16b, {v4.16b-v7.16b}, v8.16b + tbx v23.16b, {v16.16b-v19.16b}, v9.16b + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v1.16b + eor v22.16b, v22.16b, v2.16b + eor v23.16b, v23.16b, v3.16b + st1 {v20.16b-v23.16b}, [x1] + ret + + // fewer than 192 bytes of in/output +2: ld1 {v4.16b}, [x10] + ld1 {v5.16b}, [x11] + movi v6.16b, #16 + add x1, x1, x7 + tbl v0.16b, {v8.16b-v11.16b}, v4.16b + tbx v24.16b, {v20.16b-v23.16b}, v5.16b + add v4.16b, v4.16b, v6.16b + add v5.16b, v5.16b, v6.16b + tbl v1.16b, {v8.16b-v11.16b}, v4.16b + tbx v25.16b, {v20.16b-v23.16b}, v5.16b + add v4.16b, v4.16b, v6.16b + add v5.16b, v5.16b, v6.16b + tbl v2.16b, {v8.16b-v11.16b}, v4.16b + tbx v26.16b, {v20.16b-v23.16b}, v5.16b + add v4.16b, v4.16b, v6.16b + add v5.16b, v5.16b, v6.16b + tbl v3.16b, {v8.16b-v11.16b}, v4.16b + tbx v27.16b, {v20.16b-v23.16b}, v5.16b + + eor v24.16b, v24.16b, v0.16b + eor v25.16b, v25.16b, v1.16b + eor v26.16b, v26.16b, v2.16b + eor v27.16b, v27.16b, v3.16b + st1 {v24.16b-v27.16b}, [x1] + ret + + // fewer than 256 bytes of in/output +3: ld1 {v4.16b}, [x10] + ld1 {v5.16b}, [x11] + movi v6.16b, #16 + add x1, x1, x8 + tbl v0.16b, {v12.16b-v15.16b}, v4.16b + tbx v28.16b, {v24.16b-v27.16b}, v5.16b + add v4.16b, v4.16b, v6.16b + add v5.16b, v5.16b, v6.16b + tbl v1.16b, {v12.16b-v15.16b}, v4.16b + tbx v29.16b, {v24.16b-v27.16b}, v5.16b + add v4.16b, v4.16b, v6.16b + add v5.16b, v5.16b, v6.16b + tbl v2.16b, {v12.16b-v15.16b}, v4.16b + tbx v30.16b, {v24.16b-v27.16b}, v5.16b + add v4.16b, v4.16b, v6.16b + add v5.16b, v5.16b, v6.16b + tbl v3.16b, {v12.16b-v15.16b}, v4.16b + tbx v31.16b, {v24.16b-v27.16b}, v5.16b + + eor v28.16b, v28.16b, v0.16b + eor v29.16b, v29.16b, v1.16b + eor v30.16b, v30.16b, v2.16b + eor v31.16b, v31.16b, v3.16b + st1 {v28.16b-v31.16b}, [x1] + ret ENDPROC(chacha_4block_xor_neon) + .section ".rodata", "a", %progbits + .align L1_CACHE_SHIFT +.Lpermute: + .set .Li, 0 + .rept 192 + .byte (.Li - 64) + .set .Li, .Li + 1 + .endr + CTRINC: .word 0, 1, 2, 3 ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c index 346eb85498a1..67f8feb0c717 100644 --- a/arch/arm64/crypto/chacha-neon-glue.c +++ b/arch/arm64/crypto/chacha-neon-glue.c @@ -32,41 +32,29 @@ asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src, int nrounds); asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src, - int nrounds); + int nrounds, int bytes); asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) + int bytes, int nrounds) { u8 buf[CHACHA_BLOCK_SIZE]; - while (bytes >= CHACHA_BLOCK_SIZE * 4) { - kernel_neon_begin(); - chacha_4block_xor_neon(state, dst, src, nrounds); - kernel_neon_end(); + if (bytes < CHACHA_BLOCK_SIZE) { + memcpy(buf, src, bytes); + chacha_block_xor_neon(state, buf, buf, nrounds); + memcpy(dst, buf, bytes); + return; + } + + while (bytes > 0) { + chacha_4block_xor_neon(state, dst, src, nrounds, + min(bytes, CHACHA_BLOCK_SIZE * 4)); bytes -= CHACHA_BLOCK_SIZE * 4; src += CHACHA_BLOCK_SIZE * 4; dst += CHACHA_BLOCK_SIZE * 4; state[12] += 4; } - - if (!bytes) - return; - - kernel_neon_begin(); - while (bytes >= CHACHA_BLOCK_SIZE) { - chacha_block_xor_neon(state, dst, src, nrounds); - bytes -= CHACHA_BLOCK_SIZE; - src += CHACHA_BLOCK_SIZE; - dst += CHACHA_BLOCK_SIZE; - state[12]++; - } - if (bytes) { - memcpy(buf, src, bytes); - chacha_block_xor_neon(state, buf, buf, nrounds); - memcpy(dst, buf, bytes); - } - kernel_neon_end(); } static int chacha_neon_stream_xor(struct skcipher_request *req, @@ -86,8 +74,10 @@ static int chacha_neon_stream_xor(struct skcipher_request *req, if (nbytes < walk.total) nbytes = round_down(nbytes, walk.stride); + kernel_neon_begin(); chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr, nbytes, ctx->nrounds); + kernel_neon_end(); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } -- cgit v1.2.3-59-g8ed1b From 2fe55987b2624a86a5c709a8df65d4de2608dc07 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 4 Dec 2018 14:13:33 +0100 Subject: crypto: arm64/chacha - use combined SIMD/ALU routine for more speed To some degree, most known AArch64 micro-architectures appear to be able to issue ALU instructions in parellel to SIMD instructions without affecting the SIMD throughput. This means we can use the ALU to process a fifth ChaCha block while the SIMD is processing four blocks in parallel. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/chacha-neon-core.S | 235 ++++++++++++++++++++++++++++++++--- arch/arm64/crypto/chacha-neon-glue.c | 39 +++--- 2 files changed, 239 insertions(+), 35 deletions(-) diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S index 8f9c2e83f6f0..021bb9e9784b 100644 --- a/arch/arm64/crypto/chacha-neon-core.S +++ b/arch/arm64/crypto/chacha-neon-core.S @@ -1,13 +1,13 @@ /* * ChaCha/XChaCha NEON helper functions * - * Copyright (C) 2016 Linaro, Ltd. + * Copyright (C) 2016-2018 Linaro, Ltd. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * - * Based on: + * Originally based on: * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions * * Copyright (C) 2015 Martin Willi @@ -165,8 +165,27 @@ ENTRY(hchacha_block_neon) ret ENDPROC(hchacha_block_neon) + a0 .req w12 + a1 .req w13 + a2 .req w14 + a3 .req w15 + a4 .req w16 + a5 .req w17 + a6 .req w19 + a7 .req w20 + a8 .req w21 + a9 .req w22 + a10 .req w23 + a11 .req w24 + a12 .req w25 + a13 .req w26 + a14 .req w27 + a15 .req w28 + .align 6 ENTRY(chacha_4block_xor_neon) + frame_push 10 + // x0: Input state matrix, s // x1: 4 data blocks output, o // x2: 4 data blocks input, i @@ -186,6 +205,9 @@ ENTRY(chacha_4block_xor_neon) // matrix by interleaving 32- and then 64-bit words, which allows us to // do XOR in NEON registers. // + // At the same time, a fifth block is encrypted in parallel using + // scalar registers + // adr_l x9, CTRINC // ... and ROT8 ld1 {v30.4s-v31.4s}, [x9] @@ -196,7 +218,24 @@ ENTRY(chacha_4block_xor_neon) ld4r { v8.4s-v11.4s}, [x8], #16 ld4r {v12.4s-v15.4s}, [x8] - // x12 += counter values 0-3 + mov a0, v0.s[0] + mov a1, v1.s[0] + mov a2, v2.s[0] + mov a3, v3.s[0] + mov a4, v4.s[0] + mov a5, v5.s[0] + mov a6, v6.s[0] + mov a7, v7.s[0] + mov a8, v8.s[0] + mov a9, v9.s[0] + mov a10, v10.s[0] + mov a11, v11.s[0] + mov a12, v12.s[0] + mov a13, v13.s[0] + mov a14, v14.s[0] + mov a15, v15.s[0] + + // x12 += counter values 1-4 add v12.4s, v12.4s, v30.4s .Ldoubleround4: @@ -205,33 +244,53 @@ ENTRY(chacha_4block_xor_neon) // x2 += x6, x14 = rotl32(x14 ^ x2, 16) // x3 += x7, x15 = rotl32(x15 ^ x3, 16) add v0.4s, v0.4s, v4.4s + add a0, a0, a4 add v1.4s, v1.4s, v5.4s + add a1, a1, a5 add v2.4s, v2.4s, v6.4s + add a2, a2, a6 add v3.4s, v3.4s, v7.4s + add a3, a3, a7 eor v12.16b, v12.16b, v0.16b + eor a12, a12, a0 eor v13.16b, v13.16b, v1.16b + eor a13, a13, a1 eor v14.16b, v14.16b, v2.16b + eor a14, a14, a2 eor v15.16b, v15.16b, v3.16b + eor a15, a15, a3 rev32 v12.8h, v12.8h + ror a12, a12, #16 rev32 v13.8h, v13.8h + ror a13, a13, #16 rev32 v14.8h, v14.8h + ror a14, a14, #16 rev32 v15.8h, v15.8h + ror a15, a15, #16 // x8 += x12, x4 = rotl32(x4 ^ x8, 12) // x9 += x13, x5 = rotl32(x5 ^ x9, 12) // x10 += x14, x6 = rotl32(x6 ^ x10, 12) // x11 += x15, x7 = rotl32(x7 ^ x11, 12) add v8.4s, v8.4s, v12.4s + add a8, a8, a12 add v9.4s, v9.4s, v13.4s + add a9, a9, a13 add v10.4s, v10.4s, v14.4s + add a10, a10, a14 add v11.4s, v11.4s, v15.4s + add a11, a11, a15 eor v16.16b, v4.16b, v8.16b + eor a4, a4, a8 eor v17.16b, v5.16b, v9.16b + eor a5, a5, a9 eor v18.16b, v6.16b, v10.16b + eor a6, a6, a10 eor v19.16b, v7.16b, v11.16b + eor a7, a7, a11 shl v4.4s, v16.4s, #12 shl v5.4s, v17.4s, #12 @@ -239,42 +298,66 @@ ENTRY(chacha_4block_xor_neon) shl v7.4s, v19.4s, #12 sri v4.4s, v16.4s, #20 + ror a4, a4, #20 sri v5.4s, v17.4s, #20 + ror a5, a5, #20 sri v6.4s, v18.4s, #20 + ror a6, a6, #20 sri v7.4s, v19.4s, #20 + ror a7, a7, #20 // x0 += x4, x12 = rotl32(x12 ^ x0, 8) // x1 += x5, x13 = rotl32(x13 ^ x1, 8) // x2 += x6, x14 = rotl32(x14 ^ x2, 8) // x3 += x7, x15 = rotl32(x15 ^ x3, 8) add v0.4s, v0.4s, v4.4s + add a0, a0, a4 add v1.4s, v1.4s, v5.4s + add a1, a1, a5 add v2.4s, v2.4s, v6.4s + add a2, a2, a6 add v3.4s, v3.4s, v7.4s + add a3, a3, a7 eor v12.16b, v12.16b, v0.16b + eor a12, a12, a0 eor v13.16b, v13.16b, v1.16b + eor a13, a13, a1 eor v14.16b, v14.16b, v2.16b + eor a14, a14, a2 eor v15.16b, v15.16b, v3.16b + eor a15, a15, a3 tbl v12.16b, {v12.16b}, v31.16b + ror a12, a12, #24 tbl v13.16b, {v13.16b}, v31.16b + ror a13, a13, #24 tbl v14.16b, {v14.16b}, v31.16b + ror a14, a14, #24 tbl v15.16b, {v15.16b}, v31.16b + ror a15, a15, #24 // x8 += x12, x4 = rotl32(x4 ^ x8, 7) // x9 += x13, x5 = rotl32(x5 ^ x9, 7) // x10 += x14, x6 = rotl32(x6 ^ x10, 7) // x11 += x15, x7 = rotl32(x7 ^ x11, 7) add v8.4s, v8.4s, v12.4s + add a8, a8, a12 add v9.4s, v9.4s, v13.4s + add a9, a9, a13 add v10.4s, v10.4s, v14.4s + add a10, a10, a14 add v11.4s, v11.4s, v15.4s + add a11, a11, a15 eor v16.16b, v4.16b, v8.16b + eor a4, a4, a8 eor v17.16b, v5.16b, v9.16b + eor a5, a5, a9 eor v18.16b, v6.16b, v10.16b + eor a6, a6, a10 eor v19.16b, v7.16b, v11.16b + eor a7, a7, a11 shl v4.4s, v16.4s, #7 shl v5.4s, v17.4s, #7 @@ -282,42 +365,66 @@ ENTRY(chacha_4block_xor_neon) shl v7.4s, v19.4s, #7 sri v4.4s, v16.4s, #25 + ror a4, a4, #25 sri v5.4s, v17.4s, #25 + ror a5, a5, #25 sri v6.4s, v18.4s, #25 + ror a6, a6, #25 sri v7.4s, v19.4s, #25 + ror a7, a7, #25 // x0 += x5, x15 = rotl32(x15 ^ x0, 16) // x1 += x6, x12 = rotl32(x12 ^ x1, 16) // x2 += x7, x13 = rotl32(x13 ^ x2, 16) // x3 += x4, x14 = rotl32(x14 ^ x3, 16) add v0.4s, v0.4s, v5.4s + add a0, a0, a5 add v1.4s, v1.4s, v6.4s + add a1, a1, a6 add v2.4s, v2.4s, v7.4s + add a2, a2, a7 add v3.4s, v3.4s, v4.4s + add a3, a3, a4 eor v15.16b, v15.16b, v0.16b + eor a15, a15, a0 eor v12.16b, v12.16b, v1.16b + eor a12, a12, a1 eor v13.16b, v13.16b, v2.16b + eor a13, a13, a2 eor v14.16b, v14.16b, v3.16b + eor a14, a14, a3 rev32 v15.8h, v15.8h + ror a15, a15, #16 rev32 v12.8h, v12.8h + ror a12, a12, #16 rev32 v13.8h, v13.8h + ror a13, a13, #16 rev32 v14.8h, v14.8h + ror a14, a14, #16 // x10 += x15, x5 = rotl32(x5 ^ x10, 12) // x11 += x12, x6 = rotl32(x6 ^ x11, 12) // x8 += x13, x7 = rotl32(x7 ^ x8, 12) // x9 += x14, x4 = rotl32(x4 ^ x9, 12) add v10.4s, v10.4s, v15.4s + add a10, a10, a15 add v11.4s, v11.4s, v12.4s + add a11, a11, a12 add v8.4s, v8.4s, v13.4s + add a8, a8, a13 add v9.4s, v9.4s, v14.4s + add a9, a9, a14 eor v16.16b, v5.16b, v10.16b + eor a5, a5, a10 eor v17.16b, v6.16b, v11.16b + eor a6, a6, a11 eor v18.16b, v7.16b, v8.16b + eor a7, a7, a8 eor v19.16b, v4.16b, v9.16b + eor a4, a4, a9 shl v5.4s, v16.4s, #12 shl v6.4s, v17.4s, #12 @@ -325,42 +432,66 @@ ENTRY(chacha_4block_xor_neon) shl v4.4s, v19.4s, #12 sri v5.4s, v16.4s, #20 + ror a5, a5, #20 sri v6.4s, v17.4s, #20 + ror a6, a6, #20 sri v7.4s, v18.4s, #20 + ror a7, a7, #20 sri v4.4s, v19.4s, #20 + ror a4, a4, #20 // x0 += x5, x15 = rotl32(x15 ^ x0, 8) // x1 += x6, x12 = rotl32(x12 ^ x1, 8) // x2 += x7, x13 = rotl32(x13 ^ x2, 8) // x3 += x4, x14 = rotl32(x14 ^ x3, 8) add v0.4s, v0.4s, v5.4s + add a0, a0, a5 add v1.4s, v1.4s, v6.4s + add a1, a1, a6 add v2.4s, v2.4s, v7.4s + add a2, a2, a7 add v3.4s, v3.4s, v4.4s + add a3, a3, a4 eor v15.16b, v15.16b, v0.16b + eor a15, a15, a0 eor v12.16b, v12.16b, v1.16b + eor a12, a12, a1 eor v13.16b, v13.16b, v2.16b + eor a13, a13, a2 eor v14.16b, v14.16b, v3.16b + eor a14, a14, a3 tbl v15.16b, {v15.16b}, v31.16b + ror a15, a15, #24 tbl v12.16b, {v12.16b}, v31.16b + ror a12, a12, #24 tbl v13.16b, {v13.16b}, v31.16b + ror a13, a13, #24 tbl v14.16b, {v14.16b}, v31.16b + ror a14, a14, #24 // x10 += x15, x5 = rotl32(x5 ^ x10, 7) // x11 += x12, x6 = rotl32(x6 ^ x11, 7) // x8 += x13, x7 = rotl32(x7 ^ x8, 7) // x9 += x14, x4 = rotl32(x4 ^ x9, 7) add v10.4s, v10.4s, v15.4s + add a10, a10, a15 add v11.4s, v11.4s, v12.4s + add a11, a11, a12 add v8.4s, v8.4s, v13.4s + add a8, a8, a13 add v9.4s, v9.4s, v14.4s + add a9, a9, a14 eor v16.16b, v5.16b, v10.16b + eor a5, a5, a10 eor v17.16b, v6.16b, v11.16b + eor a6, a6, a11 eor v18.16b, v7.16b, v8.16b + eor a7, a7, a8 eor v19.16b, v4.16b, v9.16b + eor a4, a4, a9 shl v5.4s, v16.4s, #7 shl v6.4s, v17.4s, #7 @@ -368,9 +499,13 @@ ENTRY(chacha_4block_xor_neon) shl v4.4s, v19.4s, #7 sri v5.4s, v16.4s, #25 + ror a5, a5, #25 sri v6.4s, v17.4s, #25 + ror a6, a6, #25 sri v7.4s, v18.4s, #25 + ror a7, a7, #25 sri v4.4s, v19.4s, #25 + ror a4, a4, #25 subs w3, w3, #2 b.ne .Ldoubleround4 @@ -386,9 +521,17 @@ ENTRY(chacha_4block_xor_neon) // x2[0-3] += s0[2] // x3[0-3] += s0[3] add v0.4s, v0.4s, v16.4s + mov w6, v16.s[0] + mov w7, v17.s[0] add v1.4s, v1.4s, v17.4s + mov w8, v18.s[0] + mov w9, v19.s[0] add v2.4s, v2.4s, v18.4s + add a0, a0, w6 + add a1, a1, w7 add v3.4s, v3.4s, v19.4s + add a2, a2, w8 + add a3, a3, w9 ld4r {v24.4s-v27.4s}, [x0], #16 ld4r {v28.4s-v31.4s}, [x0] @@ -398,48 +541,96 @@ ENTRY(chacha_4block_xor_neon) // x6[0-3] += s1[2] // x7[0-3] += s1[3] add v4.4s, v4.4s, v20.4s + mov w6, v20.s[0] + mov w7, v21.s[0] add v5.4s, v5.4s, v21.4s + mov w8, v22.s[0] + mov w9, v23.s[0] add v6.4s, v6.4s, v22.4s + add a4, a4, w6 + add a5, a5, w7 add v7.4s, v7.4s, v23.4s + add a6, a6, w8 + add a7, a7, w9 // x8[0-3] += s2[0] // x9[0-3] += s2[1] // x10[0-3] += s2[2] // x11[0-3] += s2[3] add v8.4s, v8.4s, v24.4s + mov w6, v24.s[0] + mov w7, v25.s[0] add v9.4s, v9.4s, v25.4s + mov w8, v26.s[0] + mov w9, v27.s[0] add v10.4s, v10.4s, v26.4s + add a8, a8, w6 + add a9, a9, w7 add v11.4s, v11.4s, v27.4s + add a10, a10, w8 + add a11, a11, w9 // x12[0-3] += s3[0] // x13[0-3] += s3[1] // x14[0-3] += s3[2] // x15[0-3] += s3[3] add v12.4s, v12.4s, v28.4s + mov w6, v28.s[0] + mov w7, v29.s[0] add v13.4s, v13.4s, v29.4s + mov w8, v30.s[0] + mov w9, v31.s[0] add v14.4s, v14.4s, v30.4s + add a12, a12, w6 + add a13, a13, w7 add v15.4s, v15.4s, v31.4s + add a14, a14, w8 + add a15, a15, w9 // interleave 32-bit words in state n, n+1 + ldp w6, w7, [x2], #64 zip1 v16.4s, v0.4s, v1.4s + ldp w8, w9, [x2, #-56] + eor a0, a0, w6 zip2 v17.4s, v0.4s, v1.4s + eor a1, a1, w7 zip1 v18.4s, v2.4s, v3.4s + eor a2, a2, w8 zip2 v19.4s, v2.4s, v3.4s + eor a3, a3, w9 + ldp w6, w7, [x2, #-48] zip1 v20.4s, v4.4s, v5.4s + ldp w8, w9, [x2, #-40] + eor a4, a4, w6 zip2 v21.4s, v4.4s, v5.4s + eor a5, a5, w7 zip1 v22.4s, v6.4s, v7.4s + eor a6, a6, w8 zip2 v23.4s, v6.4s, v7.4s + eor a7, a7, w9 + ldp w6, w7, [x2, #-32] zip1 v24.4s, v8.4s, v9.4s + ldp w8, w9, [x2, #-24] + eor a8, a8, w6 zip2 v25.4s, v8.4s, v9.4s + eor a9, a9, w7 zip1 v26.4s, v10.4s, v11.4s + eor a10, a10, w8 zip2 v27.4s, v10.4s, v11.4s + eor a11, a11, w9 + ldp w6, w7, [x2, #-16] zip1 v28.4s, v12.4s, v13.4s + ldp w8, w9, [x2, #-8] + eor a12, a12, w6 zip2 v29.4s, v12.4s, v13.4s + eor a13, a13, w7 zip1 v30.4s, v14.4s, v15.4s + eor a14, a14, w8 zip2 v31.4s, v14.4s, v15.4s + eor a15, a15, w9 mov x3, #64 - subs x5, x4, #64 + subs x5, x4, #128 add x6, x5, x2 csel x3, x3, xzr, ge csel x2, x2, x6, ge @@ -447,11 +638,13 @@ ENTRY(chacha_4block_xor_neon) // interleave 64-bit words in state n, n+2 zip1 v0.2d, v16.2d, v18.2d zip2 v4.2d, v16.2d, v18.2d + stp a0, a1, [x1], #64 zip1 v8.2d, v17.2d, v19.2d zip2 v12.2d, v17.2d, v19.2d + stp a2, a3, [x1, #-56] ld1 {v16.16b-v19.16b}, [x2], x3 - subs x6, x4, #128 + subs x6, x4, #192 ccmp x3, xzr, #4, lt add x7, x6, x2 csel x3, x3, xzr, eq @@ -459,11 +652,13 @@ ENTRY(chacha_4block_xor_neon) zip1 v1.2d, v20.2d, v22.2d zip2 v5.2d, v20.2d, v22.2d + stp a4, a5, [x1, #-48] zip1 v9.2d, v21.2d, v23.2d zip2 v13.2d, v21.2d, v23.2d + stp a6, a7, [x1, #-40] ld1 {v20.16b-v23.16b}, [x2], x3 - subs x7, x4, #192 + subs x7, x4, #256 ccmp x3, xzr, #4, lt add x8, x7, x2 csel x3, x3, xzr, eq @@ -471,19 +666,23 @@ ENTRY(chacha_4block_xor_neon) zip1 v2.2d, v24.2d, v26.2d zip2 v6.2d, v24.2d, v26.2d + stp a8, a9, [x1, #-32] zip1 v10.2d, v25.2d, v27.2d zip2 v14.2d, v25.2d, v27.2d + stp a10, a11, [x1, #-24] ld1 {v24.16b-v27.16b}, [x2], x3 - subs x8, x4, #256 + subs x8, x4, #320 ccmp x3, xzr, #4, lt add x9, x8, x2 csel x2, x2, x9, eq zip1 v3.2d, v28.2d, v30.2d zip2 v7.2d, v28.2d, v30.2d + stp a12, a13, [x1, #-16] zip1 v11.2d, v29.2d, v31.2d zip2 v15.2d, v29.2d, v31.2d + stp a14, a15, [x1, #-8] ld1 {v28.16b-v31.16b}, [x2] // xor with corresponding input, write to output @@ -493,6 +692,7 @@ ENTRY(chacha_4block_xor_neon) eor v18.16b, v18.16b, v2.16b eor v19.16b, v19.16b, v3.16b st1 {v16.16b-v19.16b}, [x1], #64 + cbz x5, .Lout tbnz x6, #63, 1f eor v20.16b, v20.16b, v4.16b @@ -500,6 +700,7 @@ ENTRY(chacha_4block_xor_neon) eor v22.16b, v22.16b, v6.16b eor v23.16b, v23.16b, v7.16b st1 {v20.16b-v23.16b}, [x1], #64 + cbz x6, .Lout tbnz x7, #63, 2f eor v24.16b, v24.16b, v8.16b @@ -507,6 +708,7 @@ ENTRY(chacha_4block_xor_neon) eor v26.16b, v26.16b, v10.16b eor v27.16b, v27.16b, v11.16b st1 {v24.16b-v27.16b}, [x1], #64 + cbz x7, .Lout tbnz x8, #63, 3f eor v28.16b, v28.16b, v12.16b @@ -515,9 +717,10 @@ ENTRY(chacha_4block_xor_neon) eor v31.16b, v31.16b, v15.16b st1 {v28.16b-v31.16b}, [x1] +.Lout: frame_pop ret - // fewer than 64 bytes of in/output + // fewer than 128 bytes of in/output 0: ld1 {v8.16b}, [x10] ld1 {v9.16b}, [x11] movi v10.16b, #16 @@ -544,9 +747,9 @@ ENTRY(chacha_4block_xor_neon) eor v22.16b, v22.16b, v6.16b eor v23.16b, v23.16b, v7.16b st1 {v20.16b-v23.16b}, [x1] - ret + b .Lout - // fewer than 128 bytes of in/output + // fewer than 192 bytes of in/output 1: ld1 {v8.16b}, [x10] ld1 {v9.16b}, [x11] movi v10.16b, #16 @@ -571,9 +774,9 @@ ENTRY(chacha_4block_xor_neon) eor v22.16b, v22.16b, v2.16b eor v23.16b, v23.16b, v3.16b st1 {v20.16b-v23.16b}, [x1] - ret + b .Lout - // fewer than 192 bytes of in/output + // fewer than 256 bytes of in/output 2: ld1 {v4.16b}, [x10] ld1 {v5.16b}, [x11] movi v6.16b, #16 @@ -598,9 +801,9 @@ ENTRY(chacha_4block_xor_neon) eor v26.16b, v26.16b, v2.16b eor v27.16b, v27.16b, v3.16b st1 {v24.16b-v27.16b}, [x1] - ret + b .Lout - // fewer than 256 bytes of in/output + // fewer than 320 bytes of in/output 3: ld1 {v4.16b}, [x10] ld1 {v5.16b}, [x11] movi v6.16b, #16 @@ -625,7 +828,7 @@ ENTRY(chacha_4block_xor_neon) eor v30.16b, v30.16b, v2.16b eor v31.16b, v31.16b, v3.16b st1 {v28.16b-v31.16b}, [x1] - ret + b .Lout ENDPROC(chacha_4block_xor_neon) .section ".rodata", "a", %progbits @@ -637,5 +840,5 @@ ENDPROC(chacha_4block_xor_neon) .set .Li, .Li + 1 .endr -CTRINC: .word 0, 1, 2, 3 +CTRINC: .word 1, 2, 3, 4 ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c index 67f8feb0c717..bece1d85bd81 100644 --- a/arch/arm64/crypto/chacha-neon-glue.c +++ b/arch/arm64/crypto/chacha-neon-glue.c @@ -38,22 +38,23 @@ asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, int bytes, int nrounds) { - u8 buf[CHACHA_BLOCK_SIZE]; - - if (bytes < CHACHA_BLOCK_SIZE) { - memcpy(buf, src, bytes); - chacha_block_xor_neon(state, buf, buf, nrounds); - memcpy(dst, buf, bytes); - return; - } - while (bytes > 0) { - chacha_4block_xor_neon(state, dst, src, nrounds, - min(bytes, CHACHA_BLOCK_SIZE * 4)); - bytes -= CHACHA_BLOCK_SIZE * 4; - src += CHACHA_BLOCK_SIZE * 4; - dst += CHACHA_BLOCK_SIZE * 4; - state[12] += 4; + int l = min(bytes, CHACHA_BLOCK_SIZE * 5); + + if (l <= CHACHA_BLOCK_SIZE) { + u8 buf[CHACHA_BLOCK_SIZE]; + + memcpy(buf, src, l); + chacha_block_xor_neon(state, buf, buf, nrounds); + memcpy(dst, buf, l); + state[12] += 1; + break; + } + chacha_4block_xor_neon(state, dst, src, nrounds, l); + bytes -= CHACHA_BLOCK_SIZE * 5; + src += CHACHA_BLOCK_SIZE * 5; + dst += CHACHA_BLOCK_SIZE * 5; + state[12] += 5; } } @@ -72,7 +73,7 @@ static int chacha_neon_stream_xor(struct skcipher_request *req, unsigned int nbytes = walk.nbytes; if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); + nbytes = rounddown(nbytes, walk.stride); kernel_neon_begin(); chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr, @@ -131,7 +132,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = CHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 4 * CHACHA_BLOCK_SIZE, + .walksize = 5 * CHACHA_BLOCK_SIZE, .setkey = crypto_chacha20_setkey, .encrypt = chacha_neon, .decrypt = chacha_neon, @@ -147,7 +148,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 4 * CHACHA_BLOCK_SIZE, + .walksize = 5 * CHACHA_BLOCK_SIZE, .setkey = crypto_chacha20_setkey, .encrypt = xchacha_neon, .decrypt = xchacha_neon, @@ -163,7 +164,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 4 * CHACHA_BLOCK_SIZE, + .walksize = 5 * CHACHA_BLOCK_SIZE, .setkey = crypto_chacha12_setkey, .encrypt = xchacha_neon, .decrypt = xchacha_neon, -- cgit v1.2.3-59-g8ed1b From b299362ee48db8eab34208302ee9730ff9d6091c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Dec 2018 16:46:54 -0800 Subject: crypto: adiantum - propagate CRYPTO_ALG_ASYNC flag to instance If the stream cipher implementation is asynchronous, then the Adiantum instance must be flagged as asynchronous as well. Otherwise someone asking for a synchronous algorithm can get an asynchronous algorithm. There are no asynchronous xchacha12 or xchacha20 implementations yet which makes this largely a theoretical issue, but it should be fixed. Fixes: 059c2a4d8e16 ("crypto: adiantum - add Adiantum support") Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/adiantum.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crypto/adiantum.c b/crypto/adiantum.c index 2dfcf12fd452..ca27e0dc2958 100644 --- a/crypto/adiantum.c +++ b/crypto/adiantum.c @@ -590,6 +590,8 @@ static int adiantum_create(struct crypto_template *tmpl, struct rtattr **tb) hash_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto out_drop_hash; + inst->alg.base.cra_flags = streamcipher_alg->base.cra_flags & + CRYPTO_ALG_ASYNC; inst->alg.base.cra_blocksize = BLOCKCIPHER_BLOCK_SIZE; inst->alg.base.cra_ctxsize = sizeof(struct adiantum_tfm_ctx); inst->alg.base.cra_alignmask = streamcipher_alg->base.cra_alignmask | -- cgit v1.2.3-59-g8ed1b From 012c82388c032cd4a9821e11bae336cf4a014822 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Dec 2018 22:20:00 -0800 Subject: crypto: x86/nhpoly1305 - add SSE2 accelerated NHPoly1305 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a 64-bit SSE2 implementation of NHPoly1305, an ε-almost-∆-universal hash function used in the Adiantum encryption mode. For now, only the NH portion is actually SSE2-accelerated; the Poly1305 part is less performance-critical so is just implemented in C. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 4 ++ arch/x86/crypto/nh-sse2-x86_64.S | 123 +++++++++++++++++++++++++++++++++ arch/x86/crypto/nhpoly1305-sse2-glue.c | 76 ++++++++++++++++++++ crypto/Kconfig | 8 +++ 4 files changed, 211 insertions(+) create mode 100644 arch/x86/crypto/nh-sse2-x86_64.S create mode 100644 arch/x86/crypto/nhpoly1305-sse2-glue.c diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index ce4e43642984..2a6acb4de373 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -47,6 +47,8 @@ obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o +obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o + # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \ @@ -85,6 +87,8 @@ aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o +nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o + ifeq ($(avx_supported),yes) camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ camellia_aesni_avx_glue.o diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S new file mode 100644 index 000000000000..51f52d4ab4bb --- /dev/null +++ b/arch/x86/crypto/nh-sse2-x86_64.S @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated + * + * Copyright 2018 Google LLC + * + * Author: Eric Biggers + */ + +#include + +#define PASS0_SUMS %xmm0 +#define PASS1_SUMS %xmm1 +#define PASS2_SUMS %xmm2 +#define PASS3_SUMS %xmm3 +#define K0 %xmm4 +#define K1 %xmm5 +#define K2 %xmm6 +#define K3 %xmm7 +#define T0 %xmm8 +#define T1 %xmm9 +#define T2 %xmm10 +#define T3 %xmm11 +#define T4 %xmm12 +#define T5 %xmm13 +#define T6 %xmm14 +#define T7 %xmm15 +#define KEY %rdi +#define MESSAGE %rsi +#define MESSAGE_LEN %rdx +#define HASH %rcx + +.macro _nh_stride k0, k1, k2, k3, offset + + // Load next message stride + movdqu \offset(MESSAGE), T1 + + // Load next key stride + movdqu \offset(KEY), \k3 + + // Add message words to key words + movdqa T1, T2 + movdqa T1, T3 + paddd T1, \k0 // reuse k0 to avoid a move + paddd \k1, T1 + paddd \k2, T2 + paddd \k3, T3 + + // Multiply 32x32 => 64 and accumulate + pshufd $0x10, \k0, T4 + pshufd $0x32, \k0, \k0 + pshufd $0x10, T1, T5 + pshufd $0x32, T1, T1 + pshufd $0x10, T2, T6 + pshufd $0x32, T2, T2 + pshufd $0x10, T3, T7 + pshufd $0x32, T3, T3 + pmuludq T4, \k0 + pmuludq T5, T1 + pmuludq T6, T2 + pmuludq T7, T3 + paddq \k0, PASS0_SUMS + paddq T1, PASS1_SUMS + paddq T2, PASS2_SUMS + paddq T3, PASS3_SUMS +.endm + +/* + * void nh_sse2(const u32 *key, const u8 *message, size_t message_len, + * u8 hash[NH_HASH_BYTES]) + * + * It's guaranteed that message_len % 16 == 0. + */ +ENTRY(nh_sse2) + + movdqu 0x00(KEY), K0 + movdqu 0x10(KEY), K1 + movdqu 0x20(KEY), K2 + add $0x30, KEY + pxor PASS0_SUMS, PASS0_SUMS + pxor PASS1_SUMS, PASS1_SUMS + pxor PASS2_SUMS, PASS2_SUMS + pxor PASS3_SUMS, PASS3_SUMS + + sub $0x40, MESSAGE_LEN + jl .Lloop4_done +.Lloop4: + _nh_stride K0, K1, K2, K3, 0x00 + _nh_stride K1, K2, K3, K0, 0x10 + _nh_stride K2, K3, K0, K1, 0x20 + _nh_stride K3, K0, K1, K2, 0x30 + add $0x40, KEY + add $0x40, MESSAGE + sub $0x40, MESSAGE_LEN + jge .Lloop4 + +.Lloop4_done: + and $0x3f, MESSAGE_LEN + jz .Ldone + _nh_stride K0, K1, K2, K3, 0x00 + + sub $0x10, MESSAGE_LEN + jz .Ldone + _nh_stride K1, K2, K3, K0, 0x10 + + sub $0x10, MESSAGE_LEN + jz .Ldone + _nh_stride K2, K3, K0, K1, 0x20 + +.Ldone: + // Sum the accumulators for each pass, then store the sums to 'hash' + movdqa PASS0_SUMS, T0 + movdqa PASS2_SUMS, T1 + punpcklqdq PASS1_SUMS, T0 // => (PASS0_SUM_A PASS1_SUM_A) + punpcklqdq PASS3_SUMS, T1 // => (PASS2_SUM_A PASS3_SUM_A) + punpckhqdq PASS1_SUMS, PASS0_SUMS // => (PASS0_SUM_B PASS1_SUM_B) + punpckhqdq PASS3_SUMS, PASS2_SUMS // => (PASS2_SUM_B PASS3_SUM_B) + paddq PASS0_SUMS, T0 + paddq PASS2_SUMS, T1 + movdqu T0, 0x00(HASH) + movdqu T1, 0x10(HASH) + ret +ENDPROC(nh_sse2) diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c new file mode 100644 index 000000000000..ed68d164ce14 --- /dev/null +++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum + * (SSE2 accelerated version) + * + * Copyright 2018 Google LLC + */ + +#include +#include +#include +#include + +asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len, + u8 hash[NH_HASH_BYTES]); + +/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */ +static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len, + __le64 hash[NH_NUM_PASSES]) +{ + nh_sse2(key, message, message_len, (u8 *)hash); +} + +static int nhpoly1305_sse2_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + if (srclen < 64 || !irq_fpu_usable()) + return crypto_nhpoly1305_update(desc, src, srclen); + + do { + unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE); + + kernel_fpu_begin(); + crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2); + kernel_fpu_end(); + src += n; + srclen -= n; + } while (srclen); + return 0; +} + +static struct shash_alg nhpoly1305_alg = { + .base.cra_name = "nhpoly1305", + .base.cra_driver_name = "nhpoly1305-sse2", + .base.cra_priority = 200, + .base.cra_ctxsize = sizeof(struct nhpoly1305_key), + .base.cra_module = THIS_MODULE, + .digestsize = POLY1305_DIGEST_SIZE, + .init = crypto_nhpoly1305_init, + .update = nhpoly1305_sse2_update, + .final = crypto_nhpoly1305_final, + .setkey = crypto_nhpoly1305_setkey, + .descsize = sizeof(struct nhpoly1305_state), +}; + +static int __init nhpoly1305_mod_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_XMM2)) + return -ENODEV; + + return crypto_register_shash(&nhpoly1305_alg); +} + +static void __exit nhpoly1305_mod_exit(void) +{ + crypto_unregister_shash(&nhpoly1305_alg); +} + +module_init(nhpoly1305_mod_init); +module_exit(nhpoly1305_mod_exit); + +MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (SSE2-accelerated)"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Eric Biggers "); +MODULE_ALIAS_CRYPTO("nhpoly1305"); +MODULE_ALIAS_CRYPTO("nhpoly1305-sse2"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 5994d0fa7a78..cd3d9cb3482c 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -501,6 +501,14 @@ config CRYPTO_NHPOLY1305 select CRYPTO_HASH select CRYPTO_POLY1305 +config CRYPTO_NHPOLY1305_SSE2 + tristate "NHPoly1305 hash function (x86_64 SSE2 implementation)" + depends on X86 && 64BIT + select CRYPTO_NHPOLY1305 + help + SSE2 optimized implementation of the hash function used by the + Adiantum encryption mode. + config CRYPTO_ADIANTUM tristate "Adiantum support" select CRYPTO_CHACHA20 -- cgit v1.2.3-59-g8ed1b From 0f961f9f670e7c07690bfde2f533b93c653569cc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Dec 2018 22:20:01 -0800 Subject: crypto: x86/nhpoly1305 - add AVX2 accelerated NHPoly1305 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a 64-bit AVX2 implementation of NHPoly1305, an ε-almost-∆-universal hash function used in the Adiantum encryption mode. For now, only the NH portion is actually AVX2-accelerated; the Poly1305 part is less performance-critical so is just implemented in C. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 3 + arch/x86/crypto/nh-avx2-x86_64.S | 157 +++++++++++++++++++++++++++++++++ arch/x86/crypto/nhpoly1305-avx2-glue.c | 77 ++++++++++++++++ crypto/Kconfig | 8 ++ 4 files changed, 245 insertions(+) create mode 100644 arch/x86/crypto/nh-avx2-x86_64.S create mode 100644 arch/x86/crypto/nhpoly1305-avx2-glue.c diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 2a6acb4de373..0b31b16f49d8 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o +obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) @@ -106,6 +107,8 @@ ifeq ($(avx2_supported),yes) serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o + + nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o endif ifeq ($(avx512_supported),yes) diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S new file mode 100644 index 000000000000..f7946ea1b704 --- /dev/null +++ b/arch/x86/crypto/nh-avx2-x86_64.S @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * NH - ε-almost-universal hash function, x86_64 AVX2 accelerated + * + * Copyright 2018 Google LLC + * + * Author: Eric Biggers + */ + +#include + +#define PASS0_SUMS %ymm0 +#define PASS1_SUMS %ymm1 +#define PASS2_SUMS %ymm2 +#define PASS3_SUMS %ymm3 +#define K0 %ymm4 +#define K0_XMM %xmm4 +#define K1 %ymm5 +#define K1_XMM %xmm5 +#define K2 %ymm6 +#define K2_XMM %xmm6 +#define K3 %ymm7 +#define K3_XMM %xmm7 +#define T0 %ymm8 +#define T1 %ymm9 +#define T2 %ymm10 +#define T2_XMM %xmm10 +#define T3 %ymm11 +#define T3_XMM %xmm11 +#define T4 %ymm12 +#define T5 %ymm13 +#define T6 %ymm14 +#define T7 %ymm15 +#define KEY %rdi +#define MESSAGE %rsi +#define MESSAGE_LEN %rdx +#define HASH %rcx + +.macro _nh_2xstride k0, k1, k2, k3 + + // Add message words to key words + vpaddd \k0, T3, T0 + vpaddd \k1, T3, T1 + vpaddd \k2, T3, T2 + vpaddd \k3, T3, T3 + + // Multiply 32x32 => 64 and accumulate + vpshufd $0x10, T0, T4 + vpshufd $0x32, T0, T0 + vpshufd $0x10, T1, T5 + vpshufd $0x32, T1, T1 + vpshufd $0x10, T2, T6 + vpshufd $0x32, T2, T2 + vpshufd $0x10, T3, T7 + vpshufd $0x32, T3, T3 + vpmuludq T4, T0, T0 + vpmuludq T5, T1, T1 + vpmuludq T6, T2, T2 + vpmuludq T7, T3, T3 + vpaddq T0, PASS0_SUMS, PASS0_SUMS + vpaddq T1, PASS1_SUMS, PASS1_SUMS + vpaddq T2, PASS2_SUMS, PASS2_SUMS + vpaddq T3, PASS3_SUMS, PASS3_SUMS +.endm + +/* + * void nh_avx2(const u32 *key, const u8 *message, size_t message_len, + * u8 hash[NH_HASH_BYTES]) + * + * It's guaranteed that message_len % 16 == 0. + */ +ENTRY(nh_avx2) + + vmovdqu 0x00(KEY), K0 + vmovdqu 0x10(KEY), K1 + add $0x20, KEY + vpxor PASS0_SUMS, PASS0_SUMS, PASS0_SUMS + vpxor PASS1_SUMS, PASS1_SUMS, PASS1_SUMS + vpxor PASS2_SUMS, PASS2_SUMS, PASS2_SUMS + vpxor PASS3_SUMS, PASS3_SUMS, PASS3_SUMS + + sub $0x40, MESSAGE_LEN + jl .Lloop4_done +.Lloop4: + vmovdqu (MESSAGE), T3 + vmovdqu 0x00(KEY), K2 + vmovdqu 0x10(KEY), K3 + _nh_2xstride K0, K1, K2, K3 + + vmovdqu 0x20(MESSAGE), T3 + vmovdqu 0x20(KEY), K0 + vmovdqu 0x30(KEY), K1 + _nh_2xstride K2, K3, K0, K1 + + add $0x40, MESSAGE + add $0x40, KEY + sub $0x40, MESSAGE_LEN + jge .Lloop4 + +.Lloop4_done: + and $0x3f, MESSAGE_LEN + jz .Ldone + + cmp $0x20, MESSAGE_LEN + jl .Llast + + // 2 or 3 strides remain; do 2 more. + vmovdqu (MESSAGE), T3 + vmovdqu 0x00(KEY), K2 + vmovdqu 0x10(KEY), K3 + _nh_2xstride K0, K1, K2, K3 + add $0x20, MESSAGE + add $0x20, KEY + sub $0x20, MESSAGE_LEN + jz .Ldone + vmovdqa K2, K0 + vmovdqa K3, K1 +.Llast: + // Last stride. Zero the high 128 bits of the message and keys so they + // don't affect the result when processing them like 2 strides. + vmovdqu (MESSAGE), T3_XMM + vmovdqa K0_XMM, K0_XMM + vmovdqa K1_XMM, K1_XMM + vmovdqu 0x00(KEY), K2_XMM + vmovdqu 0x10(KEY), K3_XMM + _nh_2xstride K0, K1, K2, K3 + +.Ldone: + // Sum the accumulators for each pass, then store the sums to 'hash' + + // PASS0_SUMS is (0A 0B 0C 0D) + // PASS1_SUMS is (1A 1B 1C 1D) + // PASS2_SUMS is (2A 2B 2C 2D) + // PASS3_SUMS is (3A 3B 3C 3D) + // We need the horizontal sums: + // (0A + 0B + 0C + 0D, + // 1A + 1B + 1C + 1D, + // 2A + 2B + 2C + 2D, + // 3A + 3B + 3C + 3D) + // + + vpunpcklqdq PASS1_SUMS, PASS0_SUMS, T0 // T0 = (0A 1A 0C 1C) + vpunpckhqdq PASS1_SUMS, PASS0_SUMS, T1 // T1 = (0B 1B 0D 1D) + vpunpcklqdq PASS3_SUMS, PASS2_SUMS, T2 // T2 = (2A 3A 2C 3C) + vpunpckhqdq PASS3_SUMS, PASS2_SUMS, T3 // T3 = (2B 3B 2D 3D) + + vinserti128 $0x1, T2_XMM, T0, T4 // T4 = (0A 1A 2A 3A) + vinserti128 $0x1, T3_XMM, T1, T5 // T5 = (0B 1B 2B 3B) + vperm2i128 $0x31, T2, T0, T0 // T0 = (0C 1C 2C 3C) + vperm2i128 $0x31, T3, T1, T1 // T1 = (0D 1D 2D 3D) + + vpaddq T5, T4, T4 + vpaddq T1, T0, T0 + vpaddq T4, T0, T0 + vmovdqu T0, (HASH) + ret +ENDPROC(nh_avx2) diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c new file mode 100644 index 000000000000..20d815ea4b6a --- /dev/null +++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum + * (AVX2 accelerated version) + * + * Copyright 2018 Google LLC + */ + +#include +#include +#include +#include + +asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len, + u8 hash[NH_HASH_BYTES]); + +/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */ +static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len, + __le64 hash[NH_NUM_PASSES]) +{ + nh_avx2(key, message, message_len, (u8 *)hash); +} + +static int nhpoly1305_avx2_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + if (srclen < 64 || !irq_fpu_usable()) + return crypto_nhpoly1305_update(desc, src, srclen); + + do { + unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE); + + kernel_fpu_begin(); + crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2); + kernel_fpu_end(); + src += n; + srclen -= n; + } while (srclen); + return 0; +} + +static struct shash_alg nhpoly1305_alg = { + .base.cra_name = "nhpoly1305", + .base.cra_driver_name = "nhpoly1305-avx2", + .base.cra_priority = 300, + .base.cra_ctxsize = sizeof(struct nhpoly1305_key), + .base.cra_module = THIS_MODULE, + .digestsize = POLY1305_DIGEST_SIZE, + .init = crypto_nhpoly1305_init, + .update = nhpoly1305_avx2_update, + .final = crypto_nhpoly1305_final, + .setkey = crypto_nhpoly1305_setkey, + .descsize = sizeof(struct nhpoly1305_state), +}; + +static int __init nhpoly1305_mod_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_OSXSAVE)) + return -ENODEV; + + return crypto_register_shash(&nhpoly1305_alg); +} + +static void __exit nhpoly1305_mod_exit(void) +{ + crypto_unregister_shash(&nhpoly1305_alg); +} + +module_init(nhpoly1305_mod_init); +module_exit(nhpoly1305_mod_exit); + +MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (AVX2-accelerated)"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Eric Biggers "); +MODULE_ALIAS_CRYPTO("nhpoly1305"); +MODULE_ALIAS_CRYPTO("nhpoly1305-avx2"); diff --git a/crypto/Kconfig b/crypto/Kconfig index cd3d9cb3482c..d0bff6ea6b10 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -509,6 +509,14 @@ config CRYPTO_NHPOLY1305_SSE2 SSE2 optimized implementation of the hash function used by the Adiantum encryption mode. +config CRYPTO_NHPOLY1305_AVX2 + tristate "NHPoly1305 hash function (x86_64 AVX2 implementation)" + depends on X86 && 64BIT + select CRYPTO_NHPOLY1305 + help + AVX2 optimized implementation of the hash function used by the + Adiantum encryption mode. + config CRYPTO_ADIANTUM tristate "Adiantum support" select CRYPTO_CHACHA20 -- cgit v1.2.3-59-g8ed1b From 4af78261870a7d36dd222af8dad9688b705e365e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Dec 2018 22:20:02 -0800 Subject: crypto: x86/chacha20 - add XChaCha20 support Add an XChaCha20 implementation that is hooked up to the x86_64 SIMD implementations of ChaCha20. This can be used by Adiantum. An SSSE3 implementation of single-block HChaCha20 is also added so that XChaCha20 can use it rather than the generic implementation. This required refactoring the ChaCha permutation into its own function. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20-ssse3-x86_64.S | 81 ++++++++++++++++-------- arch/x86/crypto/chacha20_glue.c | 108 ++++++++++++++++++++++++-------- crypto/Kconfig | 12 +--- 3 files changed, 141 insertions(+), 60 deletions(-) diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S index d8ac75bb448f..f6792789f875 100644 --- a/arch/x86/crypto/chacha20-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -10,6 +10,7 @@ */ #include +#include .section .rodata.cst16.ROT8, "aM", @progbits, 16 .align 16 @@ -23,37 +24,24 @@ CTRINC: .octa 0x00000003000000020000000100000000 .text -ENTRY(chacha20_block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 1 data block output, o - # %rdx: up to 1 data block input, i - # %rcx: input/output length in bytes - - # This function encrypts one ChaCha20 block by loading the state matrix - # in four SSE registers. It performs matrix operation on four words in - # parallel, but requires shuffling to rearrange the words after each - # round. 8/16-bit word rotation is done with the slightly better - # performing SSSE3 byte shuffling, 7/12-bit word rotation uses - # traditional shift+OR. - - # x0..3 = s0..3 - movdqa 0x00(%rdi),%xmm0 - movdqa 0x10(%rdi),%xmm1 - movdqa 0x20(%rdi),%xmm2 - movdqa 0x30(%rdi),%xmm3 - movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - movdqa %xmm2,%xmm10 - movdqa %xmm3,%xmm11 +/* + * chacha20_permute - permute one block + * + * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This + * function performs matrix operations on four words in parallel, but requires + * shuffling to rearrange the words after each round. 8/16-bit word rotation is + * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word + * rotation uses traditional shift+OR. + * + * Clobbers: %ecx, %xmm4-%xmm7 + */ +chacha20_permute: movdqa ROT8(%rip),%xmm4 movdqa ROT16(%rip),%xmm5 - - mov %rcx,%rax mov $10,%ecx .Ldoubleround: - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 @@ -123,6 +111,29 @@ ENTRY(chacha20_block_xor_ssse3) dec %ecx jnz .Ldoubleround + ret +ENDPROC(chacha20_permute) + +ENTRY(chacha20_block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: up to 1 data block output, o + # %rdx: up to 1 data block input, i + # %rcx: input/output length in bytes + FRAME_BEGIN + + # x0..3 = s0..3 + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + movdqa %xmm0,%xmm8 + movdqa %xmm1,%xmm9 + movdqa %xmm2,%xmm10 + movdqa %xmm3,%xmm11 + + mov %rcx,%rax + call chacha20_permute + # o0 = i0 ^ (x0 + s0) paddd %xmm8,%xmm0 cmp $0x10,%rax @@ -156,6 +167,7 @@ ENTRY(chacha20_block_xor_ssse3) movdqu %xmm0,0x30(%rsi) .Ldone: + FRAME_END ret .Lxorpart: @@ -189,6 +201,25 @@ ENTRY(chacha20_block_xor_ssse3) ENDPROC(chacha20_block_xor_ssse3) +ENTRY(hchacha20_block_ssse3) + # %rdi: Input state matrix, s + # %rsi: output (8 32-bit words) + FRAME_BEGIN + + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + + call chacha20_permute + + movdqu %xmm0,0x00(%rsi) + movdqu %xmm3,0x10(%rsi) + + FRAME_END + ret +ENDPROC(hchacha20_block_ssse3) + ENTRY(chacha20_4block_xor_ssse3) # %rdi: Input state matrix, s # %rsi: up to 4 data blocks output, o diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 773d075a1483..70d388e4a3a2 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -23,6 +23,7 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len); asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len); +asmlinkage void hchacha20_block_ssse3(const u32 *state, u32 *out); #ifdef CONFIG_AS_AVX2 asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len); @@ -121,10 +122,9 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, } } -static int chacha20_simd(struct skcipher_request *req) +static int chacha20_simd_stream_xor(struct skcipher_request *req, + struct chacha_ctx *ctx, u8 *iv) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); u32 *state, state_buf[16 + 2] __aligned(8); struct skcipher_walk walk; int err; @@ -132,14 +132,9 @@ static int chacha20_simd(struct skcipher_request *req) BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16); state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN); - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) - return crypto_chacha_crypt(req); - err = skcipher_walk_virt(&walk, req, true); - crypto_chacha_init(state, ctx, walk.iv); - - kernel_fpu_begin(); + crypto_chacha_init(state, ctx, iv); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; @@ -153,26 +148,85 @@ static int chacha20_simd(struct skcipher_request *req) err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } + return err; +} + +static int chacha20_simd(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + int err; + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) + return crypto_chacha_crypt(req); + + kernel_fpu_begin(); + err = chacha20_simd_stream_xor(req, ctx, req->iv); + kernel_fpu_end(); + return err; +} + +static int xchacha20_simd(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 *state, state_buf[16 + 2] __aligned(8); + u8 real_iv[16]; + int err; + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) + return crypto_xchacha_crypt(req); + + BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16); + state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN); + crypto_chacha_init(state, ctx, req->iv); + + kernel_fpu_begin(); + + hchacha20_block_ssse3(state, subctx.key); + + memcpy(&real_iv[0], req->iv + 24, 8); + memcpy(&real_iv[8], req->iv + 16, 8); + err = chacha20_simd_stream_xor(req, &subctx, real_iv); + kernel_fpu_end(); return err; } -static struct skcipher_alg alg = { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-simd", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = chacha20_simd, - .decrypt = chacha20_simd, +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-simd", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = chacha20_simd, + .decrypt = chacha20_simd, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-simd", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = xchacha20_simd, + .decrypt = xchacha20_simd, + }, }; static int __init chacha20_simd_mod_init(void) @@ -190,12 +244,12 @@ static int __init chacha20_simd_mod_init(void) boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */ #endif #endif - return crypto_register_skcipher(&alg); + return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); } static void __exit chacha20_simd_mod_fini(void) { - crypto_unregister_skcipher(&alg); + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } module_init(chacha20_simd_mod_init); @@ -206,3 +260,5 @@ MODULE_AUTHOR("Martin Willi "); MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated"); MODULE_ALIAS_CRYPTO("chacha20"); MODULE_ALIAS_CRYPTO("chacha20-simd"); +MODULE_ALIAS_CRYPTO("xchacha20"); +MODULE_ALIAS_CRYPTO("xchacha20-simd"); diff --git a/crypto/Kconfig b/crypto/Kconfig index d0bff6ea6b10..dc3a0e3ea2ac 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1468,19 +1468,13 @@ config CRYPTO_CHACHA20 in some performance-sensitive scenarios. config CRYPTO_CHACHA20_X86_64 - tristate "ChaCha20 cipher algorithm (x86_64/SSSE3/AVX2)" + tristate "ChaCha stream cipher algorithms (x86_64/SSSE3/AVX2/AVX-512VL)" depends on X86 && 64BIT select CRYPTO_BLKCIPHER select CRYPTO_CHACHA20 help - ChaCha20 cipher algorithm, RFC7539. - - ChaCha20 is a 256-bit high-speed stream cipher designed by Daniel J. - Bernstein and further specified in RFC7539 for use in IETF protocols. - This is the x86_64 assembler implementation using SIMD instructions. - - See also: - + SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20 + and XChaCha20 stream ciphers. config CRYPTO_SEED tristate "SEED cipher algorithm" -- cgit v1.2.3-59-g8ed1b From 8b65f34c5821e7361488dc668d21195ea4c9f14d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Dec 2018 22:20:03 -0800 Subject: crypto: x86/chacha20 - refactor to allow varying number of rounds In preparation for adding XChaCha12 support, rename/refactor the x86_64 SIMD implementations of ChaCha20 to support different numbers of rounds. Reviewed-by: Martin Willi Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 8 +- arch/x86/crypto/chacha-avx2-x86_64.S | 1025 +++++++++++++++++++++++++++ arch/x86/crypto/chacha-avx512vl-x86_64.S | 836 +++++++++++++++++++++++ arch/x86/crypto/chacha-ssse3-x86_64.S | 795 +++++++++++++++++++++ arch/x86/crypto/chacha20-avx2-x86_64.S | 1026 ---------------------------- arch/x86/crypto/chacha20-avx512vl-x86_64.S | 839 ----------------------- arch/x86/crypto/chacha20-ssse3-x86_64.S | 792 --------------------- arch/x86/crypto/chacha20_glue.c | 264 ------- arch/x86/crypto/chacha_glue.c | 270 ++++++++ 9 files changed, 2930 insertions(+), 2925 deletions(-) create mode 100644 arch/x86/crypto/chacha-avx2-x86_64.S create mode 100644 arch/x86/crypto/chacha-avx512vl-x86_64.S create mode 100644 arch/x86/crypto/chacha-ssse3-x86_64.S delete mode 100644 arch/x86/crypto/chacha20-avx2-x86_64.S delete mode 100644 arch/x86/crypto/chacha20-avx512vl-x86_64.S delete mode 100644 arch/x86/crypto/chacha20-ssse3-x86_64.S delete mode 100644 arch/x86/crypto/chacha20_glue.c create mode 100644 arch/x86/crypto/chacha_glue.c diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 0b31b16f49d8..45734e1cf967 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o -obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o +obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o @@ -78,7 +78,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o -chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o +chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o @@ -103,7 +103,7 @@ endif ifeq ($(avx2_supported),yes) camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o - chacha20-x86_64-y += chacha20-avx2-x86_64.o + chacha-x86_64-y += chacha-avx2-x86_64.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o @@ -112,7 +112,7 @@ ifeq ($(avx2_supported),yes) endif ifeq ($(avx512_supported),yes) - chacha20-x86_64-y += chacha20-avx512vl-x86_64.o + chacha-x86_64-y += chacha-avx512vl-x86_64.o endif aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S new file mode 100644 index 000000000000..32903fd450af --- /dev/null +++ b/arch/x86/crypto/chacha-avx2-x86_64.S @@ -0,0 +1,1025 @@ +/* + * ChaCha 256-bit cipher algorithm, x64 AVX2 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include + +.section .rodata.cst32.ROT8, "aM", @progbits, 32 +.align 32 +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 + .octa 0x0e0d0c0f0a09080b0605040702010003 + +.section .rodata.cst32.ROT16, "aM", @progbits, 32 +.align 32 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 + .octa 0x0d0c0f0e09080b0a0504070601000302 + +.section .rodata.cst32.CTRINC, "aM", @progbits, 32 +.align 32 +CTRINC: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 +.align 32 +CTR2BL: .octa 0x00000000000000000000000000000000 + .octa 0x00000000000000000000000000000001 + +.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 +.align 32 +CTR4BL: .octa 0x00000000000000000000000000000002 + .octa 0x00000000000000000000000000000003 + +.text + +ENTRY(chacha_2block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 2 data blocks output, o + # %rdx: up to 2 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts two ChaCha blocks by loading the state + # matrix twice across four AVX registers. It performs matrix operations + # on four words in each matrix in parallel, but requires shuffling to + # rearrange the words after each round. + + vzeroupper + + # x0..3[0-2] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + + vmovdqa %ymm0,%ymm8 + vmovdqa %ymm1,%ymm9 + vmovdqa %ymm2,%ymm10 + vmovdqa %ymm3,%ymm11 + + vmovdqa ROT8(%rip),%ymm4 + vmovdqa ROT16(%rip),%ymm5 + + mov %rcx,%rax + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm5,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm6 + vpslld $12,%ymm6,%ymm6 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm6,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm4,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm7 + vpslld $7,%ymm7,%ymm7 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm5,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm6 + vpslld $12,%ymm6,%ymm6 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm6,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm4,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm7 + vpslld $7,%ymm7,%ymm7 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + + sub $2,%r8d + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + vpaddd %ymm8,%ymm0,%ymm7 + cmp $0x10,%rax + jl .Lxorpart2 + vpxor 0x00(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x00(%rsi) + vextracti128 $1,%ymm7,%xmm0 + # o1 = i1 ^ (x1 + s1) + vpaddd %ymm9,%ymm1,%ymm7 + cmp $0x20,%rax + jl .Lxorpart2 + vpxor 0x10(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x10(%rsi) + vextracti128 $1,%ymm7,%xmm1 + # o2 = i2 ^ (x2 + s2) + vpaddd %ymm10,%ymm2,%ymm7 + cmp $0x30,%rax + jl .Lxorpart2 + vpxor 0x20(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x20(%rsi) + vextracti128 $1,%ymm7,%xmm2 + # o3 = i3 ^ (x3 + s3) + vpaddd %ymm11,%ymm3,%ymm7 + cmp $0x40,%rax + jl .Lxorpart2 + vpxor 0x30(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x30(%rsi) + vextracti128 $1,%ymm7,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm7 + cmp $0x50,%rax + jl .Lxorpart2 + vpxor 0x40(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x40(%rsi) + + vmovdqa %xmm1,%xmm7 + cmp $0x60,%rax + jl .Lxorpart2 + vpxor 0x50(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x50(%rsi) + + vmovdqa %xmm2,%xmm7 + cmp $0x70,%rax + jl .Lxorpart2 + vpxor 0x60(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x60(%rsi) + + vmovdqa %xmm3,%xmm7 + cmp $0x80,%rax + jl .Lxorpart2 + vpxor 0x70(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x70(%rsi) + +.Ldone2: + vzeroupper + ret + +.Lxorpart2: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone2 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%xmm7,%xmm7 + vmovdqa %xmm7,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone2 + +ENDPROC(chacha_2block_xor_avx2) + +ENTRY(chacha_4block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts four ChaCha blocks by loading the state + # matrix four times across eight AVX registers. It performs matrix + # operations on four words in two matrices in parallel, sequentially + # to the operations on the four words of the other two matrices. The + # required word shuffling has a rather high latency, we can do the + # arithmetic on two matrix-pairs without much slowdown. + + vzeroupper + + # x0..3[0-4] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vmovdqa %ymm0,%ymm4 + vmovdqa %ymm1,%ymm5 + vmovdqa %ymm2,%ymm6 + vmovdqa %ymm3,%ymm7 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + vpaddd CTR4BL(%rip),%ymm7,%ymm7 + + vmovdqa %ymm0,%ymm11 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm3,%ymm14 + vmovdqa %ymm7,%ymm15 + + vmovdqa ROT8(%rip),%ymm8 + vmovdqa ROT16(%rip),%ymm9 + + mov %rcx,%rax + +.Ldoubleround4: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm9,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm9,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + vpshufd $0x39,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + vpshufd $0x93,%ymm7,%ymm7 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm9,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm9,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + vpshufd $0x93,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + vpshufd $0x39,%ymm7,%ymm7 + + sub $2,%r8d + jnz .Ldoubleround4 + + # o0 = i0 ^ (x0 + s0), first block + vpaddd %ymm11,%ymm0,%ymm10 + cmp $0x10,%rax + jl .Lxorpart4 + vpxor 0x00(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x00(%rsi) + vextracti128 $1,%ymm10,%xmm0 + # o1 = i1 ^ (x1 + s1), first block + vpaddd %ymm12,%ymm1,%ymm10 + cmp $0x20,%rax + jl .Lxorpart4 + vpxor 0x10(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x10(%rsi) + vextracti128 $1,%ymm10,%xmm1 + # o2 = i2 ^ (x2 + s2), first block + vpaddd %ymm13,%ymm2,%ymm10 + cmp $0x30,%rax + jl .Lxorpart4 + vpxor 0x20(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x20(%rsi) + vextracti128 $1,%ymm10,%xmm2 + # o3 = i3 ^ (x3 + s3), first block + vpaddd %ymm14,%ymm3,%ymm10 + cmp $0x40,%rax + jl .Lxorpart4 + vpxor 0x30(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x30(%rsi) + vextracti128 $1,%ymm10,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm10 + cmp $0x50,%rax + jl .Lxorpart4 + vpxor 0x40(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x40(%rsi) + + vmovdqa %xmm1,%xmm10 + cmp $0x60,%rax + jl .Lxorpart4 + vpxor 0x50(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x50(%rsi) + + vmovdqa %xmm2,%xmm10 + cmp $0x70,%rax + jl .Lxorpart4 + vpxor 0x60(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x60(%rsi) + + vmovdqa %xmm3,%xmm10 + cmp $0x80,%rax + jl .Lxorpart4 + vpxor 0x70(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x70(%rsi) + + # o0 = i0 ^ (x0 + s0), third block + vpaddd %ymm11,%ymm4,%ymm10 + cmp $0x90,%rax + jl .Lxorpart4 + vpxor 0x80(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x80(%rsi) + vextracti128 $1,%ymm10,%xmm4 + # o1 = i1 ^ (x1 + s1), third block + vpaddd %ymm12,%ymm5,%ymm10 + cmp $0xa0,%rax + jl .Lxorpart4 + vpxor 0x90(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x90(%rsi) + vextracti128 $1,%ymm10,%xmm5 + # o2 = i2 ^ (x2 + s2), third block + vpaddd %ymm13,%ymm6,%ymm10 + cmp $0xb0,%rax + jl .Lxorpart4 + vpxor 0xa0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xa0(%rsi) + vextracti128 $1,%ymm10,%xmm6 + # o3 = i3 ^ (x3 + s3), third block + vpaddd %ymm15,%ymm7,%ymm10 + cmp $0xc0,%rax + jl .Lxorpart4 + vpxor 0xb0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xb0(%rsi) + vextracti128 $1,%ymm10,%xmm7 + + # xor and write fourth block + vmovdqa %xmm4,%xmm10 + cmp $0xd0,%rax + jl .Lxorpart4 + vpxor 0xc0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xc0(%rsi) + + vmovdqa %xmm5,%xmm10 + cmp $0xe0,%rax + jl .Lxorpart4 + vpxor 0xd0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xd0(%rsi) + + vmovdqa %xmm6,%xmm10 + cmp $0xf0,%rax + jl .Lxorpart4 + vpxor 0xe0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xe0(%rsi) + + vmovdqa %xmm7,%xmm10 + cmp $0x100,%rax + jl .Lxorpart4 + vpxor 0xf0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xf0(%rsi) + +.Ldone4: + vzeroupper + ret + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone4 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%xmm10,%xmm10 + vmovdqa %xmm10,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone4 + +ENDPROC(chacha_4block_xor_avx2) + +ENTRY(chacha_8block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 8 data blocks output, o + # %rdx: up to 8 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts eight consecutive ChaCha blocks by loading + # the state matrix in AVX registers eight times. As we need some + # scratch registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32-, 64- and then 128-bit + # words, which allows us to do XOR in AVX registers. 8/16-bit word + # rotation is done with the slightly better performing byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. + + vzeroupper + # 4 * 32 byte stack, 32-byte aligned + lea 8(%rsp),%r10 + and $~31, %rsp + sub $0x80, %rsp + mov %rcx,%rax + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + # x0..3 on stack + vmovdqa %ymm0,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm3,0x60(%rsp) + + vmovdqa CTRINC(%rip),%ymm1 + vmovdqa ROT8(%rip),%ymm2 + vmovdqa ROT16(%rip),%ymm3 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0 + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + sub $2,%r8d + jnz .Ldoubleround8 + + # x0..15[0-3] += s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpaddd 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpbroadcastd 0x04(%rdi),%ymm0 + vpaddd 0x20(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpbroadcastd 0x08(%rdi),%ymm0 + vpaddd 0x40(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpbroadcastd 0x0c(%rdi),%ymm0 + vpaddd 0x60(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpbroadcastd 0x10(%rdi),%ymm0 + vpaddd %ymm0,%ymm4,%ymm4 + vpbroadcastd 0x14(%rdi),%ymm0 + vpaddd %ymm0,%ymm5,%ymm5 + vpbroadcastd 0x18(%rdi),%ymm0 + vpaddd %ymm0,%ymm6,%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm0 + vpaddd %ymm0,%ymm7,%ymm7 + vpbroadcastd 0x20(%rdi),%ymm0 + vpaddd %ymm0,%ymm8,%ymm8 + vpbroadcastd 0x24(%rdi),%ymm0 + vpaddd %ymm0,%ymm9,%ymm9 + vpbroadcastd 0x28(%rdi),%ymm0 + vpaddd %ymm0,%ymm10,%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm0 + vpaddd %ymm0,%ymm11,%ymm11 + vpbroadcastd 0x30(%rdi),%ymm0 + vpaddd %ymm0,%ymm12,%ymm12 + vpbroadcastd 0x34(%rdi),%ymm0 + vpaddd %ymm0,%ymm13,%ymm13 + vpbroadcastd 0x38(%rdi),%ymm0 + vpaddd %ymm0,%ymm14,%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm0 + vpaddd %ymm0,%ymm15,%ymm15 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + + # interleave 32-bit words in state n, n+1 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x20(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa 0x40(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm1,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpckldq %ymm5,%ymm0,%ymm4 + vpunpckhdq %ymm5,%ymm0,%ymm5 + vmovdqa %ymm6,%ymm0 + vpunpckldq %ymm7,%ymm0,%ymm6 + vpunpckhdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpckldq %ymm9,%ymm0,%ymm8 + vpunpckhdq %ymm9,%ymm0,%ymm9 + vmovdqa %ymm10,%ymm0 + vpunpckldq %ymm11,%ymm0,%ymm10 + vpunpckhdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpckldq %ymm13,%ymm0,%ymm12 + vpunpckhdq %ymm13,%ymm0,%ymm13 + vmovdqa %ymm14,%ymm0 + vpunpckldq %ymm15,%ymm0,%ymm14 + vpunpckhdq %ymm15,%ymm0,%ymm15 + + # interleave 64-bit words in state n, n+2 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x40(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x00(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa 0x20(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpcklqdq %ymm6,%ymm0,%ymm4 + vpunpckhqdq %ymm6,%ymm0,%ymm6 + vmovdqa %ymm5,%ymm0 + vpunpcklqdq %ymm7,%ymm0,%ymm5 + vpunpckhqdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpcklqdq %ymm10,%ymm0,%ymm8 + vpunpckhqdq %ymm10,%ymm0,%ymm10 + vmovdqa %ymm9,%ymm0 + vpunpcklqdq %ymm11,%ymm0,%ymm9 + vpunpckhqdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpcklqdq %ymm14,%ymm0,%ymm12 + vpunpckhqdq %ymm14,%ymm0,%ymm14 + vmovdqa %ymm13,%ymm0 + vpunpcklqdq %ymm15,%ymm0,%ymm13 + vpunpckhqdq %ymm15,%ymm0,%ymm15 + + # interleave 128-bit words in state n, n+4 + # xor/write first four blocks + vmovdqa 0x00(%rsp),%ymm1 + vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 + cmp $0x0020,%rax + jl .Lxorpart8 + vpxor 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0000(%rsi) + vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 + + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + cmp $0x0040,%rax + jl .Lxorpart8 + vpxor 0x0020(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0020(%rsi) + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + + vmovdqa 0x40(%rsp),%ymm1 + vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 + cmp $0x0060,%rax + jl .Lxorpart8 + vpxor 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0040(%rsi) + vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 + + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + cmp $0x0080,%rax + jl .Lxorpart8 + vpxor 0x0060(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0060(%rsi) + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + + vmovdqa 0x20(%rsp),%ymm1 + vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 + cmp $0x00a0,%rax + jl .Lxorpart8 + vpxor 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0080(%rsi) + vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 + + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + cmp $0x00c0,%rax + jl .Lxorpart8 + vpxor 0x00a0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00a0(%rsi) + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + + vmovdqa 0x60(%rsp),%ymm1 + vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 + cmp $0x00e0,%rax + jl .Lxorpart8 + vpxor 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00c0(%rsi) + vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 + + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + cmp $0x0100,%rax + jl .Lxorpart8 + vpxor 0x00e0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00e0(%rsi) + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + + # xor remaining blocks, write to output + vmovdqa %ymm4,%ymm0 + cmp $0x0120,%rax + jl .Lxorpart8 + vpxor 0x0100(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0100(%rsi) + + vmovdqa %ymm12,%ymm0 + cmp $0x0140,%rax + jl .Lxorpart8 + vpxor 0x0120(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0120(%rsi) + + vmovdqa %ymm6,%ymm0 + cmp $0x0160,%rax + jl .Lxorpart8 + vpxor 0x0140(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0140(%rsi) + + vmovdqa %ymm14,%ymm0 + cmp $0x0180,%rax + jl .Lxorpart8 + vpxor 0x0160(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0160(%rsi) + + vmovdqa %ymm5,%ymm0 + cmp $0x01a0,%rax + jl .Lxorpart8 + vpxor 0x0180(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0180(%rsi) + + vmovdqa %ymm13,%ymm0 + cmp $0x01c0,%rax + jl .Lxorpart8 + vpxor 0x01a0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01a0(%rsi) + + vmovdqa %ymm7,%ymm0 + cmp $0x01e0,%rax + jl .Lxorpart8 + vpxor 0x01c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01c0(%rsi) + + vmovdqa %ymm15,%ymm0 + cmp $0x0200,%rax + jl .Lxorpart8 + vpxor 0x01e0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01e0(%rsi) + +.Ldone8: + vzeroupper + lea -8(%r10),%rsp + ret + +.Lxorpart8: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x1f,%r9 + jz .Ldone8 + and $~0x1f,%rax + + mov %rsi,%r11 + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + jmp .Ldone8 + +ENDPROC(chacha_8block_xor_avx2) diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S new file mode 100644 index 000000000000..848f9c75fd4f --- /dev/null +++ b/arch/x86/crypto/chacha-avx512vl-x86_64.S @@ -0,0 +1,836 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions + * + * Copyright (C) 2018 Martin Willi + */ + +#include + +.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 +.align 32 +CTR2BL: .octa 0x00000000000000000000000000000000 + .octa 0x00000000000000000000000000000001 + +.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 +.align 32 +CTR4BL: .octa 0x00000000000000000000000000000002 + .octa 0x00000000000000000000000000000003 + +.section .rodata.cst32.CTR8BL, "aM", @progbits, 32 +.align 32 +CTR8BL: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.text + +ENTRY(chacha_2block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 2 data blocks output, o + # %rdx: up to 2 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts two ChaCha blocks by loading the state + # matrix twice across four AVX registers. It performs matrix operations + # on four words in each matrix in parallel, but requires shuffling to + # rearrange the words after each round. + + vzeroupper + + # x0..3[0-2] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + + vmovdqa %ymm0,%ymm8 + vmovdqa %ymm1,%ymm9 + vmovdqa %ymm2,%ymm10 + vmovdqa %ymm3,%ymm11 + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + + sub $2,%r8d + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + vpaddd %ymm8,%ymm0,%ymm7 + cmp $0x10,%rcx + jl .Lxorpart2 + vpxord 0x00(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x00(%rsi) + vextracti128 $1,%ymm7,%xmm0 + # o1 = i1 ^ (x1 + s1) + vpaddd %ymm9,%ymm1,%ymm7 + cmp $0x20,%rcx + jl .Lxorpart2 + vpxord 0x10(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x10(%rsi) + vextracti128 $1,%ymm7,%xmm1 + # o2 = i2 ^ (x2 + s2) + vpaddd %ymm10,%ymm2,%ymm7 + cmp $0x30,%rcx + jl .Lxorpart2 + vpxord 0x20(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x20(%rsi) + vextracti128 $1,%ymm7,%xmm2 + # o3 = i3 ^ (x3 + s3) + vpaddd %ymm11,%ymm3,%ymm7 + cmp $0x40,%rcx + jl .Lxorpart2 + vpxord 0x30(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x30(%rsi) + vextracti128 $1,%ymm7,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm7 + cmp $0x50,%rcx + jl .Lxorpart2 + vpxord 0x40(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x40(%rsi) + + vmovdqa %xmm1,%xmm7 + cmp $0x60,%rcx + jl .Lxorpart2 + vpxord 0x50(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x50(%rsi) + + vmovdqa %xmm2,%xmm7 + cmp $0x70,%rcx + jl .Lxorpart2 + vpxord 0x60(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x60(%rsi) + + vmovdqa %xmm3,%xmm7 + cmp $0x80,%rcx + jl .Lxorpart2 + vpxord 0x70(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x70(%rsi) + +.Ldone2: + vzeroupper + ret + +.Lxorpart2: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0xf,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0xf,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} + vpxord %xmm7,%xmm1,%xmm1 + vmovdqu8 %xmm1,(%rsi,%r9){%k1} + + jmp .Ldone2 + +ENDPROC(chacha_2block_xor_avx512vl) + +ENTRY(chacha_4block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts four ChaCha blocks by loading the state + # matrix four times across eight AVX registers. It performs matrix + # operations on four words in two matrices in parallel, sequentially + # to the operations on the four words of the other two matrices. The + # required word shuffling has a rather high latency, we can do the + # arithmetic on two matrix-pairs without much slowdown. + + vzeroupper + + # x0..3[0-4] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vmovdqa %ymm0,%ymm4 + vmovdqa %ymm1,%ymm5 + vmovdqa %ymm2,%ymm6 + vmovdqa %ymm3,%ymm7 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + vpaddd CTR4BL(%rip),%ymm7,%ymm7 + + vmovdqa %ymm0,%ymm11 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm3,%ymm14 + vmovdqa %ymm7,%ymm15 + +.Ldoubleround4: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $16,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + vpshufd $0x39,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + vpshufd $0x93,%ymm7,%ymm7 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $16,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + vpshufd $0x93,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + vpshufd $0x39,%ymm7,%ymm7 + + sub $2,%r8d + jnz .Ldoubleround4 + + # o0 = i0 ^ (x0 + s0), first block + vpaddd %ymm11,%ymm0,%ymm10 + cmp $0x10,%rcx + jl .Lxorpart4 + vpxord 0x00(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x00(%rsi) + vextracti128 $1,%ymm10,%xmm0 + # o1 = i1 ^ (x1 + s1), first block + vpaddd %ymm12,%ymm1,%ymm10 + cmp $0x20,%rcx + jl .Lxorpart4 + vpxord 0x10(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x10(%rsi) + vextracti128 $1,%ymm10,%xmm1 + # o2 = i2 ^ (x2 + s2), first block + vpaddd %ymm13,%ymm2,%ymm10 + cmp $0x30,%rcx + jl .Lxorpart4 + vpxord 0x20(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x20(%rsi) + vextracti128 $1,%ymm10,%xmm2 + # o3 = i3 ^ (x3 + s3), first block + vpaddd %ymm14,%ymm3,%ymm10 + cmp $0x40,%rcx + jl .Lxorpart4 + vpxord 0x30(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x30(%rsi) + vextracti128 $1,%ymm10,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm10 + cmp $0x50,%rcx + jl .Lxorpart4 + vpxord 0x40(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x40(%rsi) + + vmovdqa %xmm1,%xmm10 + cmp $0x60,%rcx + jl .Lxorpart4 + vpxord 0x50(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x50(%rsi) + + vmovdqa %xmm2,%xmm10 + cmp $0x70,%rcx + jl .Lxorpart4 + vpxord 0x60(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x60(%rsi) + + vmovdqa %xmm3,%xmm10 + cmp $0x80,%rcx + jl .Lxorpart4 + vpxord 0x70(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x70(%rsi) + + # o0 = i0 ^ (x0 + s0), third block + vpaddd %ymm11,%ymm4,%ymm10 + cmp $0x90,%rcx + jl .Lxorpart4 + vpxord 0x80(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x80(%rsi) + vextracti128 $1,%ymm10,%xmm4 + # o1 = i1 ^ (x1 + s1), third block + vpaddd %ymm12,%ymm5,%ymm10 + cmp $0xa0,%rcx + jl .Lxorpart4 + vpxord 0x90(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x90(%rsi) + vextracti128 $1,%ymm10,%xmm5 + # o2 = i2 ^ (x2 + s2), third block + vpaddd %ymm13,%ymm6,%ymm10 + cmp $0xb0,%rcx + jl .Lxorpart4 + vpxord 0xa0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xa0(%rsi) + vextracti128 $1,%ymm10,%xmm6 + # o3 = i3 ^ (x3 + s3), third block + vpaddd %ymm15,%ymm7,%ymm10 + cmp $0xc0,%rcx + jl .Lxorpart4 + vpxord 0xb0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xb0(%rsi) + vextracti128 $1,%ymm10,%xmm7 + + # xor and write fourth block + vmovdqa %xmm4,%xmm10 + cmp $0xd0,%rcx + jl .Lxorpart4 + vpxord 0xc0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xc0(%rsi) + + vmovdqa %xmm5,%xmm10 + cmp $0xe0,%rcx + jl .Lxorpart4 + vpxord 0xd0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xd0(%rsi) + + vmovdqa %xmm6,%xmm10 + cmp $0xf0,%rcx + jl .Lxorpart4 + vpxord 0xe0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xe0(%rsi) + + vmovdqa %xmm7,%xmm10 + cmp $0x100,%rcx + jl .Lxorpart4 + vpxord 0xf0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xf0(%rsi) + +.Ldone4: + vzeroupper + ret + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0xf,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0xf,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} + vpxord %xmm10,%xmm1,%xmm1 + vmovdqu8 %xmm1,(%rsi,%r9){%k1} + + jmp .Ldone4 + +ENDPROC(chacha_4block_xor_avx512vl) + +ENTRY(chacha_8block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 8 data blocks output, o + # %rdx: up to 8 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts eight consecutive ChaCha blocks by loading + # the state matrix in AVX registers eight times. Compared to AVX2, this + # mostly benefits from the new rotate instructions in VL and the + # additional registers. + + vzeroupper + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + + # x12 += counter values 0-3 + vpaddd CTR8BL(%rip),%ymm12,%ymm12 + + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm1,%ymm17 + vmovdqa64 %ymm2,%ymm18 + vmovdqa64 %ymm3,%ymm19 + vmovdqa64 %ymm4,%ymm20 + vmovdqa64 %ymm5,%ymm21 + vmovdqa64 %ymm6,%ymm22 + vmovdqa64 %ymm7,%ymm23 + vmovdqa64 %ymm8,%ymm24 + vmovdqa64 %ymm9,%ymm25 + vmovdqa64 %ymm10,%ymm26 + vmovdqa64 %ymm11,%ymm27 + vmovdqa64 %ymm12,%ymm28 + vmovdqa64 %ymm13,%ymm29 + vmovdqa64 %ymm14,%ymm30 + vmovdqa64 %ymm15,%ymm31 + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd %ymm0,%ymm4,%ymm0 + vpxord %ymm0,%ymm12,%ymm12 + vprold $16,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd %ymm1,%ymm5,%ymm1 + vpxord %ymm1,%ymm13,%ymm13 + vprold $16,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd %ymm2,%ymm6,%ymm2 + vpxord %ymm2,%ymm14,%ymm14 + vprold $16,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd %ymm3,%ymm7,%ymm3 + vpxord %ymm3,%ymm15,%ymm15 + vprold $16,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxord %ymm8,%ymm4,%ymm4 + vprold $12,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxord %ymm9,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd %ymm14,%ymm10,%ymm10 + vpxord %ymm10,%ymm6,%ymm6 + vprold $12,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxord %ymm11,%ymm7,%ymm7 + vprold $12,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd %ymm0,%ymm4,%ymm0 + vpxord %ymm0,%ymm12,%ymm12 + vprold $8,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd %ymm1,%ymm5,%ymm1 + vpxord %ymm1,%ymm13,%ymm13 + vprold $8,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd %ymm2,%ymm6,%ymm2 + vpxord %ymm2,%ymm14,%ymm14 + vprold $8,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd %ymm3,%ymm7,%ymm3 + vpxord %ymm3,%ymm15,%ymm15 + vprold $8,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxord %ymm8,%ymm4,%ymm4 + vprold $7,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxord %ymm9,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxord %ymm10,%ymm6,%ymm6 + vprold $7,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxord %ymm11,%ymm7,%ymm7 + vprold $7,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd %ymm0,%ymm5,%ymm0 + vpxord %ymm0,%ymm15,%ymm15 + vprold $16,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + vpaddd %ymm1,%ymm6,%ymm1 + vpxord %ymm1,%ymm12,%ymm12 + vprold $16,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd %ymm2,%ymm7,%ymm2 + vpxord %ymm2,%ymm13,%ymm13 + vprold $16,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd %ymm3,%ymm4,%ymm3 + vpxord %ymm3,%ymm14,%ymm14 + vprold $16,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxord %ymm10,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxord %ymm11,%ymm6,%ymm6 + vprold $12,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxord %ymm8,%ymm7,%ymm7 + vprold $12,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxord %ymm9,%ymm4,%ymm4 + vprold $12,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd %ymm0,%ymm5,%ymm0 + vpxord %ymm0,%ymm15,%ymm15 + vprold $8,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd %ymm1,%ymm6,%ymm1 + vpxord %ymm1,%ymm12,%ymm12 + vprold $8,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd %ymm2,%ymm7,%ymm2 + vpxord %ymm2,%ymm13,%ymm13 + vprold $8,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd %ymm3,%ymm4,%ymm3 + vpxord %ymm3,%ymm14,%ymm14 + vprold $8,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxord %ymm10,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxord %ymm11,%ymm6,%ymm6 + vprold $7,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + vpxord %ymm8,%ymm7,%ymm7 + vprold $7,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vpaddd %ymm14,%ymm9,%ymm9 + vpxord %ymm9,%ymm4,%ymm4 + vprold $7,%ymm4,%ymm4 + + sub $2,%r8d + jnz .Ldoubleround8 + + # x0..15[0-3] += s[0..15] + vpaddd %ymm16,%ymm0,%ymm0 + vpaddd %ymm17,%ymm1,%ymm1 + vpaddd %ymm18,%ymm2,%ymm2 + vpaddd %ymm19,%ymm3,%ymm3 + vpaddd %ymm20,%ymm4,%ymm4 + vpaddd %ymm21,%ymm5,%ymm5 + vpaddd %ymm22,%ymm6,%ymm6 + vpaddd %ymm23,%ymm7,%ymm7 + vpaddd %ymm24,%ymm8,%ymm8 + vpaddd %ymm25,%ymm9,%ymm9 + vpaddd %ymm26,%ymm10,%ymm10 + vpaddd %ymm27,%ymm11,%ymm11 + vpaddd %ymm28,%ymm12,%ymm12 + vpaddd %ymm29,%ymm13,%ymm13 + vpaddd %ymm30,%ymm14,%ymm14 + vpaddd %ymm31,%ymm15,%ymm15 + + # interleave 32-bit words in state n, n+1 + vpunpckldq %ymm1,%ymm0,%ymm16 + vpunpckhdq %ymm1,%ymm0,%ymm17 + vpunpckldq %ymm3,%ymm2,%ymm18 + vpunpckhdq %ymm3,%ymm2,%ymm19 + vpunpckldq %ymm5,%ymm4,%ymm20 + vpunpckhdq %ymm5,%ymm4,%ymm21 + vpunpckldq %ymm7,%ymm6,%ymm22 + vpunpckhdq %ymm7,%ymm6,%ymm23 + vpunpckldq %ymm9,%ymm8,%ymm24 + vpunpckhdq %ymm9,%ymm8,%ymm25 + vpunpckldq %ymm11,%ymm10,%ymm26 + vpunpckhdq %ymm11,%ymm10,%ymm27 + vpunpckldq %ymm13,%ymm12,%ymm28 + vpunpckhdq %ymm13,%ymm12,%ymm29 + vpunpckldq %ymm15,%ymm14,%ymm30 + vpunpckhdq %ymm15,%ymm14,%ymm31 + + # interleave 64-bit words in state n, n+2 + vpunpcklqdq %ymm18,%ymm16,%ymm0 + vpunpcklqdq %ymm19,%ymm17,%ymm1 + vpunpckhqdq %ymm18,%ymm16,%ymm2 + vpunpckhqdq %ymm19,%ymm17,%ymm3 + vpunpcklqdq %ymm22,%ymm20,%ymm4 + vpunpcklqdq %ymm23,%ymm21,%ymm5 + vpunpckhqdq %ymm22,%ymm20,%ymm6 + vpunpckhqdq %ymm23,%ymm21,%ymm7 + vpunpcklqdq %ymm26,%ymm24,%ymm8 + vpunpcklqdq %ymm27,%ymm25,%ymm9 + vpunpckhqdq %ymm26,%ymm24,%ymm10 + vpunpckhqdq %ymm27,%ymm25,%ymm11 + vpunpcklqdq %ymm30,%ymm28,%ymm12 + vpunpcklqdq %ymm31,%ymm29,%ymm13 + vpunpckhqdq %ymm30,%ymm28,%ymm14 + vpunpckhqdq %ymm31,%ymm29,%ymm15 + + # interleave 128-bit words in state n, n+4 + # xor/write first four blocks + vmovdqa64 %ymm0,%ymm16 + vperm2i128 $0x20,%ymm4,%ymm0,%ymm0 + cmp $0x0020,%rcx + jl .Lxorpart8 + vpxord 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0000(%rsi) + vmovdqa64 %ymm16,%ymm0 + vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 + + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + cmp $0x0040,%rcx + jl .Lxorpart8 + vpxord 0x0020(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0020(%rsi) + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + + vperm2i128 $0x20,%ymm6,%ymm2,%ymm0 + cmp $0x0060,%rcx + jl .Lxorpart8 + vpxord 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0040(%rsi) + vperm2i128 $0x31,%ymm6,%ymm2,%ymm6 + + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + cmp $0x0080,%rcx + jl .Lxorpart8 + vpxord 0x0060(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0060(%rsi) + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + + vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 + cmp $0x00a0,%rcx + jl .Lxorpart8 + vpxord 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0080(%rsi) + vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 + + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + cmp $0x00c0,%rcx + jl .Lxorpart8 + vpxord 0x00a0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00a0(%rsi) + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + + vperm2i128 $0x20,%ymm7,%ymm3,%ymm0 + cmp $0x00e0,%rcx + jl .Lxorpart8 + vpxord 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00c0(%rsi) + vperm2i128 $0x31,%ymm7,%ymm3,%ymm7 + + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + cmp $0x0100,%rcx + jl .Lxorpart8 + vpxord 0x00e0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00e0(%rsi) + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + + # xor remaining blocks, write to output + vmovdqa64 %ymm4,%ymm0 + cmp $0x0120,%rcx + jl .Lxorpart8 + vpxord 0x0100(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0100(%rsi) + + vmovdqa64 %ymm12,%ymm0 + cmp $0x0140,%rcx + jl .Lxorpart8 + vpxord 0x0120(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0120(%rsi) + + vmovdqa64 %ymm6,%ymm0 + cmp $0x0160,%rcx + jl .Lxorpart8 + vpxord 0x0140(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0140(%rsi) + + vmovdqa64 %ymm14,%ymm0 + cmp $0x0180,%rcx + jl .Lxorpart8 + vpxord 0x0160(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0160(%rsi) + + vmovdqa64 %ymm5,%ymm0 + cmp $0x01a0,%rcx + jl .Lxorpart8 + vpxord 0x0180(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0180(%rsi) + + vmovdqa64 %ymm13,%ymm0 + cmp $0x01c0,%rcx + jl .Lxorpart8 + vpxord 0x01a0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01a0(%rsi) + + vmovdqa64 %ymm7,%ymm0 + cmp $0x01e0,%rcx + jl .Lxorpart8 + vpxord 0x01c0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01c0(%rsi) + + vmovdqa64 %ymm15,%ymm0 + cmp $0x0200,%rcx + jl .Lxorpart8 + vpxord 0x01e0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01e0(%rsi) + +.Ldone8: + vzeroupper + ret + +.Lxorpart8: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0x1f,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0x1f,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z} + vpxord %ymm0,%ymm1,%ymm1 + vmovdqu8 %ymm1,(%rsi,%r9){%k1} + + jmp .Ldone8 + +ENDPROC(chacha_8block_xor_avx512vl) diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S new file mode 100644 index 000000000000..c05a7a963dc3 --- /dev/null +++ b/arch/x86/crypto/chacha-ssse3-x86_64.S @@ -0,0 +1,795 @@ +/* + * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include + +.section .rodata.cst16.ROT8, "aM", @progbits, 16 +.align 16 +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 +.section .rodata.cst16.ROT16, "aM", @progbits, 16 +.align 16 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 +.section .rodata.cst16.CTRINC, "aM", @progbits, 16 +.align 16 +CTRINC: .octa 0x00000003000000020000000100000000 + +.text + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This + * function performs matrix operations on four words in parallel, but requires + * shuffling to rearrange the words after each round. 8/16-bit word rotation is + * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word + * rotation uses traditional shift+OR. + * + * The round count is given in %r8d. + * + * Clobbers: %r8d, %xmm4-%xmm7 + */ +chacha_permute: + + movdqa ROT8(%rip),%xmm4 + movdqa ROT16(%rip),%xmm5 + +.Ldoubleround: + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm3,%xmm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm3,%xmm3 + + sub $2,%r8d + jnz .Ldoubleround + + ret +ENDPROC(chacha_permute) + +ENTRY(chacha_block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: up to 1 data block output, o + # %rdx: up to 1 data block input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + FRAME_BEGIN + + # x0..3 = s0..3 + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + movdqa %xmm0,%xmm8 + movdqa %xmm1,%xmm9 + movdqa %xmm2,%xmm10 + movdqa %xmm3,%xmm11 + + mov %rcx,%rax + call chacha_permute + + # o0 = i0 ^ (x0 + s0) + paddd %xmm8,%xmm0 + cmp $0x10,%rax + jl .Lxorpart + movdqu 0x00(%rdx),%xmm4 + pxor %xmm4,%xmm0 + movdqu %xmm0,0x00(%rsi) + # o1 = i1 ^ (x1 + s1) + paddd %xmm9,%xmm1 + movdqa %xmm1,%xmm0 + cmp $0x20,%rax + jl .Lxorpart + movdqu 0x10(%rdx),%xmm0 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x10(%rsi) + # o2 = i2 ^ (x2 + s2) + paddd %xmm10,%xmm2 + movdqa %xmm2,%xmm0 + cmp $0x30,%rax + jl .Lxorpart + movdqu 0x20(%rdx),%xmm0 + pxor %xmm2,%xmm0 + movdqu %xmm0,0x20(%rsi) + # o3 = i3 ^ (x3 + s3) + paddd %xmm11,%xmm3 + movdqa %xmm3,%xmm0 + cmp $0x40,%rax + jl .Lxorpart + movdqu 0x30(%rdx),%xmm0 + pxor %xmm3,%xmm0 + movdqu %xmm0,0x30(%rsi) + +.Ldone: + FRAME_END + ret + +.Lxorpart: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + pxor 0x00(%rsp),%xmm0 + movdqa %xmm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone + +ENDPROC(chacha_block_xor_ssse3) + +ENTRY(hchacha_block_ssse3) + # %rdi: Input state matrix, s + # %rsi: output (8 32-bit words) + # %edx: nrounds + FRAME_BEGIN + + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + + mov %edx,%r8d + call chacha_permute + + movdqu %xmm0,0x00(%rsi) + movdqu %xmm3,0x10(%rsi) + + FRAME_END + ret +ENDPROC(hchacha_block_ssse3) + +ENTRY(chacha_4block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts four consecutive ChaCha blocks by loading the + # the state matrix in SSE registers four times. As we need some scratch + # registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32- and then 64-bit words, + # which allows us to do XOR in SSE registers. 8/16-bit word rotation is + # done with the slightly better performing SSSE3 byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. + + lea 8(%rsp),%r10 + sub $0x80,%rsp + and $~63,%rsp + mov %rcx,%rax + + # x0..15[0-3] = s0..3[0..3] + movq 0x00(%rdi),%xmm1 + pshufd $0x00,%xmm1,%xmm0 + pshufd $0x55,%xmm1,%xmm1 + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + movq 0x10(%rdi),%xmm5 + pshufd $0x00,%xmm5,%xmm4 + pshufd $0x55,%xmm5,%xmm5 + movq 0x18(%rdi),%xmm7 + pshufd $0x00,%xmm7,%xmm6 + pshufd $0x55,%xmm7,%xmm7 + movq 0x20(%rdi),%xmm9 + pshufd $0x00,%xmm9,%xmm8 + pshufd $0x55,%xmm9,%xmm9 + movq 0x28(%rdi),%xmm11 + pshufd $0x00,%xmm11,%xmm10 + pshufd $0x55,%xmm11,%xmm11 + movq 0x30(%rdi),%xmm13 + pshufd $0x00,%xmm13,%xmm12 + pshufd $0x55,%xmm13,%xmm13 + movq 0x38(%rdi),%xmm15 + pshufd $0x00,%xmm15,%xmm14 + pshufd $0x55,%xmm15,%xmm15 + # x0..3 on stack + movdqa %xmm0,0x00(%rsp) + movdqa %xmm1,0x10(%rsp) + movdqa %xmm2,0x20(%rsp) + movdqa %xmm3,0x30(%rsp) + + movdqa CTRINC(%rip),%xmm1 + movdqa ROT8(%rip),%xmm2 + movdqa ROT16(%rip),%xmm3 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + +.Ldoubleround4: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm7 + por %xmm0,%xmm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + + sub $2,%r8d + jnz .Ldoubleround4 + + # x0[0-3] += s0[0] + # x1[0-3] += s0[1] + movq 0x00(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x00(%rsp),%xmm2 + movdqa %xmm2,0x00(%rsp) + paddd 0x10(%rsp),%xmm3 + movdqa %xmm3,0x10(%rsp) + # x2[0-3] += s0[2] + # x3[0-3] += s0[3] + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x20(%rsp),%xmm2 + movdqa %xmm2,0x20(%rsp) + paddd 0x30(%rsp),%xmm3 + movdqa %xmm3,0x30(%rsp) + + # x4[0-3] += s1[0] + # x5[0-3] += s1[1] + movq 0x10(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + # x6[0-3] += s1[2] + # x7[0-3] += s1[3] + movq 0x18(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + + # x8[0-3] += s2[0] + # x9[0-3] += s2[1] + movq 0x20(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm8 + paddd %xmm3,%xmm9 + # x10[0-3] += s2[2] + # x11[0-3] += s2[3] + movq 0x28(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm10 + paddd %xmm3,%xmm11 + + # x12[0-3] += s3[0] + # x13[0-3] += s3[1] + movq 0x30(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm12 + paddd %xmm3,%xmm13 + # x14[0-3] += s3[2] + # x15[0-3] += s3[3] + movq 0x38(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm14 + paddd %xmm3,%xmm15 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + + # interleave 32-bit words in state n, n+1 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x10(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x10(%rsp) + movdqa 0x20(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x20(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpckldq %xmm5,%xmm4 + punpckhdq %xmm5,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm6,%xmm0 + punpckldq %xmm7,%xmm6 + punpckhdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm9,%xmm0 + movdqa %xmm0,%xmm9 + movdqa %xmm10,%xmm0 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpckldq %xmm13,%xmm12 + punpckhdq %xmm13,%xmm0 + movdqa %xmm0,%xmm13 + movdqa %xmm14,%xmm0 + punpckldq %xmm15,%xmm14 + punpckhdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # interleave 64-bit words in state n, n+2 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x20(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x20(%rsp) + movdqa 0x10(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x10(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpcklqdq %xmm6,%xmm4 + punpckhqdq %xmm6,%xmm0 + movdqa %xmm0,%xmm6 + movdqa %xmm5,%xmm0 + punpcklqdq %xmm7,%xmm5 + punpckhqdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpcklqdq %xmm10,%xmm8 + punpckhqdq %xmm10,%xmm0 + movdqa %xmm0,%xmm10 + movdqa %xmm9,%xmm0 + punpcklqdq %xmm11,%xmm9 + punpckhqdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpcklqdq %xmm14,%xmm12 + punpckhqdq %xmm14,%xmm0 + movdqa %xmm0,%xmm14 + movdqa %xmm13,%xmm0 + punpcklqdq %xmm15,%xmm13 + punpckhqdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # xor with corresponding input, write to output + movdqa 0x00(%rsp),%xmm0 + cmp $0x10,%rax + jl .Lxorpart4 + movdqu 0x00(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x00(%rsi) + + movdqu %xmm4,%xmm0 + cmp $0x20,%rax + jl .Lxorpart4 + movdqu 0x10(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x10(%rsi) + + movdqu %xmm8,%xmm0 + cmp $0x30,%rax + jl .Lxorpart4 + movdqu 0x20(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x20(%rsi) + + movdqu %xmm12,%xmm0 + cmp $0x40,%rax + jl .Lxorpart4 + movdqu 0x30(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x30(%rsi) + + movdqa 0x20(%rsp),%xmm0 + cmp $0x50,%rax + jl .Lxorpart4 + movdqu 0x40(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x40(%rsi) + + movdqu %xmm6,%xmm0 + cmp $0x60,%rax + jl .Lxorpart4 + movdqu 0x50(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x50(%rsi) + + movdqu %xmm10,%xmm0 + cmp $0x70,%rax + jl .Lxorpart4 + movdqu 0x60(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x60(%rsi) + + movdqu %xmm14,%xmm0 + cmp $0x80,%rax + jl .Lxorpart4 + movdqu 0x70(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x70(%rsi) + + movdqa 0x10(%rsp),%xmm0 + cmp $0x90,%rax + jl .Lxorpart4 + movdqu 0x80(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x80(%rsi) + + movdqu %xmm5,%xmm0 + cmp $0xa0,%rax + jl .Lxorpart4 + movdqu 0x90(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x90(%rsi) + + movdqu %xmm9,%xmm0 + cmp $0xb0,%rax + jl .Lxorpart4 + movdqu 0xa0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xa0(%rsi) + + movdqu %xmm13,%xmm0 + cmp $0xc0,%rax + jl .Lxorpart4 + movdqu 0xb0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xb0(%rsi) + + movdqa 0x30(%rsp),%xmm0 + cmp $0xd0,%rax + jl .Lxorpart4 + movdqu 0xc0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xc0(%rsi) + + movdqu %xmm7,%xmm0 + cmp $0xe0,%rax + jl .Lxorpart4 + movdqu 0xd0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xd0(%rsi) + + movdqu %xmm11,%xmm0 + cmp $0xf0,%rax + jl .Lxorpart4 + movdqu 0xe0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xe0(%rsi) + + movdqu %xmm15,%xmm0 + cmp $0x100,%rax + jl .Lxorpart4 + movdqu 0xf0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xf0(%rsi) + +.Ldone4: + lea -8(%r10),%rsp + ret + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone4 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + pxor 0x00(%rsp),%xmm0 + movdqa %xmm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + jmp .Ldone4 + +ENDPROC(chacha_4block_xor_ssse3) diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S deleted file mode 100644 index b6ab082be657..000000000000 --- a/arch/x86/crypto/chacha20-avx2-x86_64.S +++ /dev/null @@ -1,1026 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include - -.section .rodata.cst32.ROT8, "aM", @progbits, 32 -.align 32 -ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 - .octa 0x0e0d0c0f0a09080b0605040702010003 - -.section .rodata.cst32.ROT16, "aM", @progbits, 32 -.align 32 -ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 - .octa 0x0d0c0f0e09080b0a0504070601000302 - -.section .rodata.cst32.CTRINC, "aM", @progbits, 32 -.align 32 -CTRINC: .octa 0x00000003000000020000000100000000 - .octa 0x00000007000000060000000500000004 - -.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 -.align 32 -CTR2BL: .octa 0x00000000000000000000000000000000 - .octa 0x00000000000000000000000000000001 - -.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 -.align 32 -CTR4BL: .octa 0x00000000000000000000000000000002 - .octa 0x00000000000000000000000000000003 - -.text - -ENTRY(chacha20_2block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 2 data blocks output, o - # %rdx: up to 2 data blocks input, i - # %rcx: input/output length in bytes - - # This function encrypts two ChaCha20 blocks by loading the state - # matrix twice across four AVX registers. It performs matrix operations - # on four words in each matrix in parallel, but requires shuffling to - # rearrange the words after each round. - - vzeroupper - - # x0..3[0-2] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - - vmovdqa %ymm0,%ymm8 - vmovdqa %ymm1,%ymm9 - vmovdqa %ymm2,%ymm10 - vmovdqa %ymm3,%ymm11 - - vmovdqa ROT8(%rip),%ymm4 - vmovdqa ROT16(%rip),%ymm5 - - mov %rcx,%rax - mov $10,%ecx - -.Ldoubleround: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm5,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm6 - vpslld $12,%ymm6,%ymm6 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm6,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm4,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm7 - vpslld $7,%ymm7,%ymm7 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm5,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm6 - vpslld $12,%ymm6,%ymm6 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm6,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm4,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm7 - vpslld $7,%ymm7,%ymm7 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - - dec %ecx - jnz .Ldoubleround - - # o0 = i0 ^ (x0 + s0) - vpaddd %ymm8,%ymm0,%ymm7 - cmp $0x10,%rax - jl .Lxorpart2 - vpxor 0x00(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x00(%rsi) - vextracti128 $1,%ymm7,%xmm0 - # o1 = i1 ^ (x1 + s1) - vpaddd %ymm9,%ymm1,%ymm7 - cmp $0x20,%rax - jl .Lxorpart2 - vpxor 0x10(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x10(%rsi) - vextracti128 $1,%ymm7,%xmm1 - # o2 = i2 ^ (x2 + s2) - vpaddd %ymm10,%ymm2,%ymm7 - cmp $0x30,%rax - jl .Lxorpart2 - vpxor 0x20(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x20(%rsi) - vextracti128 $1,%ymm7,%xmm2 - # o3 = i3 ^ (x3 + s3) - vpaddd %ymm11,%ymm3,%ymm7 - cmp $0x40,%rax - jl .Lxorpart2 - vpxor 0x30(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x30(%rsi) - vextracti128 $1,%ymm7,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm7 - cmp $0x50,%rax - jl .Lxorpart2 - vpxor 0x40(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x40(%rsi) - - vmovdqa %xmm1,%xmm7 - cmp $0x60,%rax - jl .Lxorpart2 - vpxor 0x50(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x50(%rsi) - - vmovdqa %xmm2,%xmm7 - cmp $0x70,%rax - jl .Lxorpart2 - vpxor 0x60(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x60(%rsi) - - vmovdqa %xmm3,%xmm7 - cmp $0x80,%rax - jl .Lxorpart2 - vpxor 0x70(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x70(%rsi) - -.Ldone2: - vzeroupper - ret - -.Lxorpart2: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone2 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%xmm7,%xmm7 - vmovdqa %xmm7,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone2 - -ENDPROC(chacha20_2block_xor_avx2) - -ENTRY(chacha20_4block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - - # This function encrypts four ChaCha20 block by loading the state - # matrix four times across eight AVX registers. It performs matrix - # operations on four words in two matrices in parallel, sequentially - # to the operations on the four words of the other two matrices. The - # required word shuffling has a rather high latency, we can do the - # arithmetic on two matrix-pairs without much slowdown. - - vzeroupper - - # x0..3[0-4] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vmovdqa %ymm0,%ymm4 - vmovdqa %ymm1,%ymm5 - vmovdqa %ymm2,%ymm6 - vmovdqa %ymm3,%ymm7 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - vpaddd CTR4BL(%rip),%ymm7,%ymm7 - - vmovdqa %ymm0,%ymm11 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm3,%ymm14 - vmovdqa %ymm7,%ymm15 - - vmovdqa ROT8(%rip),%ymm8 - vmovdqa ROT16(%rip),%ymm9 - - mov %rcx,%rax - mov $10,%ecx - -.Ldoubleround4: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm9,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm9,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - vpshufd $0x39,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - vpshufd $0x93,%ymm7,%ymm7 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm9,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm9,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - vpshufd $0x93,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - vpshufd $0x39,%ymm7,%ymm7 - - dec %ecx - jnz .Ldoubleround4 - - # o0 = i0 ^ (x0 + s0), first block - vpaddd %ymm11,%ymm0,%ymm10 - cmp $0x10,%rax - jl .Lxorpart4 - vpxor 0x00(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x00(%rsi) - vextracti128 $1,%ymm10,%xmm0 - # o1 = i1 ^ (x1 + s1), first block - vpaddd %ymm12,%ymm1,%ymm10 - cmp $0x20,%rax - jl .Lxorpart4 - vpxor 0x10(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x10(%rsi) - vextracti128 $1,%ymm10,%xmm1 - # o2 = i2 ^ (x2 + s2), first block - vpaddd %ymm13,%ymm2,%ymm10 - cmp $0x30,%rax - jl .Lxorpart4 - vpxor 0x20(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x20(%rsi) - vextracti128 $1,%ymm10,%xmm2 - # o3 = i3 ^ (x3 + s3), first block - vpaddd %ymm14,%ymm3,%ymm10 - cmp $0x40,%rax - jl .Lxorpart4 - vpxor 0x30(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x30(%rsi) - vextracti128 $1,%ymm10,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm10 - cmp $0x50,%rax - jl .Lxorpart4 - vpxor 0x40(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x40(%rsi) - - vmovdqa %xmm1,%xmm10 - cmp $0x60,%rax - jl .Lxorpart4 - vpxor 0x50(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x50(%rsi) - - vmovdqa %xmm2,%xmm10 - cmp $0x70,%rax - jl .Lxorpart4 - vpxor 0x60(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x60(%rsi) - - vmovdqa %xmm3,%xmm10 - cmp $0x80,%rax - jl .Lxorpart4 - vpxor 0x70(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x70(%rsi) - - # o0 = i0 ^ (x0 + s0), third block - vpaddd %ymm11,%ymm4,%ymm10 - cmp $0x90,%rax - jl .Lxorpart4 - vpxor 0x80(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x80(%rsi) - vextracti128 $1,%ymm10,%xmm4 - # o1 = i1 ^ (x1 + s1), third block - vpaddd %ymm12,%ymm5,%ymm10 - cmp $0xa0,%rax - jl .Lxorpart4 - vpxor 0x90(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x90(%rsi) - vextracti128 $1,%ymm10,%xmm5 - # o2 = i2 ^ (x2 + s2), third block - vpaddd %ymm13,%ymm6,%ymm10 - cmp $0xb0,%rax - jl .Lxorpart4 - vpxor 0xa0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xa0(%rsi) - vextracti128 $1,%ymm10,%xmm6 - # o3 = i3 ^ (x3 + s3), third block - vpaddd %ymm15,%ymm7,%ymm10 - cmp $0xc0,%rax - jl .Lxorpart4 - vpxor 0xb0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xb0(%rsi) - vextracti128 $1,%ymm10,%xmm7 - - # xor and write fourth block - vmovdqa %xmm4,%xmm10 - cmp $0xd0,%rax - jl .Lxorpart4 - vpxor 0xc0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xc0(%rsi) - - vmovdqa %xmm5,%xmm10 - cmp $0xe0,%rax - jl .Lxorpart4 - vpxor 0xd0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xd0(%rsi) - - vmovdqa %xmm6,%xmm10 - cmp $0xf0,%rax - jl .Lxorpart4 - vpxor 0xe0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xe0(%rsi) - - vmovdqa %xmm7,%xmm10 - cmp $0x100,%rax - jl .Lxorpart4 - vpxor 0xf0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xf0(%rsi) - -.Ldone4: - vzeroupper - ret - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone4 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%xmm10,%xmm10 - vmovdqa %xmm10,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone4 - -ENDPROC(chacha20_4block_xor_avx2) - -ENTRY(chacha20_8block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 8 data blocks output, o - # %rdx: up to 8 data blocks input, i - # %rcx: input/output length in bytes - - # This function encrypts eight consecutive ChaCha20 blocks by loading - # the state matrix in AVX registers eight times. As we need some - # scratch registers, we save the first four registers on the stack. The - # algorithm performs each operation on the corresponding word of each - # state matrix, hence requires no word shuffling. For final XORing step - # we transpose the matrix by interleaving 32-, 64- and then 128-bit - # words, which allows us to do XOR in AVX registers. 8/16-bit word - # rotation is done with the slightly better performing byte shuffling, - # 7/12-bit word rotation uses traditional shift+OR. - - vzeroupper - # 4 * 32 byte stack, 32-byte aligned - lea 8(%rsp),%r10 - and $~31, %rsp - sub $0x80, %rsp - mov %rcx,%rax - - # x0..15[0-7] = s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpbroadcastd 0x04(%rdi),%ymm1 - vpbroadcastd 0x08(%rdi),%ymm2 - vpbroadcastd 0x0c(%rdi),%ymm3 - vpbroadcastd 0x10(%rdi),%ymm4 - vpbroadcastd 0x14(%rdi),%ymm5 - vpbroadcastd 0x18(%rdi),%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm7 - vpbroadcastd 0x20(%rdi),%ymm8 - vpbroadcastd 0x24(%rdi),%ymm9 - vpbroadcastd 0x28(%rdi),%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm11 - vpbroadcastd 0x30(%rdi),%ymm12 - vpbroadcastd 0x34(%rdi),%ymm13 - vpbroadcastd 0x38(%rdi),%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm15 - # x0..3 on stack - vmovdqa %ymm0,0x00(%rsp) - vmovdqa %ymm1,0x20(%rsp) - vmovdqa %ymm2,0x40(%rsp) - vmovdqa %ymm3,0x60(%rsp) - - vmovdqa CTRINC(%rip),%ymm1 - vmovdqa ROT8(%rip),%ymm2 - vmovdqa ROT16(%rip),%ymm3 - - # x12 += counter values 0-3 - vpaddd %ymm1,%ymm12,%ymm12 - - mov $10,%ecx - -.Ldoubleround8: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - vpaddd 0x00(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm3,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - vpaddd 0x20(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm3,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - vpaddd 0x40(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm3,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vpaddd 0x60(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm3,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $12,%ymm4,%ymm0 - vpsrld $20,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $12,%ymm5,%ymm0 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $12,%ymm6,%ymm0 - vpsrld $20,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm11,%ymm7,%ymm7 - vpslld $12,%ymm7,%ymm0 - vpsrld $20,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - vpaddd 0x00(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm2,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - vpaddd 0x20(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm2,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - vpaddd 0x40(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm2,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vpaddd 0x60(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm2,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm0 - vpsrld $25,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm0 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm0 - vpsrld $25,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm11,%ymm7,%ymm7 - vpslld $7,%ymm7,%ymm0 - vpsrld $25,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - vpaddd 0x00(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm3,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0 - vpaddd 0x20(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm3,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - vpaddd 0x40(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm3,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vpaddd 0x60(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm3,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - vpaddd %ymm15,%ymm10,%ymm10 - vpxor %ymm10,%ymm5,%ymm5 - vpslld $12,%ymm5,%ymm0 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - vpaddd %ymm12,%ymm11,%ymm11 - vpxor %ymm11,%ymm6,%ymm6 - vpslld $12,%ymm6,%ymm0 - vpsrld $20,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - vpaddd %ymm13,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpslld $12,%ymm7,%ymm0 - vpsrld $20,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm9,%ymm4,%ymm4 - vpslld $12,%ymm4,%ymm0 - vpsrld $20,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - vpaddd 0x00(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm2,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - vpaddd 0x20(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm2,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - vpaddd 0x40(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm2,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vpaddd 0x60(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm2,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - vpaddd %ymm15,%ymm10,%ymm10 - vpxor %ymm10,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm0 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - vpaddd %ymm12,%ymm11,%ymm11 - vpxor %ymm11,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm0 - vpsrld $25,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - vpaddd %ymm13,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpslld $7,%ymm7,%ymm0 - vpsrld $25,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm9,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm0 - vpsrld $25,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - - dec %ecx - jnz .Ldoubleround8 - - # x0..15[0-3] += s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpaddd 0x00(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpbroadcastd 0x04(%rdi),%ymm0 - vpaddd 0x20(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpbroadcastd 0x08(%rdi),%ymm0 - vpaddd 0x40(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpbroadcastd 0x0c(%rdi),%ymm0 - vpaddd 0x60(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpbroadcastd 0x10(%rdi),%ymm0 - vpaddd %ymm0,%ymm4,%ymm4 - vpbroadcastd 0x14(%rdi),%ymm0 - vpaddd %ymm0,%ymm5,%ymm5 - vpbroadcastd 0x18(%rdi),%ymm0 - vpaddd %ymm0,%ymm6,%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm0 - vpaddd %ymm0,%ymm7,%ymm7 - vpbroadcastd 0x20(%rdi),%ymm0 - vpaddd %ymm0,%ymm8,%ymm8 - vpbroadcastd 0x24(%rdi),%ymm0 - vpaddd %ymm0,%ymm9,%ymm9 - vpbroadcastd 0x28(%rdi),%ymm0 - vpaddd %ymm0,%ymm10,%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm0 - vpaddd %ymm0,%ymm11,%ymm11 - vpbroadcastd 0x30(%rdi),%ymm0 - vpaddd %ymm0,%ymm12,%ymm12 - vpbroadcastd 0x34(%rdi),%ymm0 - vpaddd %ymm0,%ymm13,%ymm13 - vpbroadcastd 0x38(%rdi),%ymm0 - vpaddd %ymm0,%ymm14,%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm0 - vpaddd %ymm0,%ymm15,%ymm15 - - # x12 += counter values 0-3 - vpaddd %ymm1,%ymm12,%ymm12 - - # interleave 32-bit words in state n, n+1 - vmovdqa 0x00(%rsp),%ymm0 - vmovdqa 0x20(%rsp),%ymm1 - vpunpckldq %ymm1,%ymm0,%ymm2 - vpunpckhdq %ymm1,%ymm0,%ymm1 - vmovdqa %ymm2,0x00(%rsp) - vmovdqa %ymm1,0x20(%rsp) - vmovdqa 0x40(%rsp),%ymm0 - vmovdqa 0x60(%rsp),%ymm1 - vpunpckldq %ymm1,%ymm0,%ymm2 - vpunpckhdq %ymm1,%ymm0,%ymm1 - vmovdqa %ymm2,0x40(%rsp) - vmovdqa %ymm1,0x60(%rsp) - vmovdqa %ymm4,%ymm0 - vpunpckldq %ymm5,%ymm0,%ymm4 - vpunpckhdq %ymm5,%ymm0,%ymm5 - vmovdqa %ymm6,%ymm0 - vpunpckldq %ymm7,%ymm0,%ymm6 - vpunpckhdq %ymm7,%ymm0,%ymm7 - vmovdqa %ymm8,%ymm0 - vpunpckldq %ymm9,%ymm0,%ymm8 - vpunpckhdq %ymm9,%ymm0,%ymm9 - vmovdqa %ymm10,%ymm0 - vpunpckldq %ymm11,%ymm0,%ymm10 - vpunpckhdq %ymm11,%ymm0,%ymm11 - vmovdqa %ymm12,%ymm0 - vpunpckldq %ymm13,%ymm0,%ymm12 - vpunpckhdq %ymm13,%ymm0,%ymm13 - vmovdqa %ymm14,%ymm0 - vpunpckldq %ymm15,%ymm0,%ymm14 - vpunpckhdq %ymm15,%ymm0,%ymm15 - - # interleave 64-bit words in state n, n+2 - vmovdqa 0x00(%rsp),%ymm0 - vmovdqa 0x40(%rsp),%ymm2 - vpunpcklqdq %ymm2,%ymm0,%ymm1 - vpunpckhqdq %ymm2,%ymm0,%ymm2 - vmovdqa %ymm1,0x00(%rsp) - vmovdqa %ymm2,0x40(%rsp) - vmovdqa 0x20(%rsp),%ymm0 - vmovdqa 0x60(%rsp),%ymm2 - vpunpcklqdq %ymm2,%ymm0,%ymm1 - vpunpckhqdq %ymm2,%ymm0,%ymm2 - vmovdqa %ymm1,0x20(%rsp) - vmovdqa %ymm2,0x60(%rsp) - vmovdqa %ymm4,%ymm0 - vpunpcklqdq %ymm6,%ymm0,%ymm4 - vpunpckhqdq %ymm6,%ymm0,%ymm6 - vmovdqa %ymm5,%ymm0 - vpunpcklqdq %ymm7,%ymm0,%ymm5 - vpunpckhqdq %ymm7,%ymm0,%ymm7 - vmovdqa %ymm8,%ymm0 - vpunpcklqdq %ymm10,%ymm0,%ymm8 - vpunpckhqdq %ymm10,%ymm0,%ymm10 - vmovdqa %ymm9,%ymm0 - vpunpcklqdq %ymm11,%ymm0,%ymm9 - vpunpckhqdq %ymm11,%ymm0,%ymm11 - vmovdqa %ymm12,%ymm0 - vpunpcklqdq %ymm14,%ymm0,%ymm12 - vpunpckhqdq %ymm14,%ymm0,%ymm14 - vmovdqa %ymm13,%ymm0 - vpunpcklqdq %ymm15,%ymm0,%ymm13 - vpunpckhqdq %ymm15,%ymm0,%ymm15 - - # interleave 128-bit words in state n, n+4 - # xor/write first four blocks - vmovdqa 0x00(%rsp),%ymm1 - vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 - cmp $0x0020,%rax - jl .Lxorpart8 - vpxor 0x0000(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0000(%rsi) - vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 - - vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 - cmp $0x0040,%rax - jl .Lxorpart8 - vpxor 0x0020(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0020(%rsi) - vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - - vmovdqa 0x40(%rsp),%ymm1 - vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 - cmp $0x0060,%rax - jl .Lxorpart8 - vpxor 0x0040(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0040(%rsi) - vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 - - vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 - cmp $0x0080,%rax - jl .Lxorpart8 - vpxor 0x0060(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0060(%rsi) - vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 - - vmovdqa 0x20(%rsp),%ymm1 - vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 - cmp $0x00a0,%rax - jl .Lxorpart8 - vpxor 0x0080(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0080(%rsi) - vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 - - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - cmp $0x00c0,%rax - jl .Lxorpart8 - vpxor 0x00a0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00a0(%rsi) - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - - vmovdqa 0x60(%rsp),%ymm1 - vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 - cmp $0x00e0,%rax - jl .Lxorpart8 - vpxor 0x00c0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00c0(%rsi) - vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 - - vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 - cmp $0x0100,%rax - jl .Lxorpart8 - vpxor 0x00e0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00e0(%rsi) - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - - # xor remaining blocks, write to output - vmovdqa %ymm4,%ymm0 - cmp $0x0120,%rax - jl .Lxorpart8 - vpxor 0x0100(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0100(%rsi) - - vmovdqa %ymm12,%ymm0 - cmp $0x0140,%rax - jl .Lxorpart8 - vpxor 0x0120(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0120(%rsi) - - vmovdqa %ymm6,%ymm0 - cmp $0x0160,%rax - jl .Lxorpart8 - vpxor 0x0140(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0140(%rsi) - - vmovdqa %ymm14,%ymm0 - cmp $0x0180,%rax - jl .Lxorpart8 - vpxor 0x0160(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0160(%rsi) - - vmovdqa %ymm5,%ymm0 - cmp $0x01a0,%rax - jl .Lxorpart8 - vpxor 0x0180(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0180(%rsi) - - vmovdqa %ymm13,%ymm0 - cmp $0x01c0,%rax - jl .Lxorpart8 - vpxor 0x01a0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01a0(%rsi) - - vmovdqa %ymm7,%ymm0 - cmp $0x01e0,%rax - jl .Lxorpart8 - vpxor 0x01c0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01c0(%rsi) - - vmovdqa %ymm15,%ymm0 - cmp $0x0200,%rax - jl .Lxorpart8 - vpxor 0x01e0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01e0(%rsi) - -.Ldone8: - vzeroupper - lea -8(%r10),%rsp - ret - -.Lxorpart8: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x1f,%r9 - jz .Ldone8 - and $~0x1f,%rax - - mov %rsi,%r11 - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - jmp .Ldone8 - -ENDPROC(chacha20_8block_xor_avx2) diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S b/arch/x86/crypto/chacha20-avx512vl-x86_64.S deleted file mode 100644 index 55d34de29e3e..000000000000 --- a/arch/x86/crypto/chacha20-avx512vl-x86_64.S +++ /dev/null @@ -1,839 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX-512VL functions - * - * Copyright (C) 2018 Martin Willi - */ - -#include - -.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 -.align 32 -CTR2BL: .octa 0x00000000000000000000000000000000 - .octa 0x00000000000000000000000000000001 - -.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 -.align 32 -CTR4BL: .octa 0x00000000000000000000000000000002 - .octa 0x00000000000000000000000000000003 - -.section .rodata.cst32.CTR8BL, "aM", @progbits, 32 -.align 32 -CTR8BL: .octa 0x00000003000000020000000100000000 - .octa 0x00000007000000060000000500000004 - -.text - -ENTRY(chacha20_2block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 2 data blocks output, o - # %rdx: up to 2 data blocks input, i - # %rcx: input/output length in bytes - - # This function encrypts two ChaCha20 blocks by loading the state - # matrix twice across four AVX registers. It performs matrix operations - # on four words in each matrix in parallel, but requires shuffling to - # rearrange the words after each round. - - vzeroupper - - # x0..3[0-2] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - - vmovdqa %ymm0,%ymm8 - vmovdqa %ymm1,%ymm9 - vmovdqa %ymm2,%ymm10 - vmovdqa %ymm3,%ymm11 - - mov $10,%rax - -.Ldoubleround: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - - dec %rax - jnz .Ldoubleround - - # o0 = i0 ^ (x0 + s0) - vpaddd %ymm8,%ymm0,%ymm7 - cmp $0x10,%rcx - jl .Lxorpart2 - vpxord 0x00(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x00(%rsi) - vextracti128 $1,%ymm7,%xmm0 - # o1 = i1 ^ (x1 + s1) - vpaddd %ymm9,%ymm1,%ymm7 - cmp $0x20,%rcx - jl .Lxorpart2 - vpxord 0x10(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x10(%rsi) - vextracti128 $1,%ymm7,%xmm1 - # o2 = i2 ^ (x2 + s2) - vpaddd %ymm10,%ymm2,%ymm7 - cmp $0x30,%rcx - jl .Lxorpart2 - vpxord 0x20(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x20(%rsi) - vextracti128 $1,%ymm7,%xmm2 - # o3 = i3 ^ (x3 + s3) - vpaddd %ymm11,%ymm3,%ymm7 - cmp $0x40,%rcx - jl .Lxorpart2 - vpxord 0x30(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x30(%rsi) - vextracti128 $1,%ymm7,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm7 - cmp $0x50,%rcx - jl .Lxorpart2 - vpxord 0x40(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x40(%rsi) - - vmovdqa %xmm1,%xmm7 - cmp $0x60,%rcx - jl .Lxorpart2 - vpxord 0x50(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x50(%rsi) - - vmovdqa %xmm2,%xmm7 - cmp $0x70,%rcx - jl .Lxorpart2 - vpxord 0x60(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x60(%rsi) - - vmovdqa %xmm3,%xmm7 - cmp $0x80,%rcx - jl .Lxorpart2 - vpxord 0x70(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x70(%rsi) - -.Ldone2: - vzeroupper - ret - -.Lxorpart2: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0xf,%rcx - jz .Ldone8 - mov %rax,%r9 - and $~0xf,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} - vpxord %xmm7,%xmm1,%xmm1 - vmovdqu8 %xmm1,(%rsi,%r9){%k1} - - jmp .Ldone2 - -ENDPROC(chacha20_2block_xor_avx512vl) - -ENTRY(chacha20_4block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - - # This function encrypts four ChaCha20 block by loading the state - # matrix four times across eight AVX registers. It performs matrix - # operations on four words in two matrices in parallel, sequentially - # to the operations on the four words of the other two matrices. The - # required word shuffling has a rather high latency, we can do the - # arithmetic on two matrix-pairs without much slowdown. - - vzeroupper - - # x0..3[0-4] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vmovdqa %ymm0,%ymm4 - vmovdqa %ymm1,%ymm5 - vmovdqa %ymm2,%ymm6 - vmovdqa %ymm3,%ymm7 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - vpaddd CTR4BL(%rip),%ymm7,%ymm7 - - vmovdqa %ymm0,%ymm11 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm3,%ymm14 - vmovdqa %ymm7,%ymm15 - - mov $10,%rax - -.Ldoubleround4: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $16,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - vpshufd $0x39,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - vpshufd $0x93,%ymm7,%ymm7 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $16,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - vpshufd $0x93,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - vpshufd $0x39,%ymm7,%ymm7 - - dec %rax - jnz .Ldoubleround4 - - # o0 = i0 ^ (x0 + s0), first block - vpaddd %ymm11,%ymm0,%ymm10 - cmp $0x10,%rcx - jl .Lxorpart4 - vpxord 0x00(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x00(%rsi) - vextracti128 $1,%ymm10,%xmm0 - # o1 = i1 ^ (x1 + s1), first block - vpaddd %ymm12,%ymm1,%ymm10 - cmp $0x20,%rcx - jl .Lxorpart4 - vpxord 0x10(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x10(%rsi) - vextracti128 $1,%ymm10,%xmm1 - # o2 = i2 ^ (x2 + s2), first block - vpaddd %ymm13,%ymm2,%ymm10 - cmp $0x30,%rcx - jl .Lxorpart4 - vpxord 0x20(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x20(%rsi) - vextracti128 $1,%ymm10,%xmm2 - # o3 = i3 ^ (x3 + s3), first block - vpaddd %ymm14,%ymm3,%ymm10 - cmp $0x40,%rcx - jl .Lxorpart4 - vpxord 0x30(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x30(%rsi) - vextracti128 $1,%ymm10,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm10 - cmp $0x50,%rcx - jl .Lxorpart4 - vpxord 0x40(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x40(%rsi) - - vmovdqa %xmm1,%xmm10 - cmp $0x60,%rcx - jl .Lxorpart4 - vpxord 0x50(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x50(%rsi) - - vmovdqa %xmm2,%xmm10 - cmp $0x70,%rcx - jl .Lxorpart4 - vpxord 0x60(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x60(%rsi) - - vmovdqa %xmm3,%xmm10 - cmp $0x80,%rcx - jl .Lxorpart4 - vpxord 0x70(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x70(%rsi) - - # o0 = i0 ^ (x0 + s0), third block - vpaddd %ymm11,%ymm4,%ymm10 - cmp $0x90,%rcx - jl .Lxorpart4 - vpxord 0x80(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x80(%rsi) - vextracti128 $1,%ymm10,%xmm4 - # o1 = i1 ^ (x1 + s1), third block - vpaddd %ymm12,%ymm5,%ymm10 - cmp $0xa0,%rcx - jl .Lxorpart4 - vpxord 0x90(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x90(%rsi) - vextracti128 $1,%ymm10,%xmm5 - # o2 = i2 ^ (x2 + s2), third block - vpaddd %ymm13,%ymm6,%ymm10 - cmp $0xb0,%rcx - jl .Lxorpart4 - vpxord 0xa0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xa0(%rsi) - vextracti128 $1,%ymm10,%xmm6 - # o3 = i3 ^ (x3 + s3), third block - vpaddd %ymm15,%ymm7,%ymm10 - cmp $0xc0,%rcx - jl .Lxorpart4 - vpxord 0xb0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xb0(%rsi) - vextracti128 $1,%ymm10,%xmm7 - - # xor and write fourth block - vmovdqa %xmm4,%xmm10 - cmp $0xd0,%rcx - jl .Lxorpart4 - vpxord 0xc0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xc0(%rsi) - - vmovdqa %xmm5,%xmm10 - cmp $0xe0,%rcx - jl .Lxorpart4 - vpxord 0xd0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xd0(%rsi) - - vmovdqa %xmm6,%xmm10 - cmp $0xf0,%rcx - jl .Lxorpart4 - vpxord 0xe0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xe0(%rsi) - - vmovdqa %xmm7,%xmm10 - cmp $0x100,%rcx - jl .Lxorpart4 - vpxord 0xf0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xf0(%rsi) - -.Ldone4: - vzeroupper - ret - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0xf,%rcx - jz .Ldone8 - mov %rax,%r9 - and $~0xf,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} - vpxord %xmm10,%xmm1,%xmm1 - vmovdqu8 %xmm1,(%rsi,%r9){%k1} - - jmp .Ldone4 - -ENDPROC(chacha20_4block_xor_avx512vl) - -ENTRY(chacha20_8block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 8 data blocks output, o - # %rdx: up to 8 data blocks input, i - # %rcx: input/output length in bytes - - # This function encrypts eight consecutive ChaCha20 blocks by loading - # the state matrix in AVX registers eight times. Compared to AVX2, this - # mostly benefits from the new rotate instructions in VL and the - # additional registers. - - vzeroupper - - # x0..15[0-7] = s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpbroadcastd 0x04(%rdi),%ymm1 - vpbroadcastd 0x08(%rdi),%ymm2 - vpbroadcastd 0x0c(%rdi),%ymm3 - vpbroadcastd 0x10(%rdi),%ymm4 - vpbroadcastd 0x14(%rdi),%ymm5 - vpbroadcastd 0x18(%rdi),%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm7 - vpbroadcastd 0x20(%rdi),%ymm8 - vpbroadcastd 0x24(%rdi),%ymm9 - vpbroadcastd 0x28(%rdi),%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm11 - vpbroadcastd 0x30(%rdi),%ymm12 - vpbroadcastd 0x34(%rdi),%ymm13 - vpbroadcastd 0x38(%rdi),%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm15 - - # x12 += counter values 0-3 - vpaddd CTR8BL(%rip),%ymm12,%ymm12 - - vmovdqa64 %ymm0,%ymm16 - vmovdqa64 %ymm1,%ymm17 - vmovdqa64 %ymm2,%ymm18 - vmovdqa64 %ymm3,%ymm19 - vmovdqa64 %ymm4,%ymm20 - vmovdqa64 %ymm5,%ymm21 - vmovdqa64 %ymm6,%ymm22 - vmovdqa64 %ymm7,%ymm23 - vmovdqa64 %ymm8,%ymm24 - vmovdqa64 %ymm9,%ymm25 - vmovdqa64 %ymm10,%ymm26 - vmovdqa64 %ymm11,%ymm27 - vmovdqa64 %ymm12,%ymm28 - vmovdqa64 %ymm13,%ymm29 - vmovdqa64 %ymm14,%ymm30 - vmovdqa64 %ymm15,%ymm31 - - mov $10,%eax - -.Ldoubleround8: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - vpaddd %ymm0,%ymm4,%ymm0 - vpxord %ymm0,%ymm12,%ymm12 - vprold $16,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - vpaddd %ymm1,%ymm5,%ymm1 - vpxord %ymm1,%ymm13,%ymm13 - vprold $16,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - vpaddd %ymm2,%ymm6,%ymm2 - vpxord %ymm2,%ymm14,%ymm14 - vprold $16,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vpaddd %ymm3,%ymm7,%ymm3 - vpxord %ymm3,%ymm15,%ymm15 - vprold $16,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - vpaddd %ymm12,%ymm8,%ymm8 - vpxord %ymm8,%ymm4,%ymm4 - vprold $12,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - vpaddd %ymm13,%ymm9,%ymm9 - vpxord %ymm9,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - vpaddd %ymm14,%ymm10,%ymm10 - vpxord %ymm10,%ymm6,%ymm6 - vprold $12,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vpaddd %ymm15,%ymm11,%ymm11 - vpxord %ymm11,%ymm7,%ymm7 - vprold $12,%ymm7,%ymm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - vpaddd %ymm0,%ymm4,%ymm0 - vpxord %ymm0,%ymm12,%ymm12 - vprold $8,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - vpaddd %ymm1,%ymm5,%ymm1 - vpxord %ymm1,%ymm13,%ymm13 - vprold $8,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - vpaddd %ymm2,%ymm6,%ymm2 - vpxord %ymm2,%ymm14,%ymm14 - vprold $8,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vpaddd %ymm3,%ymm7,%ymm3 - vpxord %ymm3,%ymm15,%ymm15 - vprold $8,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - vpaddd %ymm12,%ymm8,%ymm8 - vpxord %ymm8,%ymm4,%ymm4 - vprold $7,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - vpaddd %ymm13,%ymm9,%ymm9 - vpxord %ymm9,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - vpaddd %ymm14,%ymm10,%ymm10 - vpxord %ymm10,%ymm6,%ymm6 - vprold $7,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vpaddd %ymm15,%ymm11,%ymm11 - vpxord %ymm11,%ymm7,%ymm7 - vprold $7,%ymm7,%ymm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - vpaddd %ymm0,%ymm5,%ymm0 - vpxord %ymm0,%ymm15,%ymm15 - vprold $16,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16) - vpaddd %ymm1,%ymm6,%ymm1 - vpxord %ymm1,%ymm12,%ymm12 - vprold $16,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - vpaddd %ymm2,%ymm7,%ymm2 - vpxord %ymm2,%ymm13,%ymm13 - vprold $16,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vpaddd %ymm3,%ymm4,%ymm3 - vpxord %ymm3,%ymm14,%ymm14 - vprold $16,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - vpaddd %ymm15,%ymm10,%ymm10 - vpxord %ymm10,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - vpaddd %ymm12,%ymm11,%ymm11 - vpxord %ymm11,%ymm6,%ymm6 - vprold $12,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - vpaddd %ymm13,%ymm8,%ymm8 - vpxord %ymm8,%ymm7,%ymm7 - vprold $12,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vpaddd %ymm14,%ymm9,%ymm9 - vpxord %ymm9,%ymm4,%ymm4 - vprold $12,%ymm4,%ymm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - vpaddd %ymm0,%ymm5,%ymm0 - vpxord %ymm0,%ymm15,%ymm15 - vprold $8,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - vpaddd %ymm1,%ymm6,%ymm1 - vpxord %ymm1,%ymm12,%ymm12 - vprold $8,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - vpaddd %ymm2,%ymm7,%ymm2 - vpxord %ymm2,%ymm13,%ymm13 - vprold $8,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vpaddd %ymm3,%ymm4,%ymm3 - vpxord %ymm3,%ymm14,%ymm14 - vprold $8,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - vpaddd %ymm15,%ymm10,%ymm10 - vpxord %ymm10,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - vpaddd %ymm12,%ymm11,%ymm11 - vpxord %ymm11,%ymm6,%ymm6 - vprold $7,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - vpaddd %ymm13,%ymm8,%ymm8 - vpxord %ymm8,%ymm7,%ymm7 - vprold $7,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vpaddd %ymm14,%ymm9,%ymm9 - vpxord %ymm9,%ymm4,%ymm4 - vprold $7,%ymm4,%ymm4 - - dec %eax - jnz .Ldoubleround8 - - # x0..15[0-3] += s[0..15] - vpaddd %ymm16,%ymm0,%ymm0 - vpaddd %ymm17,%ymm1,%ymm1 - vpaddd %ymm18,%ymm2,%ymm2 - vpaddd %ymm19,%ymm3,%ymm3 - vpaddd %ymm20,%ymm4,%ymm4 - vpaddd %ymm21,%ymm5,%ymm5 - vpaddd %ymm22,%ymm6,%ymm6 - vpaddd %ymm23,%ymm7,%ymm7 - vpaddd %ymm24,%ymm8,%ymm8 - vpaddd %ymm25,%ymm9,%ymm9 - vpaddd %ymm26,%ymm10,%ymm10 - vpaddd %ymm27,%ymm11,%ymm11 - vpaddd %ymm28,%ymm12,%ymm12 - vpaddd %ymm29,%ymm13,%ymm13 - vpaddd %ymm30,%ymm14,%ymm14 - vpaddd %ymm31,%ymm15,%ymm15 - - # interleave 32-bit words in state n, n+1 - vpunpckldq %ymm1,%ymm0,%ymm16 - vpunpckhdq %ymm1,%ymm0,%ymm17 - vpunpckldq %ymm3,%ymm2,%ymm18 - vpunpckhdq %ymm3,%ymm2,%ymm19 - vpunpckldq %ymm5,%ymm4,%ymm20 - vpunpckhdq %ymm5,%ymm4,%ymm21 - vpunpckldq %ymm7,%ymm6,%ymm22 - vpunpckhdq %ymm7,%ymm6,%ymm23 - vpunpckldq %ymm9,%ymm8,%ymm24 - vpunpckhdq %ymm9,%ymm8,%ymm25 - vpunpckldq %ymm11,%ymm10,%ymm26 - vpunpckhdq %ymm11,%ymm10,%ymm27 - vpunpckldq %ymm13,%ymm12,%ymm28 - vpunpckhdq %ymm13,%ymm12,%ymm29 - vpunpckldq %ymm15,%ymm14,%ymm30 - vpunpckhdq %ymm15,%ymm14,%ymm31 - - # interleave 64-bit words in state n, n+2 - vpunpcklqdq %ymm18,%ymm16,%ymm0 - vpunpcklqdq %ymm19,%ymm17,%ymm1 - vpunpckhqdq %ymm18,%ymm16,%ymm2 - vpunpckhqdq %ymm19,%ymm17,%ymm3 - vpunpcklqdq %ymm22,%ymm20,%ymm4 - vpunpcklqdq %ymm23,%ymm21,%ymm5 - vpunpckhqdq %ymm22,%ymm20,%ymm6 - vpunpckhqdq %ymm23,%ymm21,%ymm7 - vpunpcklqdq %ymm26,%ymm24,%ymm8 - vpunpcklqdq %ymm27,%ymm25,%ymm9 - vpunpckhqdq %ymm26,%ymm24,%ymm10 - vpunpckhqdq %ymm27,%ymm25,%ymm11 - vpunpcklqdq %ymm30,%ymm28,%ymm12 - vpunpcklqdq %ymm31,%ymm29,%ymm13 - vpunpckhqdq %ymm30,%ymm28,%ymm14 - vpunpckhqdq %ymm31,%ymm29,%ymm15 - - # interleave 128-bit words in state n, n+4 - # xor/write first four blocks - vmovdqa64 %ymm0,%ymm16 - vperm2i128 $0x20,%ymm4,%ymm0,%ymm0 - cmp $0x0020,%rcx - jl .Lxorpart8 - vpxord 0x0000(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0000(%rsi) - vmovdqa64 %ymm16,%ymm0 - vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 - - vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 - cmp $0x0040,%rcx - jl .Lxorpart8 - vpxord 0x0020(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0020(%rsi) - vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - - vperm2i128 $0x20,%ymm6,%ymm2,%ymm0 - cmp $0x0060,%rcx - jl .Lxorpart8 - vpxord 0x0040(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0040(%rsi) - vperm2i128 $0x31,%ymm6,%ymm2,%ymm6 - - vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 - cmp $0x0080,%rcx - jl .Lxorpart8 - vpxord 0x0060(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0060(%rsi) - vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 - - vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 - cmp $0x00a0,%rcx - jl .Lxorpart8 - vpxord 0x0080(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0080(%rsi) - vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 - - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - cmp $0x00c0,%rcx - jl .Lxorpart8 - vpxord 0x00a0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00a0(%rsi) - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - - vperm2i128 $0x20,%ymm7,%ymm3,%ymm0 - cmp $0x00e0,%rcx - jl .Lxorpart8 - vpxord 0x00c0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00c0(%rsi) - vperm2i128 $0x31,%ymm7,%ymm3,%ymm7 - - vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 - cmp $0x0100,%rcx - jl .Lxorpart8 - vpxord 0x00e0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00e0(%rsi) - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - - # xor remaining blocks, write to output - vmovdqa64 %ymm4,%ymm0 - cmp $0x0120,%rcx - jl .Lxorpart8 - vpxord 0x0100(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0100(%rsi) - - vmovdqa64 %ymm12,%ymm0 - cmp $0x0140,%rcx - jl .Lxorpart8 - vpxord 0x0120(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0120(%rsi) - - vmovdqa64 %ymm6,%ymm0 - cmp $0x0160,%rcx - jl .Lxorpart8 - vpxord 0x0140(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0140(%rsi) - - vmovdqa64 %ymm14,%ymm0 - cmp $0x0180,%rcx - jl .Lxorpart8 - vpxord 0x0160(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0160(%rsi) - - vmovdqa64 %ymm5,%ymm0 - cmp $0x01a0,%rcx - jl .Lxorpart8 - vpxord 0x0180(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0180(%rsi) - - vmovdqa64 %ymm13,%ymm0 - cmp $0x01c0,%rcx - jl .Lxorpart8 - vpxord 0x01a0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01a0(%rsi) - - vmovdqa64 %ymm7,%ymm0 - cmp $0x01e0,%rcx - jl .Lxorpart8 - vpxord 0x01c0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01c0(%rsi) - - vmovdqa64 %ymm15,%ymm0 - cmp $0x0200,%rcx - jl .Lxorpart8 - vpxord 0x01e0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01e0(%rsi) - -.Ldone8: - vzeroupper - ret - -.Lxorpart8: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0x1f,%rcx - jz .Ldone8 - mov %rax,%r9 - and $~0x1f,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z} - vpxord %ymm0,%ymm1,%ymm1 - vmovdqu8 %ymm1,(%rsi,%r9){%k1} - - jmp .Ldone8 - -ENDPROC(chacha20_8block_xor_avx512vl) diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S deleted file mode 100644 index f6792789f875..000000000000 --- a/arch/x86/crypto/chacha20-ssse3-x86_64.S +++ /dev/null @@ -1,792 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include -#include - -.section .rodata.cst16.ROT8, "aM", @progbits, 16 -.align 16 -ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 -.section .rodata.cst16.ROT16, "aM", @progbits, 16 -.align 16 -ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 -.section .rodata.cst16.CTRINC, "aM", @progbits, 16 -.align 16 -CTRINC: .octa 0x00000003000000020000000100000000 - -.text - -/* - * chacha20_permute - permute one block - * - * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This - * function performs matrix operations on four words in parallel, but requires - * shuffling to rearrange the words after each round. 8/16-bit word rotation is - * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word - * rotation uses traditional shift+OR. - * - * Clobbers: %ecx, %xmm4-%xmm7 - */ -chacha20_permute: - - movdqa ROT8(%rip),%xmm4 - movdqa ROT16(%rip),%xmm5 - mov $10,%ecx - -.Ldoubleround: - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm5,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm6 - pslld $12,%xmm6 - psrld $20,%xmm1 - por %xmm6,%xmm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm4,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm7 - pslld $7,%xmm7 - psrld $25,%xmm1 - por %xmm7,%xmm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - pshufd $0x39,%xmm1,%xmm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - pshufd $0x4e,%xmm2,%xmm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - pshufd $0x93,%xmm3,%xmm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm5,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm6 - pslld $12,%xmm6 - psrld $20,%xmm1 - por %xmm6,%xmm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm4,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm7 - pslld $7,%xmm7 - psrld $25,%xmm1 - por %xmm7,%xmm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - pshufd $0x93,%xmm1,%xmm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - pshufd $0x4e,%xmm2,%xmm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - pshufd $0x39,%xmm3,%xmm3 - - dec %ecx - jnz .Ldoubleround - - ret -ENDPROC(chacha20_permute) - -ENTRY(chacha20_block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 1 data block output, o - # %rdx: up to 1 data block input, i - # %rcx: input/output length in bytes - FRAME_BEGIN - - # x0..3 = s0..3 - movdqa 0x00(%rdi),%xmm0 - movdqa 0x10(%rdi),%xmm1 - movdqa 0x20(%rdi),%xmm2 - movdqa 0x30(%rdi),%xmm3 - movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - movdqa %xmm2,%xmm10 - movdqa %xmm3,%xmm11 - - mov %rcx,%rax - call chacha20_permute - - # o0 = i0 ^ (x0 + s0) - paddd %xmm8,%xmm0 - cmp $0x10,%rax - jl .Lxorpart - movdqu 0x00(%rdx),%xmm4 - pxor %xmm4,%xmm0 - movdqu %xmm0,0x00(%rsi) - # o1 = i1 ^ (x1 + s1) - paddd %xmm9,%xmm1 - movdqa %xmm1,%xmm0 - cmp $0x20,%rax - jl .Lxorpart - movdqu 0x10(%rdx),%xmm0 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x10(%rsi) - # o2 = i2 ^ (x2 + s2) - paddd %xmm10,%xmm2 - movdqa %xmm2,%xmm0 - cmp $0x30,%rax - jl .Lxorpart - movdqu 0x20(%rdx),%xmm0 - pxor %xmm2,%xmm0 - movdqu %xmm0,0x20(%rsi) - # o3 = i3 ^ (x3 + s3) - paddd %xmm11,%xmm3 - movdqa %xmm3,%xmm0 - cmp $0x40,%rax - jl .Lxorpart - movdqu 0x30(%rdx),%xmm0 - pxor %xmm3,%xmm0 - movdqu %xmm0,0x30(%rsi) - -.Ldone: - FRAME_END - ret - -.Lxorpart: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - pxor 0x00(%rsp),%xmm0 - movdqa %xmm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone - -ENDPROC(chacha20_block_xor_ssse3) - -ENTRY(hchacha20_block_ssse3) - # %rdi: Input state matrix, s - # %rsi: output (8 32-bit words) - FRAME_BEGIN - - movdqa 0x00(%rdi),%xmm0 - movdqa 0x10(%rdi),%xmm1 - movdqa 0x20(%rdi),%xmm2 - movdqa 0x30(%rdi),%xmm3 - - call chacha20_permute - - movdqu %xmm0,0x00(%rsi) - movdqu %xmm3,0x10(%rsi) - - FRAME_END - ret -ENDPROC(hchacha20_block_ssse3) - -ENTRY(chacha20_4block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - - # This function encrypts four consecutive ChaCha20 blocks by loading the - # the state matrix in SSE registers four times. As we need some scratch - # registers, we save the first four registers on the stack. The - # algorithm performs each operation on the corresponding word of each - # state matrix, hence requires no word shuffling. For final XORing step - # we transpose the matrix by interleaving 32- and then 64-bit words, - # which allows us to do XOR in SSE registers. 8/16-bit word rotation is - # done with the slightly better performing SSSE3 byte shuffling, - # 7/12-bit word rotation uses traditional shift+OR. - - lea 8(%rsp),%r10 - sub $0x80,%rsp - and $~63,%rsp - mov %rcx,%rax - - # x0..15[0-3] = s0..3[0..3] - movq 0x00(%rdi),%xmm1 - pshufd $0x00,%xmm1,%xmm0 - pshufd $0x55,%xmm1,%xmm1 - movq 0x08(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - movq 0x10(%rdi),%xmm5 - pshufd $0x00,%xmm5,%xmm4 - pshufd $0x55,%xmm5,%xmm5 - movq 0x18(%rdi),%xmm7 - pshufd $0x00,%xmm7,%xmm6 - pshufd $0x55,%xmm7,%xmm7 - movq 0x20(%rdi),%xmm9 - pshufd $0x00,%xmm9,%xmm8 - pshufd $0x55,%xmm9,%xmm9 - movq 0x28(%rdi),%xmm11 - pshufd $0x00,%xmm11,%xmm10 - pshufd $0x55,%xmm11,%xmm11 - movq 0x30(%rdi),%xmm13 - pshufd $0x00,%xmm13,%xmm12 - pshufd $0x55,%xmm13,%xmm13 - movq 0x38(%rdi),%xmm15 - pshufd $0x00,%xmm15,%xmm14 - pshufd $0x55,%xmm15,%xmm15 - # x0..3 on stack - movdqa %xmm0,0x00(%rsp) - movdqa %xmm1,0x10(%rsp) - movdqa %xmm2,0x20(%rsp) - movdqa %xmm3,0x30(%rsp) - - movdqa CTRINC(%rip),%xmm1 - movdqa ROT8(%rip),%xmm2 - movdqa ROT16(%rip),%xmm3 - - # x12 += counter values 0-3 - paddd %xmm1,%xmm12 - - mov $10,%ecx - -.Ldoubleround4: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm3,%xmm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm3,%xmm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm3,%xmm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm3,%xmm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm4 - por %xmm0,%xmm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm5 - por %xmm0,%xmm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm6 - por %xmm0,%xmm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm7 - por %xmm0,%xmm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm2,%xmm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm2,%xmm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm2,%xmm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm2,%xmm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm4 - por %xmm0,%xmm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm5 - por %xmm0,%xmm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm6 - por %xmm0,%xmm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm7 - por %xmm0,%xmm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm3,%xmm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm3,%xmm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm3,%xmm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm3,%xmm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - paddd %xmm15,%xmm10 - pxor %xmm10,%xmm5 - movdqa %xmm5,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm5 - por %xmm0,%xmm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - paddd %xmm12,%xmm11 - pxor %xmm11,%xmm6 - movdqa %xmm6,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm6 - por %xmm0,%xmm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - paddd %xmm13,%xmm8 - pxor %xmm8,%xmm7 - movdqa %xmm7,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm7 - por %xmm0,%xmm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - paddd %xmm14,%xmm9 - pxor %xmm9,%xmm4 - movdqa %xmm4,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm4 - por %xmm0,%xmm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm2,%xmm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm2,%xmm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm2,%xmm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm2,%xmm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - paddd %xmm15,%xmm10 - pxor %xmm10,%xmm5 - movdqa %xmm5,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm5 - por %xmm0,%xmm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - paddd %xmm12,%xmm11 - pxor %xmm11,%xmm6 - movdqa %xmm6,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm6 - por %xmm0,%xmm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - paddd %xmm13,%xmm8 - pxor %xmm8,%xmm7 - movdqa %xmm7,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm7 - por %xmm0,%xmm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - paddd %xmm14,%xmm9 - pxor %xmm9,%xmm4 - movdqa %xmm4,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm4 - por %xmm0,%xmm4 - - dec %ecx - jnz .Ldoubleround4 - - # x0[0-3] += s0[0] - # x1[0-3] += s0[1] - movq 0x00(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd 0x00(%rsp),%xmm2 - movdqa %xmm2,0x00(%rsp) - paddd 0x10(%rsp),%xmm3 - movdqa %xmm3,0x10(%rsp) - # x2[0-3] += s0[2] - # x3[0-3] += s0[3] - movq 0x08(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd 0x20(%rsp),%xmm2 - movdqa %xmm2,0x20(%rsp) - paddd 0x30(%rsp),%xmm3 - movdqa %xmm3,0x30(%rsp) - - # x4[0-3] += s1[0] - # x5[0-3] += s1[1] - movq 0x10(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm4 - paddd %xmm3,%xmm5 - # x6[0-3] += s1[2] - # x7[0-3] += s1[3] - movq 0x18(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm6 - paddd %xmm3,%xmm7 - - # x8[0-3] += s2[0] - # x9[0-3] += s2[1] - movq 0x20(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm8 - paddd %xmm3,%xmm9 - # x10[0-3] += s2[2] - # x11[0-3] += s2[3] - movq 0x28(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm10 - paddd %xmm3,%xmm11 - - # x12[0-3] += s3[0] - # x13[0-3] += s3[1] - movq 0x30(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm12 - paddd %xmm3,%xmm13 - # x14[0-3] += s3[2] - # x15[0-3] += s3[3] - movq 0x38(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm14 - paddd %xmm3,%xmm15 - - # x12 += counter values 0-3 - paddd %xmm1,%xmm12 - - # interleave 32-bit words in state n, n+1 - movdqa 0x00(%rsp),%xmm0 - movdqa 0x10(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpckldq %xmm1,%xmm2 - punpckhdq %xmm1,%xmm0 - movdqa %xmm2,0x00(%rsp) - movdqa %xmm0,0x10(%rsp) - movdqa 0x20(%rsp),%xmm0 - movdqa 0x30(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpckldq %xmm1,%xmm2 - punpckhdq %xmm1,%xmm0 - movdqa %xmm2,0x20(%rsp) - movdqa %xmm0,0x30(%rsp) - movdqa %xmm4,%xmm0 - punpckldq %xmm5,%xmm4 - punpckhdq %xmm5,%xmm0 - movdqa %xmm0,%xmm5 - movdqa %xmm6,%xmm0 - punpckldq %xmm7,%xmm6 - punpckhdq %xmm7,%xmm0 - movdqa %xmm0,%xmm7 - movdqa %xmm8,%xmm0 - punpckldq %xmm9,%xmm8 - punpckhdq %xmm9,%xmm0 - movdqa %xmm0,%xmm9 - movdqa %xmm10,%xmm0 - punpckldq %xmm11,%xmm10 - punpckhdq %xmm11,%xmm0 - movdqa %xmm0,%xmm11 - movdqa %xmm12,%xmm0 - punpckldq %xmm13,%xmm12 - punpckhdq %xmm13,%xmm0 - movdqa %xmm0,%xmm13 - movdqa %xmm14,%xmm0 - punpckldq %xmm15,%xmm14 - punpckhdq %xmm15,%xmm0 - movdqa %xmm0,%xmm15 - - # interleave 64-bit words in state n, n+2 - movdqa 0x00(%rsp),%xmm0 - movdqa 0x20(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpcklqdq %xmm1,%xmm2 - punpckhqdq %xmm1,%xmm0 - movdqa %xmm2,0x00(%rsp) - movdqa %xmm0,0x20(%rsp) - movdqa 0x10(%rsp),%xmm0 - movdqa 0x30(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpcklqdq %xmm1,%xmm2 - punpckhqdq %xmm1,%xmm0 - movdqa %xmm2,0x10(%rsp) - movdqa %xmm0,0x30(%rsp) - movdqa %xmm4,%xmm0 - punpcklqdq %xmm6,%xmm4 - punpckhqdq %xmm6,%xmm0 - movdqa %xmm0,%xmm6 - movdqa %xmm5,%xmm0 - punpcklqdq %xmm7,%xmm5 - punpckhqdq %xmm7,%xmm0 - movdqa %xmm0,%xmm7 - movdqa %xmm8,%xmm0 - punpcklqdq %xmm10,%xmm8 - punpckhqdq %xmm10,%xmm0 - movdqa %xmm0,%xmm10 - movdqa %xmm9,%xmm0 - punpcklqdq %xmm11,%xmm9 - punpckhqdq %xmm11,%xmm0 - movdqa %xmm0,%xmm11 - movdqa %xmm12,%xmm0 - punpcklqdq %xmm14,%xmm12 - punpckhqdq %xmm14,%xmm0 - movdqa %xmm0,%xmm14 - movdqa %xmm13,%xmm0 - punpcklqdq %xmm15,%xmm13 - punpckhqdq %xmm15,%xmm0 - movdqa %xmm0,%xmm15 - - # xor with corresponding input, write to output - movdqa 0x00(%rsp),%xmm0 - cmp $0x10,%rax - jl .Lxorpart4 - movdqu 0x00(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x00(%rsi) - - movdqu %xmm4,%xmm0 - cmp $0x20,%rax - jl .Lxorpart4 - movdqu 0x10(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x10(%rsi) - - movdqu %xmm8,%xmm0 - cmp $0x30,%rax - jl .Lxorpart4 - movdqu 0x20(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x20(%rsi) - - movdqu %xmm12,%xmm0 - cmp $0x40,%rax - jl .Lxorpart4 - movdqu 0x30(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x30(%rsi) - - movdqa 0x20(%rsp),%xmm0 - cmp $0x50,%rax - jl .Lxorpart4 - movdqu 0x40(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x40(%rsi) - - movdqu %xmm6,%xmm0 - cmp $0x60,%rax - jl .Lxorpart4 - movdqu 0x50(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x50(%rsi) - - movdqu %xmm10,%xmm0 - cmp $0x70,%rax - jl .Lxorpart4 - movdqu 0x60(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x60(%rsi) - - movdqu %xmm14,%xmm0 - cmp $0x80,%rax - jl .Lxorpart4 - movdqu 0x70(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x70(%rsi) - - movdqa 0x10(%rsp),%xmm0 - cmp $0x90,%rax - jl .Lxorpart4 - movdqu 0x80(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x80(%rsi) - - movdqu %xmm5,%xmm0 - cmp $0xa0,%rax - jl .Lxorpart4 - movdqu 0x90(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x90(%rsi) - - movdqu %xmm9,%xmm0 - cmp $0xb0,%rax - jl .Lxorpart4 - movdqu 0xa0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xa0(%rsi) - - movdqu %xmm13,%xmm0 - cmp $0xc0,%rax - jl .Lxorpart4 - movdqu 0xb0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xb0(%rsi) - - movdqa 0x30(%rsp),%xmm0 - cmp $0xd0,%rax - jl .Lxorpart4 - movdqu 0xc0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xc0(%rsi) - - movdqu %xmm7,%xmm0 - cmp $0xe0,%rax - jl .Lxorpart4 - movdqu 0xd0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xd0(%rsi) - - movdqu %xmm11,%xmm0 - cmp $0xf0,%rax - jl .Lxorpart4 - movdqu 0xe0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xe0(%rsi) - - movdqu %xmm15,%xmm0 - cmp $0x100,%rax - jl .Lxorpart4 - movdqu 0xf0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xf0(%rsi) - -.Ldone4: - lea -8(%r10),%rsp - ret - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone4 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - pxor 0x00(%rsp),%xmm0 - movdqa %xmm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - jmp .Ldone4 - -ENDPROC(chacha20_4block_xor_ssse3) diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c deleted file mode 100644 index 70d388e4a3a2..000000000000 --- a/arch/x86/crypto/chacha20_glue.c +++ /dev/null @@ -1,264 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include - -#define CHACHA20_STATE_ALIGN 16 - -asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, - unsigned int len); -asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, - unsigned int len); -asmlinkage void hchacha20_block_ssse3(const u32 *state, u32 *out); -#ifdef CONFIG_AS_AVX2 -asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, - unsigned int len); -asmlinkage void chacha20_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, - unsigned int len); -asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, - unsigned int len); -static bool chacha20_use_avx2; -#ifdef CONFIG_AS_AVX512 -asmlinkage void chacha20_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, - unsigned int len); -asmlinkage void chacha20_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, - unsigned int len); -asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, - unsigned int len); -static bool chacha20_use_avx512vl; -#endif -#endif - -static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks) -{ - len = min(len, maxblocks * CHACHA_BLOCK_SIZE); - return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; -} - -static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes) -{ -#ifdef CONFIG_AS_AVX2 -#ifdef CONFIG_AS_AVX512 - if (chacha20_use_avx512vl) { - while (bytes >= CHACHA_BLOCK_SIZE * 8) { - chacha20_8block_xor_avx512vl(state, dst, src, bytes); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha20_8block_xor_avx512vl(state, dst, src, bytes); - state[12] += chacha20_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { - chacha20_4block_xor_avx512vl(state, dst, src, bytes); - state[12] += chacha20_advance(bytes, 4); - return; - } - if (bytes) { - chacha20_2block_xor_avx512vl(state, dst, src, bytes); - state[12] += chacha20_advance(bytes, 2); - return; - } - } -#endif - if (chacha20_use_avx2) { - while (bytes >= CHACHA_BLOCK_SIZE * 8) { - chacha20_8block_xor_avx2(state, dst, src, bytes); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha20_8block_xor_avx2(state, dst, src, bytes); - state[12] += chacha20_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { - chacha20_4block_xor_avx2(state, dst, src, bytes); - state[12] += chacha20_advance(bytes, 4); - return; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha20_2block_xor_avx2(state, dst, src, bytes); - state[12] += chacha20_advance(bytes, 2); - return; - } - } -#endif - while (bytes >= CHACHA_BLOCK_SIZE * 4) { - chacha20_4block_xor_ssse3(state, dst, src, bytes); - bytes -= CHACHA_BLOCK_SIZE * 4; - src += CHACHA_BLOCK_SIZE * 4; - dst += CHACHA_BLOCK_SIZE * 4; - state[12] += 4; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha20_4block_xor_ssse3(state, dst, src, bytes); - state[12] += chacha20_advance(bytes, 4); - return; - } - if (bytes) { - chacha20_block_xor_ssse3(state, dst, src, bytes); - state[12]++; - } -} - -static int chacha20_simd_stream_xor(struct skcipher_request *req, - struct chacha_ctx *ctx, u8 *iv) -{ - u32 *state, state_buf[16 + 2] __aligned(8); - struct skcipher_walk walk; - int err; - - BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16); - state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN); - - err = skcipher_walk_virt(&walk, req, true); - - crypto_chacha_init(state, ctx, iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, - nbytes); - - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - - return err; -} - -static int chacha20_simd(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) - return crypto_chacha_crypt(req); - - kernel_fpu_begin(); - err = chacha20_simd_stream_xor(req, ctx, req->iv); - kernel_fpu_end(); - return err; -} - -static int xchacha20_simd(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - struct chacha_ctx subctx; - u32 *state, state_buf[16 + 2] __aligned(8); - u8 real_iv[16]; - int err; - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) - return crypto_xchacha_crypt(req); - - BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16); - state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN); - crypto_chacha_init(state, ctx, req->iv); - - kernel_fpu_begin(); - - hchacha20_block_ssse3(state, subctx.key); - - memcpy(&real_iv[0], req->iv + 24, 8); - memcpy(&real_iv[8], req->iv + 16, 8); - err = chacha20_simd_stream_xor(req, &subctx, real_iv); - - kernel_fpu_end(); - - return err; -} - -static struct skcipher_alg algs[] = { - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-simd", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = chacha20_simd, - .decrypt = chacha20_simd, - }, { - .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-simd", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = xchacha20_simd, - .decrypt = xchacha20_simd, - }, -}; - -static int __init chacha20_simd_mod_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_SSSE3)) - return -ENODEV; - -#ifdef CONFIG_AS_AVX2 - chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); -#ifdef CONFIG_AS_AVX512 - chacha20_use_avx512vl = chacha20_use_avx2 && - boot_cpu_has(X86_FEATURE_AVX512VL) && - boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */ -#endif -#endif - return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); -} - -static void __exit chacha20_simd_mod_fini(void) -{ - crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); -} - -module_init(chacha20_simd_mod_init); -module_exit(chacha20_simd_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi "); -MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated"); -MODULE_ALIAS_CRYPTO("chacha20"); -MODULE_ALIAS_CRYPTO("chacha20-simd"); -MODULE_ALIAS_CRYPTO("xchacha20"); -MODULE_ALIAS_CRYPTO("xchacha20-simd"); diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c new file mode 100644 index 000000000000..35fd02b50d27 --- /dev/null +++ b/arch/x86/crypto/chacha_glue.c @@ -0,0 +1,270 @@ +/* + * x64 SIMD accelerated ChaCha and XChaCha stream ciphers, + * including ChaCha20 (RFC7539) + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define CHACHA_STATE_ALIGN 16 + +asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds); +#ifdef CONFIG_AS_AVX2 +asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +static bool chacha_use_avx2; +#ifdef CONFIG_AS_AVX512 +asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +static bool chacha_use_avx512vl; +#endif +#endif + +static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) +{ + len = min(len, maxblocks * CHACHA_BLOCK_SIZE); + return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; +} + +static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ +#ifdef CONFIG_AS_AVX2 +#ifdef CONFIG_AS_AVX512 + if (chacha_use_avx512vl) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha_8block_xor_avx512vl(state, dst, src, bytes, + nrounds); + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha_8block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state[12] += chacha_advance(bytes, 8); + return; + } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha_4block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state[12] += chacha_advance(bytes, 4); + return; + } + if (bytes) { + chacha_2block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state[12] += chacha_advance(bytes, 2); + return; + } + } +#endif + if (chacha_use_avx2) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); + state[12] += chacha_advance(bytes, 8); + return; + } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); + state[12] += chacha_advance(bytes, 4); + return; + } + if (bytes > CHACHA_BLOCK_SIZE) { + chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); + state[12] += chacha_advance(bytes, 2); + return; + } + } +#endif + while (bytes >= CHACHA_BLOCK_SIZE * 4) { + chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 4; + src += CHACHA_BLOCK_SIZE * 4; + dst += CHACHA_BLOCK_SIZE * 4; + state[12] += 4; + } + if (bytes > CHACHA_BLOCK_SIZE) { + chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); + state[12] += chacha_advance(bytes, 4); + return; + } + if (bytes) { + chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); + state[12]++; + } +} + +static int chacha_simd_stream_xor(struct skcipher_request *req, + struct chacha_ctx *ctx, u8 *iv) +{ + u32 *state, state_buf[16 + 2] __aligned(8); + struct skcipher_walk walk; + int err; + + BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); + state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); + + err = skcipher_walk_virt(&walk, req, true); + + crypto_chacha_init(state, ctx, iv); + + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); + + chacha_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, + nbytes, ctx->nrounds); + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + return err; +} + +static int chacha_simd(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + int err; + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) + return crypto_chacha_crypt(req); + + kernel_fpu_begin(); + err = chacha_simd_stream_xor(req, ctx, req->iv); + kernel_fpu_end(); + return err; +} + +static int xchacha_simd(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 *state, state_buf[16 + 2] __aligned(8); + u8 real_iv[16]; + int err; + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) + return crypto_xchacha_crypt(req); + + BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); + state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); + crypto_chacha_init(state, ctx, req->iv); + + kernel_fpu_begin(); + + hchacha_block_ssse3(state, subctx.key, ctx->nrounds); + subctx.nrounds = ctx->nrounds; + + memcpy(&real_iv[0], req->iv + 24, 8); + memcpy(&real_iv[8], req->iv + 16, 8); + err = chacha_simd_stream_xor(req, &subctx, real_iv); + + kernel_fpu_end(); + + return err; +} + +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-simd", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = chacha_simd, + .decrypt = chacha_simd, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-simd", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = xchacha_simd, + .decrypt = xchacha_simd, + }, +}; + +static int __init chacha_simd_mod_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_SSSE3)) + return -ENODEV; + +#ifdef CONFIG_AS_AVX2 + chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); +#ifdef CONFIG_AS_AVX512 + chacha_use_avx512vl = chacha_use_avx2 && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */ +#endif +#endif + return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); +} + +static void __exit chacha_simd_mod_fini(void) +{ + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); +} + +module_init(chacha_simd_mod_init); +module_exit(chacha_simd_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Willi "); +MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)"); +MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-simd"); +MODULE_ALIAS_CRYPTO("xchacha20"); +MODULE_ALIAS_CRYPTO("xchacha20-simd"); -- cgit v1.2.3-59-g8ed1b From 7a507d62258afd514583fadf1482451079fa0e4d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Dec 2018 22:20:04 -0800 Subject: crypto: x86/chacha - add XChaCha12 support Now that the x86_64 SIMD implementations of ChaCha20 and XChaCha20 have been refactored to support varying the number of rounds, add support for XChaCha12. This is identical to XChaCha20 except for the number of rounds, which is 12 instead of 20. This can be used by Adiantum. Reviewed-by: Martin Willi Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha_glue.c | 17 +++++++++++++++++ crypto/Kconfig | 4 ++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 35fd02b50d27..d19c2908be90 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -232,6 +232,21 @@ static struct skcipher_alg algs[] = { .setkey = crypto_chacha20_setkey, .encrypt = xchacha_simd, .decrypt = xchacha_simd, + }, { + .base.cra_name = "xchacha12", + .base.cra_driver_name = "xchacha12-simd", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha12_setkey, + .encrypt = xchacha_simd, + .decrypt = xchacha_simd, }, }; @@ -268,3 +283,5 @@ MODULE_ALIAS_CRYPTO("chacha20"); MODULE_ALIAS_CRYPTO("chacha20-simd"); MODULE_ALIAS_CRYPTO("xchacha20"); MODULE_ALIAS_CRYPTO("xchacha20-simd"); +MODULE_ALIAS_CRYPTO("xchacha12"); +MODULE_ALIAS_CRYPTO("xchacha12-simd"); diff --git a/crypto/Kconfig b/crypto/Kconfig index dc3a0e3ea2ac..045af6eeb7e2 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1473,8 +1473,8 @@ config CRYPTO_CHACHA20_X86_64 select CRYPTO_BLKCIPHER select CRYPTO_CHACHA20 help - SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20 - and XChaCha20 stream ciphers. + SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20, + XChaCha20, and XChaCha12 stream ciphers. config CRYPTO_SEED tristate "SEED cipher algorithm" -- cgit v1.2.3-59-g8ed1b From a033aed5a84eb93a32929b6862602cb283d39e82 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Dec 2018 22:20:05 -0800 Subject: crypto: x86/chacha - yield the FPU occasionally To improve responsiveness, yield the FPU (temporarily re-enabling preemption) every 4 KiB encrypted/decrypted, rather than keeping preemption disabled during the entire encryption/decryption operation. Alternatively we could do this for every skcipher_walk step, but steps may be small in some cases, and yielding the FPU is expensive on x86. Suggested-by: Martin Willi Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha_glue.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index d19c2908be90..9b1d3fac4943 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -132,6 +132,7 @@ static int chacha_simd_stream_xor(struct skcipher_request *req, { u32 *state, state_buf[16 + 2] __aligned(8); struct skcipher_walk walk; + int next_yield = 4096; /* bytes until next FPU yield */ int err; BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); @@ -144,12 +145,21 @@ static int chacha_simd_stream_xor(struct skcipher_request *req, while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; - if (nbytes < walk.total) + if (nbytes < walk.total) { nbytes = round_down(nbytes, walk.stride); + next_yield -= nbytes; + } chacha_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, nbytes, ctx->nrounds); + if (next_yield <= 0) { + /* temporarily allow preemption */ + kernel_fpu_end(); + kernel_fpu_begin(); + next_yield = 4096; + } + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } -- cgit v1.2.3-59-g8ed1b From 5569e8c07447344cdc3771378ba4e0da0b94c2a4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 6 Dec 2018 12:31:54 -0800 Subject: crypto: xchacha - add test vector from XChaCha20 draft RFC There is a draft specification for XChaCha20 being worked on. Add the XChaCha20 test vector from the appendix so that we can be extra sure the kernel's implementation is compatible. I also recomputed the ciphertext with XChaCha12 and added it there too, to keep the tests for XChaCha20 and XChaCha12 in sync. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/testmgr.h | 178 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 176 insertions(+), 2 deletions(-) diff --git a/crypto/testmgr.h b/crypto/testmgr.h index e7e56a8febbc..357cf4cbcbb1 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -32800,7 +32800,94 @@ static const struct cipher_testvec xchacha20_tv_template[] = { .also_non_np = 1, .np = 3, .tap = { 1200, 1, 80 }, - }, + }, { /* test vector from https://tools.ietf.org/html/draft-arciszewski-xchacha-02#appendix-A.3.2 */ + .key = "\x80\x81\x82\x83\x84\x85\x86\x87" + "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97" + "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f", + .klen = 32, + .iv = "\x40\x41\x42\x43\x44\x45\x46\x47" + "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f" + "\x50\x51\x52\x53\x54\x55\x56\x58" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x54\x68\x65\x20\x64\x68\x6f\x6c" + "\x65\x20\x28\x70\x72\x6f\x6e\x6f" + "\x75\x6e\x63\x65\x64\x20\x22\x64" + "\x6f\x6c\x65\x22\x29\x20\x69\x73" + "\x20\x61\x6c\x73\x6f\x20\x6b\x6e" + "\x6f\x77\x6e\x20\x61\x73\x20\x74" + "\x68\x65\x20\x41\x73\x69\x61\x74" + "\x69\x63\x20\x77\x69\x6c\x64\x20" + "\x64\x6f\x67\x2c\x20\x72\x65\x64" + "\x20\x64\x6f\x67\x2c\x20\x61\x6e" + "\x64\x20\x77\x68\x69\x73\x74\x6c" + "\x69\x6e\x67\x20\x64\x6f\x67\x2e" + "\x20\x49\x74\x20\x69\x73\x20\x61" + "\x62\x6f\x75\x74\x20\x74\x68\x65" + "\x20\x73\x69\x7a\x65\x20\x6f\x66" + "\x20\x61\x20\x47\x65\x72\x6d\x61" + "\x6e\x20\x73\x68\x65\x70\x68\x65" + "\x72\x64\x20\x62\x75\x74\x20\x6c" + "\x6f\x6f\x6b\x73\x20\x6d\x6f\x72" + "\x65\x20\x6c\x69\x6b\x65\x20\x61" + "\x20\x6c\x6f\x6e\x67\x2d\x6c\x65" + "\x67\x67\x65\x64\x20\x66\x6f\x78" + "\x2e\x20\x54\x68\x69\x73\x20\x68" + "\x69\x67\x68\x6c\x79\x20\x65\x6c" + "\x75\x73\x69\x76\x65\x20\x61\x6e" + "\x64\x20\x73\x6b\x69\x6c\x6c\x65" + "\x64\x20\x6a\x75\x6d\x70\x65\x72" + "\x20\x69\x73\x20\x63\x6c\x61\x73" + "\x73\x69\x66\x69\x65\x64\x20\x77" + "\x69\x74\x68\x20\x77\x6f\x6c\x76" + "\x65\x73\x2c\x20\x63\x6f\x79\x6f" + "\x74\x65\x73\x2c\x20\x6a\x61\x63" + "\x6b\x61\x6c\x73\x2c\x20\x61\x6e" + "\x64\x20\x66\x6f\x78\x65\x73\x20" + "\x69\x6e\x20\x74\x68\x65\x20\x74" + "\x61\x78\x6f\x6e\x6f\x6d\x69\x63" + "\x20\x66\x61\x6d\x69\x6c\x79\x20" + "\x43\x61\x6e\x69\x64\x61\x65\x2e", + .ctext = "\x45\x59\xab\xba\x4e\x48\xc1\x61" + "\x02\xe8\xbb\x2c\x05\xe6\x94\x7f" + "\x50\xa7\x86\xde\x16\x2f\x9b\x0b" + "\x7e\x59\x2a\x9b\x53\xd0\xd4\xe9" + "\x8d\x8d\x64\x10\xd5\x40\xa1\xa6" + "\x37\x5b\x26\xd8\x0d\xac\xe4\xfa" + "\xb5\x23\x84\xc7\x31\xac\xbf\x16" + "\xa5\x92\x3c\x0c\x48\xd3\x57\x5d" + "\x4d\x0d\x2c\x67\x3b\x66\x6f\xaa" + "\x73\x10\x61\x27\x77\x01\x09\x3a" + "\x6b\xf7\xa1\x58\xa8\x86\x42\x92" + "\xa4\x1c\x48\xe3\xa9\xb4\xc0\xda" + "\xec\xe0\xf8\xd9\x8d\x0d\x7e\x05" + "\xb3\x7a\x30\x7b\xbb\x66\x33\x31" + "\x64\xec\x9e\x1b\x24\xea\x0d\x6c" + "\x3f\xfd\xdc\xec\x4f\x68\xe7\x44" + "\x30\x56\x19\x3a\x03\xc8\x10\xe1" + "\x13\x44\xca\x06\xd8\xed\x8a\x2b" + "\xfb\x1e\x8d\x48\xcf\xa6\xbc\x0e" + "\xb4\xe2\x46\x4b\x74\x81\x42\x40" + "\x7c\x9f\x43\x1a\xee\x76\x99\x60" + "\xe1\x5b\xa8\xb9\x68\x90\x46\x6e" + "\xf2\x45\x75\x99\x85\x23\x85\xc6" + "\x61\xf7\x52\xce\x20\xf9\xda\x0c" + "\x09\xab\x6b\x19\xdf\x74\xe7\x6a" + "\x95\x96\x74\x46\xf8\xd0\xfd\x41" + "\x5e\x7b\xee\x2a\x12\xa1\x14\xc2" + "\x0e\xb5\x29\x2a\xe7\xa3\x49\xae" + "\x57\x78\x20\xd5\x52\x0a\x1f\x3f" + "\xb6\x2a\x17\xce\x6a\x7e\x68\xfa" + "\x7c\x79\x11\x1d\x88\x60\x92\x0b" + "\xc0\x48\xef\x43\xfe\x84\x48\x6c" + "\xcb\x87\xc2\x5f\x0a\xe0\x45\xf0" + "\xcc\xe1\xe7\x98\x9a\x9a\xa2\x20" + "\xa2\x8b\xdd\x48\x27\xe7\x51\xa2" + "\x4a\x6d\x5c\x62\xd7\x90\xa6\x63" + "\x93\xb9\x31\x11\xc1\xa5\x5d\xd7" + "\x42\x1a\x10\x18\x49\x74\xc7\xc5", + .len = 304, + } }; /* @@ -33378,7 +33465,94 @@ static const struct cipher_testvec xchacha12_tv_template[] = { .also_non_np = 1, .np = 3, .tap = { 1200, 1, 80 }, - }, + }, { + .key = "\x80\x81\x82\x83\x84\x85\x86\x87" + "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97" + "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f", + .klen = 32, + .iv = "\x40\x41\x42\x43\x44\x45\x46\x47" + "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f" + "\x50\x51\x52\x53\x54\x55\x56\x58" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .ptext = "\x54\x68\x65\x20\x64\x68\x6f\x6c" + "\x65\x20\x28\x70\x72\x6f\x6e\x6f" + "\x75\x6e\x63\x65\x64\x20\x22\x64" + "\x6f\x6c\x65\x22\x29\x20\x69\x73" + "\x20\x61\x6c\x73\x6f\x20\x6b\x6e" + "\x6f\x77\x6e\x20\x61\x73\x20\x74" + "\x68\x65\x20\x41\x73\x69\x61\x74" + "\x69\x63\x20\x77\x69\x6c\x64\x20" + "\x64\x6f\x67\x2c\x20\x72\x65\x64" + "\x20\x64\x6f\x67\x2c\x20\x61\x6e" + "\x64\x20\x77\x68\x69\x73\x74\x6c" + "\x69\x6e\x67\x20\x64\x6f\x67\x2e" + "\x20\x49\x74\x20\x69\x73\x20\x61" + "\x62\x6f\x75\x74\x20\x74\x68\x65" + "\x20\x73\x69\x7a\x65\x20\x6f\x66" + "\x20\x61\x20\x47\x65\x72\x6d\x61" + "\x6e\x20\x73\x68\x65\x70\x68\x65" + "\x72\x64\x20\x62\x75\x74\x20\x6c" + "\x6f\x6f\x6b\x73\x20\x6d\x6f\x72" + "\x65\x20\x6c\x69\x6b\x65\x20\x61" + "\x20\x6c\x6f\x6e\x67\x2d\x6c\x65" + "\x67\x67\x65\x64\x20\x66\x6f\x78" + "\x2e\x20\x54\x68\x69\x73\x20\x68" + "\x69\x67\x68\x6c\x79\x20\x65\x6c" + "\x75\x73\x69\x76\x65\x20\x61\x6e" + "\x64\x20\x73\x6b\x69\x6c\x6c\x65" + "\x64\x20\x6a\x75\x6d\x70\x65\x72" + "\x20\x69\x73\x20\x63\x6c\x61\x73" + "\x73\x69\x66\x69\x65\x64\x20\x77" + "\x69\x74\x68\x20\x77\x6f\x6c\x76" + "\x65\x73\x2c\x20\x63\x6f\x79\x6f" + "\x74\x65\x73\x2c\x20\x6a\x61\x63" + "\x6b\x61\x6c\x73\x2c\x20\x61\x6e" + "\x64\x20\x66\x6f\x78\x65\x73\x20" + "\x69\x6e\x20\x74\x68\x65\x20\x74" + "\x61\x78\x6f\x6e\x6f\x6d\x69\x63" + "\x20\x66\x61\x6d\x69\x6c\x79\x20" + "\x43\x61\x6e\x69\x64\x61\x65\x2e", + .ctext = "\x9f\x1a\xab\x8a\x95\xf4\x7e\xcd" + "\xee\x34\xc0\x39\xd6\x23\x43\x94" + "\xf6\x01\xc1\x7f\x60\x91\xa5\x23" + "\x4a\x8a\xe6\xb1\x14\x8b\xd7\x58" + "\xee\x02\xad\xab\xce\x1e\x7d\xdf" + "\xf9\x49\x27\x69\xd0\x8d\x0c\x20" + "\x6e\x17\xc4\xae\x87\x7a\xc6\x61" + "\x91\xe2\x8e\x0a\x1d\x61\xcc\x38" + "\x02\x64\x43\x49\xc6\xb2\x59\x59" + "\x42\xe7\x9d\x83\x00\x60\x90\xd2" + "\xb9\xcd\x97\x6e\xc7\x95\x71\xbc" + "\x23\x31\x58\x07\xb3\xb4\xac\x0b" + "\x87\x64\x56\xe5\xe3\xec\x63\xa1" + "\x71\x8c\x08\x48\x33\x20\x29\x81" + "\xea\x01\x25\x20\xc3\xda\xe6\xee" + "\x6a\x03\xf6\x68\x4d\x26\xa0\x91" + "\x9e\x44\xb8\xc1\xc0\x8f\x5a\x6a" + "\xc0\xcd\xbf\x24\x5e\x40\x66\xd2" + "\x42\x24\xb5\xbf\xc1\xeb\x12\x60" + "\x56\xbe\xb1\xa6\xc4\x0f\xfc\x49" + "\x69\x9f\xcc\x06\x5c\xe3\x26\xd7" + "\x52\xc0\x42\xe8\xb4\x76\xc3\xee" + "\xb2\x97\xe3\x37\x61\x29\x5a\xb5" + "\x8e\xe8\x8c\xc5\x38\xcc\xcb\xec" + "\x64\x1a\xa9\x12\x5f\xf7\x79\xdf" + "\x64\xca\x77\x4e\xbd\xf9\x83\xa0" + "\x13\x27\x3f\x31\x03\x63\x30\x26" + "\x27\x0b\x3e\xb3\x23\x13\x61\x0b" + "\x70\x1d\xd4\xad\x85\x1e\xbf\xdf" + "\xc6\x8e\x4d\x08\xcc\x7e\x77\xbd" + "\x1e\x18\x77\x38\x3a\xfe\xc0\x5d" + "\x16\xfc\xf0\xa9\x2f\xe9\x17\xc7" + "\xd3\x23\x17\x18\xa3\xe6\x54\x77" + "\x6f\x1b\xbe\x8a\x6e\x7e\xca\x97" + "\x08\x05\x36\x76\xaf\x12\x7a\x42" + "\xf7\x7a\xc2\x35\xc3\xb4\x93\x40" + "\x54\x14\x90\xa0\x4d\x65\x1c\x37" + "\x50\x70\x44\x29\x6d\x6e\x62\x68", + .len = 304, + } }; /* Adiantum test vectors from https://github.com/google/adiantum */ -- cgit v1.2.3-59-g8ed1b From 282c14852d00d6d1b8fadf3e01e4180f02ddda84 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 6 Dec 2018 13:00:08 -0800 Subject: crypto: xchacha20 - fix comments for test vectors The kernel's ChaCha20 uses the RFC7539 convention of the nonce being 12 bytes rather than 8, so actually I only appended 12 random bytes (not 16) to its test vectors to form 24-byte nonces for the XChaCha20 test vectors. The other 4 bytes were just from zero-padding the stream position to 8 bytes. Fix the comments above the test vectors. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/testmgr.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 357cf4cbcbb1..e8f47d7b92cd 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -32281,8 +32281,9 @@ static const struct cipher_testvec xchacha20_tv_template[] = { "\x57\x78\x8e\x6f\xae\x90\xfc\x31" "\x09\x7c\xfc", .len = 91, - }, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes - to nonce, and recomputed the ciphertext with libsodium */ + }, { /* Taken from the ChaCha20 test vectors, appended 12 random bytes + to the nonce, zero-padded the stream position from 4 to 8 bytes, + and recomputed the ciphertext using libsodium's XChaCha20 */ .key = "\x00\x00\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x00\x00\x00" @@ -32309,8 +32310,7 @@ static const struct cipher_testvec xchacha20_tv_template[] = { "\x03\xdc\xf8\x2b\xc1\xe1\x75\x67" "\x23\x7b\xe6\xfc\xd4\x03\x86\x54", .len = 64, - }, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes - to nonce, and recomputed the ciphertext with libsodium */ + }, { /* Derived from a ChaCha20 test vector, via the process above */ .key = "\x00\x00\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x00\x00\x00" @@ -32419,8 +32419,7 @@ static const struct cipher_testvec xchacha20_tv_template[] = { .np = 3, .tap = { 375 - 20, 4, 16 }, - }, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes - to nonce, and recomputed the ciphertext with libsodium */ + }, { /* Derived from a ChaCha20 test vector, via the process above */ .key = "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a" "\xf3\x33\x88\x86\x04\xf6\xb5\xf0" "\x47\x39\x17\xc1\x40\x2b\x80\x09" @@ -32463,8 +32462,7 @@ static const struct cipher_testvec xchacha20_tv_template[] = { "\x65\x03\xfa\x45\xf7\x9e\x53\x7a" "\x99\xf1\x82\x25\x4f\x8d\x07", .len = 127, - }, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes - to nonce, and recomputed the ciphertext with libsodium */ + }, { /* Derived from a ChaCha20 test vector, via the process above */ .key = "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a" "\xf3\x33\x88\x86\x04\xf6\xb5\xf0" "\x47\x39\x17\xc1\x40\x2b\x80\x09" -- cgit v1.2.3-59-g8ed1b From c6018e1a00b5c70610cdfb3650cc5622c917ed17 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 6 Dec 2018 14:21:59 -0800 Subject: crypto: adiantum - adjust some comments to match latest paper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 2018-11-28 revision of the Adiantum paper has revised some notation: - 'M' was replaced with 'L' (meaning "Left", for the left-hand part of the message) in the definition of Adiantum hashing, to avoid confusion with the full message - ε-almost-∆-universal is now abbreviated as ε-∆U instead of εA∆U - "block" is now used only to mean block cipher and Poly1305 blocks Also, Adiantum hashing was moved from the appendix to the main paper. To avoid confusion, update relevant comments in the code to match. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/adiantum.c | 35 +++++++++++++++++++---------------- crypto/nhpoly1305.c | 8 ++++---- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/crypto/adiantum.c b/crypto/adiantum.c index ca27e0dc2958..e62e34f5e389 100644 --- a/crypto/adiantum.c +++ b/crypto/adiantum.c @@ -9,7 +9,7 @@ * Adiantum is a tweakable, length-preserving encryption mode designed for fast * and secure disk encryption, especially on CPUs without dedicated crypto * instructions. Adiantum encrypts each sector using the XChaCha12 stream - * cipher, two passes of an ε-almost-∆-universal (εA∆U) hash function based on + * cipher, two passes of an ε-almost-∆-universal (ε-∆U) hash function based on * NH and Poly1305, and an invocation of the AES-256 block cipher on a single * 16-byte block. See the paper for details: * @@ -21,12 +21,12 @@ * - Stream cipher: XChaCha12 or XChaCha20 * - Block cipher: any with a 128-bit block size and 256-bit key * - * This implementation doesn't currently allow other εA∆U hash functions, i.e. + * This implementation doesn't currently allow other ε-∆U hash functions, i.e. * HPolyC is not supported. This is because Adiantum is ~20% faster than HPolyC - * but still provably as secure, and also the εA∆U hash function of HBSH is + * but still provably as secure, and also the ε-∆U hash function of HBSH is * formally defined to take two inputs (tweak, message) which makes it difficult * to wrap with the crypto_shash API. Rather, some details need to be handled - * here. Nevertheless, if needed in the future, support for other εA∆U hash + * here. Nevertheless, if needed in the future, support for other ε-∆U hash * functions could be added here. */ @@ -41,7 +41,7 @@ #include "internal.h" /* - * Size of right-hand block of input data, in bytes; also the size of the block + * Size of right-hand part of input data, in bytes; also the size of the block * cipher's block size and the hash function's output. */ #define BLOCKCIPHER_BLOCK_SIZE 16 @@ -77,7 +77,7 @@ struct adiantum_tfm_ctx { struct adiantum_request_ctx { /* - * Buffer for right-hand block of data, i.e. + * Buffer for right-hand part of data, i.e. * * P_L => P_M => C_M => C_R when encrypting, or * C_R => C_M => P_M => P_L when decrypting. @@ -93,8 +93,8 @@ struct adiantum_request_ctx { bool enc; /* true if encrypting, false if decrypting */ /* - * The result of the Poly1305 εA∆U hash function applied to - * (message length, tweak). + * The result of the Poly1305 ε-∆U hash function applied to + * (bulk length, tweak) */ le128 header_hash; @@ -213,13 +213,16 @@ static inline void le128_sub(le128 *r, const le128 *v1, const le128 *v2) } /* - * Apply the Poly1305 εA∆U hash function to (message length, tweak) and save the - * result to rctx->header_hash. + * Apply the Poly1305 ε-∆U hash function to (bulk length, tweak) and save the + * result to rctx->header_hash. This is the calculation * - * This value is reused in both the first and second hash steps. Specifically, - * it's added to the result of an independently keyed εA∆U hash function (for - * equal length inputs only) taken over the message. This gives the overall - * Adiantum hash of the (tweak, message) pair. + * H_T ← Poly1305_{K_T}(bin_{128}(|L|) || T) + * + * from the procedure in section 6.4 of the Adiantum paper. The resulting value + * is reused in both the first and second hash steps. Specifically, it's added + * to the result of an independently keyed ε-∆U hash function (for equal length + * inputs only) taken over the left-hand part (the "bulk") of the message, to + * give the overall Adiantum hash of the (tweak, left-hand part) pair. */ static void adiantum_hash_header(struct skcipher_request *req) { @@ -248,7 +251,7 @@ static void adiantum_hash_header(struct skcipher_request *req) poly1305_core_emit(&state, &rctx->header_hash); } -/* Hash the left-hand block (the "bulk") of the message using NHPoly1305 */ +/* Hash the left-hand part (the "bulk") of the message using NHPoly1305 */ static int adiantum_hash_message(struct skcipher_request *req, struct scatterlist *sgl, le128 *digest) { @@ -550,7 +553,7 @@ static int adiantum_create(struct crypto_template *tmpl, struct rtattr **tb) goto out_drop_streamcipher; blockcipher_alg = ictx->blockcipher_spawn.alg; - /* NHPoly1305 εA∆U hash function */ + /* NHPoly1305 ε-∆U hash function */ _hash_alg = crypto_alg_mod_lookup(nhpoly1305_name, CRYPTO_ALG_TYPE_SHASH, CRYPTO_ALG_TYPE_MASK); diff --git a/crypto/nhpoly1305.c b/crypto/nhpoly1305.c index c8385853f699..ec831a5594d8 100644 --- a/crypto/nhpoly1305.c +++ b/crypto/nhpoly1305.c @@ -9,15 +9,15 @@ * "NHPoly1305" is the main component of Adiantum hashing. * Specifically, it is the calculation * - * H_M ← Poly1305_{K_M}(NH_{K_N}(pad_{128}(M))) + * H_L ← Poly1305_{K_L}(NH_{K_N}(pad_{128}(L))) * - * from the procedure in section A.5 of the Adiantum paper [1]. It is an - * ε-almost-∆-universal (εA∆U) hash function for equal-length inputs over + * from the procedure in section 6.4 of the Adiantum paper [1]. It is an + * ε-almost-∆-universal (ε-∆U) hash function for equal-length inputs over * Z/(2^{128}Z), where the "∆" operation is addition. It hashes 1024-byte * chunks of the input with the NH hash function [2], reducing the input length * by 32x. The resulting NH digests are evaluated as a polynomial in * GF(2^{130}-5), like in the Poly1305 MAC [3]. Note that the polynomial - * evaluation by itself would suffice to achieve the εA∆U property; NH is used + * evaluation by itself would suffice to achieve the ε-∆U property; NH is used * for performance since it's over twice as fast as Poly1305. * * This is *not* a cryptographic hash function; do not use it as such! -- cgit v1.2.3-59-g8ed1b From 0ac6b8fb23c724b015d9ca70a89126e8d1563166 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 6 Dec 2018 15:55:41 -0800 Subject: crypto: user - support incremental algorithm dumps CRYPTO_MSG_GETALG in NLM_F_DUMP mode sometimes doesn't return all registered crypto algorithms, because it doesn't support incremental dumps. crypto_dump_report() only permits itself to be called once, yet the netlink subsystem allocates at most ~64 KiB for the skb being dumped to. Thus only the first recvmsg() returns data, and it may only include a subset of the crypto algorithms even if the user buffer passed to recvmsg() is large enough to hold all of them. Fix this by using one of the arguments in the netlink_callback structure to keep track of the current position in the algorithm list. Then userspace can do multiple recvmsg() on the socket after sending the dump request. This is the way netlink dumps work elsewhere in the kernel; it's unclear why this was different (probably just an oversight). Also fix an integer overflow when calculating the dump buffer size hint. Fixes: a38f7907b926 ("crypto: Add userspace configuration API") Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/crypto_user_base.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/crypto/crypto_user_base.c b/crypto/crypto_user_base.c index 7021efbb35a1..5311fd7fae34 100644 --- a/crypto/crypto_user_base.c +++ b/crypto/crypto_user_base.c @@ -231,30 +231,33 @@ drop_alg: static int crypto_dump_report(struct sk_buff *skb, struct netlink_callback *cb) { - struct crypto_alg *alg; + const size_t start_pos = cb->args[0]; + size_t pos = 0; struct crypto_dump_info info; - int err; - - if (cb->args[0]) - goto out; - - cb->args[0] = 1; + struct crypto_alg *alg; + int res; info.in_skb = cb->skb; info.out_skb = skb; info.nlmsg_seq = cb->nlh->nlmsg_seq; info.nlmsg_flags = NLM_F_MULTI; + down_read(&crypto_alg_sem); list_for_each_entry(alg, &crypto_alg_list, cra_list) { - err = crypto_report_alg(alg, &info); - if (err) - goto out_err; + if (pos >= start_pos) { + res = crypto_report_alg(alg, &info); + if (res == -EMSGSIZE) + break; + if (res) + goto out; + } + pos++; } - + cb->args[0] = pos; + res = skb->len; out: - return skb->len; -out_err: - return err; + up_read(&crypto_alg_sem); + return res; } static int crypto_dump_report_done(struct netlink_callback *cb) @@ -442,7 +445,7 @@ static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, if ((type == (CRYPTO_MSG_GETALG - CRYPTO_MSG_BASE) && (nlh->nlmsg_flags & NLM_F_DUMP))) { struct crypto_alg *alg; - u16 dump_alloc = 0; + unsigned long dump_alloc = 0; if (link->dump == NULL) return -EINVAL; @@ -450,16 +453,16 @@ static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, down_read(&crypto_alg_sem); list_for_each_entry(alg, &crypto_alg_list, cra_list) dump_alloc += CRYPTO_REPORT_MAXSIZE; + up_read(&crypto_alg_sem); { struct netlink_dump_control c = { .dump = link->dump, .done = link->done, - .min_dump_alloc = dump_alloc, + .min_dump_alloc = min(dump_alloc, 65535UL), }; err = netlink_dump_start(crypto_nlsk, skb, nlh, &c); } - up_read(&crypto_alg_sem); return err; } -- cgit v1.2.3-59-g8ed1b From 00c9fe37a7f27a306bcaa5737f0787fe139f8aba Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 10 Dec 2018 11:45:58 -0800 Subject: crypto: adiantum - fix leaking reference to hash algorithm crypto_alg_mod_lookup() takes a reference to the hash algorithm but crypto_init_shash_spawn() doesn't take ownership of it, hence the reference needs to be dropped in adiantum_create(). Fixes: 059c2a4d8e16 ("crypto: adiantum - add Adiantum support") Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/adiantum.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/crypto/adiantum.c b/crypto/adiantum.c index e62e34f5e389..6651e713c45d 100644 --- a/crypto/adiantum.c +++ b/crypto/adiantum.c @@ -564,10 +564,8 @@ static int adiantum_create(struct crypto_template *tmpl, struct rtattr **tb) hash_alg = __crypto_shash_alg(_hash_alg); err = crypto_init_shash_spawn(&ictx->hash_spawn, hash_alg, skcipher_crypto_instance(inst)); - if (err) { - crypto_mod_put(_hash_alg); - goto out_drop_blockcipher; - } + if (err) + goto out_put_hash; /* Check the set of algorithms */ if (!adiantum_supported_algorithms(streamcipher_alg, blockcipher_alg, @@ -624,10 +622,13 @@ static int adiantum_create(struct crypto_template *tmpl, struct rtattr **tb) if (err) goto out_drop_hash; + crypto_mod_put(_hash_alg); return 0; out_drop_hash: crypto_drop_shash(&ictx->hash_spawn); +out_put_hash: + crypto_mod_put(_hash_alg); out_drop_blockcipher: crypto_drop_spawn(&ictx->blockcipher_spawn); out_drop_streamcipher: -- cgit v1.2.3-59-g8ed1b From f9b1d64678607e132051d232c7a2127f32947d64 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:56:45 +0000 Subject: crypto: aesni - Merge GCM_ENC_DEC The GCM_ENC_DEC routines for AVX and AVX2 are identical, except they call separate sub-macros. Pass the macros as arguments, and merge them. This facilitates additional refactoring, by requiring changes in only one place. The GCM_ENC_DEC macro was moved above the CONFIG_AS_AVX* ifdefs, since it will be used by both AVX and AVX2. Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 2481 +++++++++++++----------------- 1 file changed, 1083 insertions(+), 1398 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 1985ea0b551b..318135a77975 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -280,1252 +280,1250 @@ VARIABLE_OFFSET = 16*8 vaesenclast 16*10(arg1), \XMM0, \XMM0 .endm -#ifdef CONFIG_AS_AVX -############################################################################### -# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -# Input: A and B (128-bits each, bit-reflected) -# Output: C = A*B*x mod poly, (i.e. >>1 ) -# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input -# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. -############################################################################### -.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 +# combined for GCM encrypt and decrypt functions +# clobbering all xmm registers +# clobbering r10, r11, r12, r13, r14, r15 +.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC - vpshufd $0b01001110, \GH, \T2 - vpshufd $0b01001110, \HK, \T3 - vpxor \GH , \T2, \T2 # T2 = (a1+a0) - vpxor \HK , \T3, \T3 # T3 = (b1+b0) + #the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 - vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 - vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 - vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) - vpxor \GH, \T2,\T2 - vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 + mov %rsp, %r14 - vpslldq $8, \T2,\T3 # shift-L T3 2 DWs - vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs - vpxor \T3, \GH, \GH - vpxor \T2, \T1, \T1 # = GH x HK - #first phase of the reduction - vpslld $31, \GH, \T2 # packed right shifting << 31 - vpslld $30, \GH, \T3 # packed right shifting shift << 30 - vpslld $25, \GH, \T4 # packed right shifting shift << 25 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - vpsrldq $4, \T2, \T5 # shift-R T5 1 DW + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \GH, \GH # first phase of the reduction complete - #second phase of the reduction + vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey - vpsrld $1,\GH, \T2 # packed left shifting >> 1 - vpsrld $2,\GH, \T3 # packed left shifting >> 2 - vpsrld $7,\GH, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 + mov arg4, %r13 # save the number of bytes of plaintext/ciphertext + and $-16, %r13 # r13 = r13 - (r13 mod 16) - vpxor \T5, \T2, \T2 - vpxor \T2, \GH, \GH - vpxor \T1, \GH, \GH # the result is in GH + mov %r13, %r12 + shr $4, %r12 + and $7, %r12 + jz _initial_num_blocks_is_0\@ + cmp $7, %r12 + je _initial_num_blocks_is_7\@ + cmp $6, %r12 + je _initial_num_blocks_is_6\@ + cmp $5, %r12 + je _initial_num_blocks_is_5\@ + cmp $4, %r12 + je _initial_num_blocks_is_4\@ + cmp $3, %r12 + je _initial_num_blocks_is_3\@ + cmp $2, %r12 + je _initial_num_blocks_is_2\@ -.endm + jmp _initial_num_blocks_is_1\@ -.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 +_initial_num_blocks_is_7\@: + \INITIAL_BLOCKS 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*7, %r13 + jmp _initial_blocks_encrypted\@ - # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vmovdqa \HK, \T5 +_initial_num_blocks_is_6\@: + \INITIAL_BLOCKS 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*6, %r13 + jmp _initial_blocks_encrypted\@ - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_k(arg1) +_initial_num_blocks_is_5\@: + \INITIAL_BLOCKS 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*5, %r13 + jmp _initial_blocks_encrypted\@ - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly - vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_2_k(arg1) +_initial_num_blocks_is_4\@: + \INITIAL_BLOCKS 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*4, %r13 + jmp _initial_blocks_encrypted\@ - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly - vmovdqa \T5, HashKey_3(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_3_k(arg1) +_initial_num_blocks_is_3\@: + \INITIAL_BLOCKS 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*3, %r13 + jmp _initial_blocks_encrypted\@ - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly - vmovdqa \T5, HashKey_4(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_4_k(arg1) +_initial_num_blocks_is_2\@: + \INITIAL_BLOCKS 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*2, %r13 + jmp _initial_blocks_encrypted\@ - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly - vmovdqa \T5, HashKey_5(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_5_k(arg1) +_initial_num_blocks_is_1\@: + \INITIAL_BLOCKS 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*1, %r13 + jmp _initial_blocks_encrypted\@ - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly - vmovdqa \T5, HashKey_6(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_6_k(arg1) +_initial_num_blocks_is_0\@: + \INITIAL_BLOCKS 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly - vmovdqa \T5, HashKey_7(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_7_k(arg1) - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly - vmovdqa \T5, HashKey_8(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_8_k(arg1) +_initial_blocks_encrypted\@: + cmp $0, %r13 + je _zero_cipher_left\@ -.endm + sub $128, %r13 + je _eight_cipher_left\@ -## if a = number of total plaintext bytes -## b = floor(a/16) -## num_initial_blocks = b mod 4# -## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext -## r10, r11, r12, rax are clobbered -## arg1, arg2, arg3, r14 are used as a pointer only, not modified -.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC - i = (8-\num_initial_blocks) - j = 0 - setreg - mov arg6, %r10 # r10 = AAD - mov arg7, %r12 # r12 = aadLen + vmovd %xmm9, %r15d + and $255, %r15d + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - mov %r12, %r11 - vpxor reg_j, reg_j, reg_j - vpxor reg_i, reg_i, reg_i - cmp $16, %r11 - jl _get_AAD_rest8\@ -_get_AAD_blocks\@: - vmovdqu (%r10), reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 - add $16, %r10 - sub $16, %r12 - sub $16, %r11 - cmp $16, %r11 - jge _get_AAD_blocks\@ - vmovdqu reg_j, reg_i - cmp $0, %r11 - je _get_AAD_done\@ +_encrypt_by_8_new\@: + cmp $(255-8), %r15d + jg _encrypt_by_8\@ - vpxor reg_i, reg_i, reg_i - /* read the last <16B of AAD. since we have at least 4B of - data right after the AAD (the ICV, and maybe some CT), we can - read 4B/8B blocks safely, and then get rid of the extra stuff */ -_get_AAD_rest8\@: - cmp $4, %r11 - jle _get_AAD_rest4\@ - movq (%r10), \T1 - add $8, %r10 - sub $8, %r11 - vpslldq $8, \T1, \T1 - vpsrldq $8, reg_i, reg_i - vpxor \T1, reg_i, reg_i - jmp _get_AAD_rest8\@ -_get_AAD_rest4\@: - cmp $0, %r11 - jle _get_AAD_rest0\@ - mov (%r10), %eax - movq %rax, \T1 - add $4, %r10 - sub $4, %r11 - vpslldq $12, \T1, \T1 - vpsrldq $4, reg_i, reg_i - vpxor \T1, reg_i, reg_i -_get_AAD_rest0\@: - /* finalize: shift out the extra bytes we read, and align - left. since pslldq can only shift by an immediate, we use - vpshufb and an array of shuffle masks */ - movq %r12, %r11 - salq $4, %r11 - movdqu aad_shift_arr(%r11), \T1 - vpshufb \T1, reg_i, reg_i -_get_AAD_rest_final\@: - vpshufb SHUF_MASK(%rip), reg_i, reg_i - vpxor reg_j, reg_i, reg_i - GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 -_get_AAD_done\@: - # initialize the data pointer offset as zero - xor %r11d, %r11d + add $8, %r15b + \GHASH_8_ENCRYPT_8_PARALLEL %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ - # start AES for num_initial_blocks blocks - mov arg5, %rax # rax = *Y0 - vmovdqu (%rax), \CTR # CTR = Y0 - vpshufb SHUF_MASK(%rip), \CTR, \CTR + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + jmp _eight_cipher_left\@ +_encrypt_by_8\@: + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $8, %r15b + \GHASH_8_ENCRYPT_8_PARALLEL %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap - i = (i+1) - setreg -.endr + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - vmovdqa (arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpxor \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - j = 1 - setreg -.rep 9 - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenc \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - j = (j+1) - setreg -.endr - - - vmovdqa 16*10(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenclast \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vmovdqu (arg3, %r11), \T1 - vpxor \T1, reg_i, reg_i - vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks - add $16, %r11 -.if \ENC_DEC == DEC - vmovdqa \T1, reg_i -.endif - vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations - i = (i+1) - setreg -.endr +_eight_cipher_left\@: + \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 - i = (8-\num_initial_blocks) - j = (9-\num_initial_blocks) - setreg +_zero_cipher_left\@: + cmp $16, arg4 + jl _only_less_than_16\@ -.rep \num_initial_blocks - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks - i = (i+1) - j = (j+1) - setreg -.endr - # XMM8 has the combined result here + mov arg4, %r13 + and $15, %r13 # r13 = (arg4 mod 16) - vmovdqa \XMM8, TMP1(%rsp) - vmovdqa \XMM8, \T3 + je _multiple_of_16_bytes\@ - cmp $128, %r13 - jl _initial_blocks_done\@ # no need for precomputed constants + # handle the last <16 Byte block seperately -############################################################################### -# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM1 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM2 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM3 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + sub $16, %r11 + add %r13, %r11 + vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM4 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 # adjust the shuffle mask pointer to be + # able to shift 16-r13 bytes (r13 is the + # number of bytes in plaintext mod 16) + vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask + vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes + jmp _final_ghash_mul\@ - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM5 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap +_only_less_than_16\@: + # check for 0 length + mov arg4, %r13 + and $15, %r13 # r13 = (arg4 mod 16) - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM6 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + je _multiple_of_16_bytes\@ - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM7 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + # handle the last <16 Byte block separately - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM8 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - vmovdqa (arg1), \T_key - vpxor \T_key, \XMM1, \XMM1 - vpxor \T_key, \XMM2, \XMM2 - vpxor \T_key, \XMM3, \XMM3 - vpxor \T_key, \XMM4, \XMM4 - vpxor \T_key, \XMM5, \XMM5 - vpxor \T_key, \XMM6, \XMM6 - vpxor \T_key, \XMM7, \XMM7 - vpxor \T_key, \XMM8, \XMM8 + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) - i = 1 - setreg -.rep 9 # do 9 rounds - vmovdqa 16*i(arg1), \T_key - vaesenc \T_key, \XMM1, \XMM1 - vaesenc \T_key, \XMM2, \XMM2 - vaesenc \T_key, \XMM3, \XMM3 - vaesenc \T_key, \XMM4, \XMM4 - vaesenc \T_key, \XMM5, \XMM5 - vaesenc \T_key, \XMM6, \XMM6 - vaesenc \T_key, \XMM7, \XMM7 - vaesenc \T_key, \XMM8, \XMM8 - i = (i+1) - setreg -.endr + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 # adjust the shuffle mask pointer to be + # able to shift 16-r13 bytes (r13 is the + # number of bytes in plaintext mod 16) - vmovdqa 16*i(arg1), \T_key - vaesenclast \T_key, \XMM1, \XMM1 - vaesenclast \T_key, \XMM2, \XMM2 - vaesenclast \T_key, \XMM3, \XMM3 - vaesenclast \T_key, \XMM4, \XMM4 - vaesenclast \T_key, \XMM5, \XMM5 - vaesenclast \T_key, \XMM6, \XMM6 - vaesenclast \T_key, \XMM7, \XMM7 - vaesenclast \T_key, \XMM8, \XMM8 +_get_last_16_byte_loop\@: + movb (arg3, %r11), %al + movb %al, TMP1 (%rsp , %r11) + add $1, %r11 + cmp %r13, %r11 + jne _get_last_16_byte_loop\@ - vmovdqu (arg3, %r11), \T1 - vpxor \T1, \XMM1, \XMM1 - vmovdqu \XMM1, (arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM1 - .endif + vmovdqu TMP1(%rsp), %xmm1 - vmovdqu 16*1(arg3, %r11), \T1 - vpxor \T1, \XMM2, \XMM2 - vmovdqu \XMM2, 16*1(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM2 - .endif + sub $16, %r11 - vmovdqu 16*2(arg3, %r11), \T1 - vpxor \T1, \XMM3, \XMM3 - vmovdqu \XMM3, 16*2(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM3 - .endif +_final_ghash_mul\@: + .if \ENC_DEC == DEC + vmovdqa %xmm1, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to + # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm2, %xmm2 + vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm14, %xmm14 + #GHASH computation for the last <16 Byte block + \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + sub %r13, %r11 + add $16, %r11 + .else + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to + # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + #GHASH computation for the last <16 Byte block + \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + sub %r13, %r11 + add $16, %r11 + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext + .endif - vmovdqu 16*3(arg3, %r11), \T1 - vpxor \T1, \XMM4, \XMM4 - vmovdqu \XMM4, 16*3(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM4 - .endif - vmovdqu 16*4(arg3, %r11), \T1 - vpxor \T1, \XMM5, \XMM5 - vmovdqu \XMM5, 16*4(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM5 - .endif + ############################# + # output r13 Bytes + vmovq %xmm9, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left\@ - vmovdqu 16*5(arg3, %r11), \T1 - vpxor \T1, \XMM6, \XMM6 - vmovdqu \XMM6, 16*5(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM6 - .endif + mov %rax, (arg2 , %r11) + add $8, %r11 + vpsrldq $8, %xmm9, %xmm9 + vmovq %xmm9, %rax + sub $8, %r13 - vmovdqu 16*6(arg3, %r11), \T1 - vpxor \T1, \XMM7, \XMM7 - vmovdqu \XMM7, 16*6(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM7 - .endif +_less_than_8_bytes_left\@: + movb %al, (arg2 , %r11) + add $1, %r11 + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left\@ + ############################# - vmovdqu 16*7(arg3, %r11), \T1 - vpxor \T1, \XMM8, \XMM8 - vmovdqu \XMM8, 16*7(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM8 - .endif +_multiple_of_16_bytes\@: + mov arg7, %r12 # r12 = aadLen (number of bytes) + shl $3, %r12 # convert into number of bits + vmovd %r12d, %xmm15 # len(A) in xmm15 - add $128, %r11 + shl $3, arg4 # len(C) in bits (*128) + vmovq arg4, %xmm1 + vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 + vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + vpxor %xmm15, %xmm14, %xmm14 + \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation + vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap -############################################################################### + mov arg5, %rax # rax = *Y0 + vmovdqu (%rax), %xmm9 # xmm9 = Y0 -_initial_blocks_done\@: + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) -.endm + vpxor %xmm14, %xmm9, %xmm9 -# encrypt 8 blocks at a time -# ghash the 8 previously encrypted ciphertext blocks -# arg1, arg2, arg3 are used as pointers only, not modified -# r11 is the data offset value -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC - vmovdqa \XMM1, \T2 - vmovdqa \XMM2, TMP2(%rsp) - vmovdqa \XMM3, TMP3(%rsp) - vmovdqa \XMM4, TMP4(%rsp) - vmovdqa \XMM5, TMP5(%rsp) - vmovdqa \XMM6, TMP6(%rsp) - vmovdqa \XMM7, TMP7(%rsp) - vmovdqa \XMM8, TMP8(%rsp) -.if \loop_idx == in_order - vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONE(%rip), \XMM1, \XMM2 - vpaddd ONE(%rip), \XMM2, \XMM3 - vpaddd ONE(%rip), \XMM3, \XMM4 - vpaddd ONE(%rip), \XMM4, \XMM5 - vpaddd ONE(%rip), \XMM5, \XMM6 - vpaddd ONE(%rip), \XMM6, \XMM7 - vpaddd ONE(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR +_return_T\@: + mov arg8, %r10 # r10 = authTag + mov arg9, %r11 # r11 = auth_tag_len - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap -.else - vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONEf(%rip), \XMM1, \XMM2 - vpaddd ONEf(%rip), \XMM2, \XMM3 - vpaddd ONEf(%rip), \XMM3, \XMM4 - vpaddd ONEf(%rip), \XMM4, \XMM5 - vpaddd ONEf(%rip), \XMM5, \XMM6 - vpaddd ONEf(%rip), \XMM6, \XMM7 - vpaddd ONEf(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR -.endif + cmp $16, %r11 + je _T_16\@ + cmp $8, %r11 + jl _T_4\@ - ####################################################################### +_T_8\@: + vmovq %xmm9, %rax + mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 + vpsrldq $8, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_4\@: + vmovd %xmm9, %eax + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + vpsrldq $4, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_123\@: + vmovd %xmm9, %eax + cmp $2, %r11 + jl _T_1\@ + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done\@ + add $2, %r10 + sar $16, %eax +_T_1\@: + mov %al, (%r10) + jmp _return_T_done\@ - vmovdqu (arg1), \T1 - vpxor \T1, \XMM1, \XMM1 - vpxor \T1, \XMM2, \XMM2 - vpxor \T1, \XMM3, \XMM3 - vpxor \T1, \XMM4, \XMM4 - vpxor \T1, \XMM5, \XMM5 - vpxor \T1, \XMM6, \XMM6 - vpxor \T1, \XMM7, \XMM7 - vpxor \T1, \XMM8, \XMM8 +_T_16\@: + vmovdqu %xmm9, (%r10) - ####################################################################### +_return_T_done\@: + mov %r14, %rsp + pop %r15 + pop %r14 + pop %r13 + pop %r12 +.endm +#ifdef CONFIG_AS_AVX +############################################################################### +# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +# Input: A and B (128-bits each, bit-reflected) +# Output: C = A*B*x mod poly, (i.e. >>1 ) +# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +############################################################################### +.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 + vpshufd $0b01001110, \GH, \T2 + vpshufd $0b01001110, \HK, \T3 + vpxor \GH , \T2, \T2 # T2 = (a1+a0) + vpxor \HK , \T3, \T3 # T3 = (b1+b0) + vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 + vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 + vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) + vpxor \GH, \T2,\T2 + vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 - vmovdqu 16*1(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 + vpslldq $8, \T2,\T3 # shift-L T3 2 DWs + vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs + vpxor \T3, \GH, \GH + vpxor \T2, \T1, \T1 # = GH x HK - vmovdqu 16*2(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 + #first phase of the reduction + vpslld $31, \GH, \T2 # packed right shifting << 31 + vpslld $30, \GH, \T3 # packed right shifting shift << 30 + vpslld $25, \GH, \T4 # packed right shifting shift << 25 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 - ####################################################################### + vpsrldq $4, \T2, \T5 # shift-R T5 1 DW - vmovdqa HashKey_8(arg1), \T5 - vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 - vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \GH, \GH # first phase of the reduction complete - vpshufd $0b01001110, \T2, \T6 - vpxor \T2, \T6, \T6 + #second phase of the reduction - vmovdqa HashKey_8_k(arg1), \T5 - vpclmulqdq $0x00, \T5, \T6, \T6 + vpsrld $1,\GH, \T2 # packed left shifting >> 1 + vpsrld $2,\GH, \T3 # packed left shifting >> 2 + vpsrld $7,\GH, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 - vmovdqu 16*3(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 + vpxor \T5, \T2, \T2 + vpxor \T2, \GH, \GH + vpxor \T1, \GH, \GH # the result is in GH - vmovdqa TMP2(%rsp), \T1 - vmovdqa HashKey_7(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_7_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 +.endm - vmovdqu 16*4(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 +.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 - ####################################################################### + # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vmovdqa \HK, \T5 - vmovdqa TMP3(%rsp), \T1 - vmovdqa HashKey_6(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_k(arg1) - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_6_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly + vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_2_k(arg1) - vmovdqu 16*5(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly + vmovdqa \T5, HashKey_3(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_3_k(arg1) - vmovdqa TMP4(%rsp), \T1 - vmovdqa HashKey_5(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly + vmovdqa \T5, HashKey_4(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_4_k(arg1) - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_5_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly + vmovdqa \T5, HashKey_5(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_5_k(arg1) - vmovdqu 16*6(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly + vmovdqa \T5, HashKey_6(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_6_k(arg1) + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly + vmovdqa \T5, HashKey_7(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_7_k(arg1) - vmovdqa TMP5(%rsp), \T1 - vmovdqa HashKey_4(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly + vmovdqa \T5, HashKey_8(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_8_k(arg1) - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_4_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 +.endm - vmovdqu 16*7(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 +## if a = number of total plaintext bytes +## b = floor(a/16) +## num_initial_blocks = b mod 4# +## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext +## r10, r11, r12, rax are clobbered +## arg1, arg2, arg3, r14 are used as a pointer only, not modified - vmovdqa TMP6(%rsp), \T1 - vmovdqa HashKey_3(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 +.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC + i = (8-\num_initial_blocks) + j = 0 + setreg - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_3_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 + mov arg6, %r10 # r10 = AAD + mov arg7, %r12 # r12 = aadLen - vmovdqu 16*8(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 + mov %r12, %r11 - vmovdqa TMP7(%rsp), \T1 - vmovdqa HashKey_2(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 + vpxor reg_j, reg_j, reg_j + vpxor reg_i, reg_i, reg_i + cmp $16, %r11 + jl _get_AAD_rest8\@ +_get_AAD_blocks\@: + vmovdqu (%r10), reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\@ + vmovdqu reg_j, reg_i + cmp $0, %r11 + je _get_AAD_done\@ - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_2_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 + vpxor reg_i, reg_i, reg_i - ####################################################################### + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some CT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\@: + cmp $4, %r11 + jle _get_AAD_rest4\@ + movq (%r10), \T1 + add $8, %r10 + sub $8, %r11 + vpslldq $8, \T1, \T1 + vpsrldq $8, reg_i, reg_i + vpxor \T1, reg_i, reg_i + jmp _get_AAD_rest8\@ +_get_AAD_rest4\@: + cmp $0, %r11 + jle _get_AAD_rest0\@ + mov (%r10), %eax + movq %rax, \T1 + add $4, %r10 + sub $4, %r11 + vpslldq $12, \T1, \T1 + vpsrldq $4, reg_i, reg_i + vpxor \T1, reg_i, reg_i +_get_AAD_rest0\@: + /* finalize: shift out the extra bytes we read, and align + left. since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \T1 + vpshufb \T1, reg_i, reg_i +_get_AAD_rest_final\@: + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_j, reg_i, reg_i + GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 - vmovdqu 16*9(arg1), \T5 - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 +_get_AAD_done\@: + # initialize the data pointer offset as zero + xor %r11d, %r11d - vmovdqa TMP8(%rsp), \T1 - vmovdqa HashKey(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 + # start AES for num_initial_blocks blocks + mov arg5, %rax # rax = *Y0 + vmovdqu (%rax), \CTR # CTR = Y0 + vpshufb SHUF_MASK(%rip), \CTR, \CTR - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - vpxor \T4, \T6, \T6 - vpxor \T7, \T6, \T6 + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap + i = (i+1) + setreg +.endr - vmovdqu 16*10(arg1), \T5 + vmovdqa (arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpxor \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr - i = 0 j = 1 setreg -.rep 8 - vpxor 16*i(arg3, %r11), \T5, \T2 - .if \ENC_DEC == ENC - vaesenclast \T2, reg_j, reg_j - .else - vaesenclast \T2, reg_j, \T3 - vmovdqu 16*i(arg3, %r11), reg_j - vmovdqu \T3, 16*i(arg2, %r11) - .endif +.rep 9 + vmovdqa 16*j(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenc \T_key, reg_i, reg_i i = (i+1) + setreg +.endr + j = (j+1) setreg .endr - ####################################################################### - vpslldq $8, \T6, \T3 # shift-L T3 2 DWs - vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs - vpxor \T3, \T7, \T7 - vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 + vmovdqa 16*10(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenclast \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vmovdqu (arg3, %r11), \T1 + vpxor \T1, reg_i, reg_i + vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks + add $16, %r11 +.if \ENC_DEC == DEC + vmovdqa \T1, reg_i +.endif + vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations + i = (i+1) + setreg +.endr - ####################################################################### - #first phase of the reduction - ####################################################################### - vpslld $31, \T7, \T2 # packed right shifting << 31 - vpslld $30, \T7, \T3 # packed right shifting shift << 30 - vpslld $25, \T7, \T4 # packed right shifting shift << 25 + i = (8-\num_initial_blocks) + j = (9-\num_initial_blocks) + setreg - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 +.rep \num_initial_blocks + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks + i = (i+1) + j = (j+1) + setreg +.endr + # XMM8 has the combined result here - vpsrldq $4, \T2, \T1 # shift-R T1 1 DW + vmovdqa \XMM8, TMP1(%rsp) + vmovdqa \XMM8, \T3 - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - .if \ENC_DEC == ENC - vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer - .endif + cmp $128, %r13 + jl _initial_blocks_done\@ # no need for precomputed constants - ####################################################################### - #second phase of the reduction - vpsrld $1, \T7, \T2 # packed left shifting >> 1 - vpsrld $2, \T7, \T3 # packed left shifting >> 2 - vpsrld $7, \T7, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 +############################################################################### +# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM1 + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpxor \T1, \T2, \T2 - vpxor \T2, \T7, \T7 - vpxor \T7, \T6, \T6 # the result is in T6 - ####################################################################### + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM2 + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM3 + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM4 + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpxor \T6, \XMM1, \XMM1 + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM5 + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM6 + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM7 + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap -.endm + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM8 + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + vmovdqa (arg1), \T_key + vpxor \T_key, \XMM1, \XMM1 + vpxor \T_key, \XMM2, \XMM2 + vpxor \T_key, \XMM3, \XMM3 + vpxor \T_key, \XMM4, \XMM4 + vpxor \T_key, \XMM5, \XMM5 + vpxor \T_key, \XMM6, \XMM6 + vpxor \T_key, \XMM7, \XMM7 + vpxor \T_key, \XMM8, \XMM8 -# GHASH the last 4 ciphertext blocks. -.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 + i = 1 + setreg +.rep 9 # do 9 rounds + vmovdqa 16*i(arg1), \T_key + vaesenc \T_key, \XMM1, \XMM1 + vaesenc \T_key, \XMM2, \XMM2 + vaesenc \T_key, \XMM3, \XMM3 + vaesenc \T_key, \XMM4, \XMM4 + vaesenc \T_key, \XMM5, \XMM5 + vaesenc \T_key, \XMM6, \XMM6 + vaesenc \T_key, \XMM7, \XMM7 + vaesenc \T_key, \XMM8, \XMM8 + i = (i+1) + setreg +.endr - ## Karatsuba Method + vmovdqa 16*i(arg1), \T_key + vaesenclast \T_key, \XMM1, \XMM1 + vaesenclast \T_key, \XMM2, \XMM2 + vaesenclast \T_key, \XMM3, \XMM3 + vaesenclast \T_key, \XMM4, \XMM4 + vaesenclast \T_key, \XMM5, \XMM5 + vaesenclast \T_key, \XMM6, \XMM6 + vaesenclast \T_key, \XMM7, \XMM7 + vaesenclast \T_key, \XMM8, \XMM8 - vpshufd $0b01001110, \XMM1, \T2 - vpxor \XMM1, \T2, \T2 - vmovdqa HashKey_8(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM1, \T6 - vpclmulqdq $0x00, \T5, \XMM1, \T7 + vmovdqu (arg3, %r11), \T1 + vpxor \T1, \XMM1, \XMM1 + vmovdqu \XMM1, (arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM1 + .endif - vmovdqa HashKey_8_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \XMM1 + vmovdqu 16*1(arg3, %r11), \T1 + vpxor \T1, \XMM2, \XMM2 + vmovdqu \XMM2, 16*1(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM2 + .endif - ###################### + vmovdqu 16*2(arg3, %r11), \T1 + vpxor \T1, \XMM3, \XMM3 + vmovdqu \XMM3, 16*2(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM3 + .endif - vpshufd $0b01001110, \XMM2, \T2 - vpxor \XMM2, \T2, \T2 - vmovdqa HashKey_7(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM2, \T4 - vpxor \T4, \T6, \T6 + vmovdqu 16*3(arg3, %r11), \T1 + vpxor \T1, \XMM4, \XMM4 + vmovdqu \XMM4, 16*3(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM4 + .endif - vpclmulqdq $0x00, \T5, \XMM2, \T4 - vpxor \T4, \T7, \T7 + vmovdqu 16*4(arg3, %r11), \T1 + vpxor \T1, \XMM5, \XMM5 + vmovdqu \XMM5, 16*4(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM5 + .endif - vmovdqa HashKey_7_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 + vmovdqu 16*5(arg3, %r11), \T1 + vpxor \T1, \XMM6, \XMM6 + vmovdqu \XMM6, 16*5(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM6 + .endif - ###################### + vmovdqu 16*6(arg3, %r11), \T1 + vpxor \T1, \XMM7, \XMM7 + vmovdqu \XMM7, 16*6(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM7 + .endif - vpshufd $0b01001110, \XMM3, \T2 - vpxor \XMM3, \T2, \T2 - vmovdqa HashKey_6(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM3, \T4 - vpxor \T4, \T6, \T6 + vmovdqu 16*7(arg3, %r11), \T1 + vpxor \T1, \XMM8, \XMM8 + vmovdqu \XMM8, 16*7(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM8 + .endif - vpclmulqdq $0x00, \T5, \XMM3, \T4 - vpxor \T4, \T7, \T7 + add $128, %r11 - vmovdqa HashKey_6_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - ###################### +############################################################################### - vpshufd $0b01001110, \XMM4, \T2 - vpxor \XMM4, \T2, \T2 - vmovdqa HashKey_5(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM4, \T4 - vpxor \T4, \T6, \T6 +_initial_blocks_done\@: - vpclmulqdq $0x00, \T5, \XMM4, \T4 - vpxor \T4, \T7, \T7 +.endm - vmovdqa HashKey_5_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 +# encrypt 8 blocks at a time +# ghash the 8 previously encrypted ciphertext blocks +# arg1, arg2, arg3 are used as pointers only, not modified +# r11 is the data offset value +.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC - ###################### + vmovdqa \XMM1, \T2 + vmovdqa \XMM2, TMP2(%rsp) + vmovdqa \XMM3, TMP3(%rsp) + vmovdqa \XMM4, TMP4(%rsp) + vmovdqa \XMM5, TMP5(%rsp) + vmovdqa \XMM6, TMP6(%rsp) + vmovdqa \XMM7, TMP7(%rsp) + vmovdqa \XMM8, TMP8(%rsp) - vpshufd $0b01001110, \XMM5, \T2 - vpxor \XMM5, \T2, \T2 - vmovdqa HashKey_4(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM5, \T4 - vpxor \T4, \T6, \T6 +.if \loop_idx == in_order + vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONE(%rip), \XMM1, \XMM2 + vpaddd ONE(%rip), \XMM2, \XMM3 + vpaddd ONE(%rip), \XMM3, \XMM4 + vpaddd ONE(%rip), \XMM4, \XMM5 + vpaddd ONE(%rip), \XMM5, \XMM6 + vpaddd ONE(%rip), \XMM6, \XMM7 + vpaddd ONE(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR - vpclmulqdq $0x00, \T5, \XMM5, \T4 - vpxor \T4, \T7, \T7 + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap +.else + vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONEf(%rip), \XMM1, \XMM2 + vpaddd ONEf(%rip), \XMM2, \XMM3 + vpaddd ONEf(%rip), \XMM3, \XMM4 + vpaddd ONEf(%rip), \XMM4, \XMM5 + vpaddd ONEf(%rip), \XMM5, \XMM6 + vpaddd ONEf(%rip), \XMM6, \XMM7 + vpaddd ONEf(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR +.endif - vmovdqa HashKey_4_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - ###################### + ####################################################################### - vpshufd $0b01001110, \XMM6, \T2 - vpxor \XMM6, \T2, \T2 - vmovdqa HashKey_3(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM6, \T4 - vpxor \T4, \T6, \T6 + vmovdqu (arg1), \T1 + vpxor \T1, \XMM1, \XMM1 + vpxor \T1, \XMM2, \XMM2 + vpxor \T1, \XMM3, \XMM3 + vpxor \T1, \XMM4, \XMM4 + vpxor \T1, \XMM5, \XMM5 + vpxor \T1, \XMM6, \XMM6 + vpxor \T1, \XMM7, \XMM7 + vpxor \T1, \XMM8, \XMM8 - vpclmulqdq $0x00, \T5, \XMM6, \T4 - vpxor \T4, \T7, \T7 + ####################################################################### - vmovdqa HashKey_3_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - ###################### - vpshufd $0b01001110, \XMM7, \T2 - vpxor \XMM7, \T2, \T2 - vmovdqa HashKey_2(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM7, \T4 - vpxor \T4, \T6, \T6 - vpclmulqdq $0x00, \T5, \XMM7, \T4 - vpxor \T4, \T7, \T7 - vmovdqa HashKey_2_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 + vmovdqu 16*1(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 - ###################### + vmovdqu 16*2(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 - vpshufd $0b01001110, \XMM8, \T2 - vpxor \XMM8, \T2, \T2 - vmovdqa HashKey(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM8, \T4 - vpxor \T4, \T6, \T6 - vpclmulqdq $0x00, \T5, \XMM8, \T4 - vpxor \T4, \T7, \T7 + ####################################################################### - vmovdqa HashKey_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 + vmovdqa HashKey_8(arg1), \T5 + vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 + vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 - vpxor \T2, \XMM1, \XMM1 - vpxor \T6, \XMM1, \XMM1 - vpxor \T7, \XMM1, \T2 + vpshufd $0b01001110, \T2, \T6 + vpxor \T2, \T6, \T6 + vmovdqa HashKey_8_k(arg1), \T5 + vpclmulqdq $0x00, \T5, \T6, \T6 + vmovdqu 16*3(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + vmovdqa TMP2(%rsp), \T1 + vmovdqa HashKey_7(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 - vpslldq $8, \T2, \T4 - vpsrldq $8, \T2, \T2 + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_7_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 - vpxor \T4, \T7, \T7 - vpxor \T2, \T6, \T6 # holds the result of - # the accumulated carry-less multiplications + vmovdqu 16*4(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 - ####################################################################### - #first phase of the reduction - vpslld $31, \T7, \T2 # packed right shifting << 31 - vpslld $30, \T7, \T3 # packed right shifting shift << 30 - vpslld $25, \T7, \T4 # packed right shifting shift << 25 + ####################################################################### - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 + vmovdqa TMP3(%rsp), \T1 + vmovdqa HashKey_6(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 - vpsrldq $4, \T2, \T1 # shift-R T1 1 DW + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_6_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### + vmovdqu 16*5(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + vmovdqa TMP4(%rsp), \T1 + vmovdqa HashKey_5(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 - #second phase of the reduction - vpsrld $1, \T7, \T2 # packed left shifting >> 1 - vpsrld $2, \T7, \T3 # packed left shifting >> 2 - vpsrld $7, \T7, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_5_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 - vpxor \T1, \T2, \T2 - vpxor \T2, \T7, \T7 - vpxor \T7, \T6, \T6 # the result is in T6 + vmovdqu 16*6(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 -.endm + vmovdqa TMP5(%rsp), \T1 + vmovdqa HashKey_4(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 -# combined for GCM encrypt and decrypt functions -# clobbering all xmm registers -# clobbering r10, r11, r12, r13, r14, r15 -.macro GCM_ENC_DEC_AVX ENC_DEC + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_4_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 - #the number of pushes must equal STACK_OFFSET - push %r12 - push %r13 - push %r14 - push %r15 + vmovdqu 16*7(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 - mov %rsp, %r14 + vmovdqa TMP6(%rsp), \T1 + vmovdqa HashKey_3(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_3_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + vmovdqu 16*8(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes + vmovdqa TMP7(%rsp), \T1 + vmovdqa HashKey_2(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_2_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 - vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey + ####################################################################### - mov arg4, %r13 # save the number of bytes of plaintext/ciphertext - and $-16, %r13 # r13 = r13 - (r13 mod 16) + vmovdqu 16*9(arg1), \T5 + vaesenc \T5, \XMM1, \XMM1 + vaesenc \T5, \XMM2, \XMM2 + vaesenc \T5, \XMM3, \XMM3 + vaesenc \T5, \XMM4, \XMM4 + vaesenc \T5, \XMM5, \XMM5 + vaesenc \T5, \XMM6, \XMM6 + vaesenc \T5, \XMM7, \XMM7 + vaesenc \T5, \XMM8, \XMM8 - mov %r13, %r12 - shr $4, %r12 - and $7, %r12 - jz _initial_num_blocks_is_0\@ + vmovdqa TMP8(%rsp), \T1 + vmovdqa HashKey(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 - cmp $7, %r12 - je _initial_num_blocks_is_7\@ - cmp $6, %r12 - je _initial_num_blocks_is_6\@ - cmp $5, %r12 - je _initial_num_blocks_is_5\@ - cmp $4, %r12 - je _initial_num_blocks_is_4\@ - cmp $3, %r12 - je _initial_num_blocks_is_3\@ - cmp $2, %r12 - je _initial_num_blocks_is_2\@ + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 - jmp _initial_num_blocks_is_1\@ + vpxor \T4, \T6, \T6 + vpxor \T7, \T6, \T6 -_initial_num_blocks_is_7\@: - INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*7, %r13 - jmp _initial_blocks_encrypted\@ + vmovdqu 16*10(arg1), \T5 -_initial_num_blocks_is_6\@: - INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*6, %r13 - jmp _initial_blocks_encrypted\@ + i = 0 + j = 1 + setreg +.rep 8 + vpxor 16*i(arg3, %r11), \T5, \T2 + .if \ENC_DEC == ENC + vaesenclast \T2, reg_j, reg_j + .else + vaesenclast \T2, reg_j, \T3 + vmovdqu 16*i(arg3, %r11), reg_j + vmovdqu \T3, 16*i(arg2, %r11) + .endif + i = (i+1) + j = (j+1) + setreg +.endr + ####################################################################### -_initial_num_blocks_is_5\@: - INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*5, %r13 - jmp _initial_blocks_encrypted\@ -_initial_num_blocks_is_4\@: - INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*4, %r13 - jmp _initial_blocks_encrypted\@ + vpslldq $8, \T6, \T3 # shift-L T3 2 DWs + vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs + vpxor \T3, \T7, \T7 + vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 -_initial_num_blocks_is_3\@: - INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*3, %r13 - jmp _initial_blocks_encrypted\@ -_initial_num_blocks_is_2\@: - INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*2, %r13 - jmp _initial_blocks_encrypted\@ -_initial_num_blocks_is_1\@: - INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*1, %r13 - jmp _initial_blocks_encrypted\@ + ####################################################################### + #first phase of the reduction + ####################################################################### + vpslld $31, \T7, \T2 # packed right shifting << 31 + vpslld $30, \T7, \T3 # packed right shifting shift << 30 + vpslld $25, \T7, \T4 # packed right shifting shift << 25 -_initial_num_blocks_is_0\@: - INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + vpsrldq $4, \T2, \T1 # shift-R T1 1 DW -_initial_blocks_encrypted\@: - cmp $0, %r13 - je _zero_cipher_left\@ + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### + .if \ENC_DEC == ENC + vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer + .endif - sub $128, %r13 - je _eight_cipher_left\@ + ####################################################################### + #second phase of the reduction + vpsrld $1, \T7, \T2 # packed left shifting >> 1 + vpsrld $2, \T7, \T3 # packed left shifting >> 2 + vpsrld $7, \T7, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + vpxor \T1, \T2, \T2 + vpxor \T2, \T7, \T7 + vpxor \T7, \T6, \T6 # the result is in T6 + ####################################################################### + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - vmovd %xmm9, %r15d - and $255, %r15d - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpxor \T6, \XMM1, \XMM1 -_encrypt_by_8_new\@: - cmp $(255-8), %r15d - jg _encrypt_by_8\@ +.endm - add $8, %r15b - GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC - add $128, %r11 - sub $128, %r13 - jne _encrypt_by_8_new\@ +# GHASH the last 4 ciphertext blocks. +.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - jmp _eight_cipher_left\@ + ## Karatsuba Method -_encrypt_by_8\@: - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $8, %r15b - GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $128, %r11 - sub $128, %r13 - jne _encrypt_by_8_new\@ - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpshufd $0b01001110, \XMM1, \T2 + vpxor \XMM1, \T2, \T2 + vmovdqa HashKey_8(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM1, \T6 + vpclmulqdq $0x00, \T5, \XMM1, \T7 + vmovdqa HashKey_8_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \XMM1 + ###################### + vpshufd $0b01001110, \XMM2, \T2 + vpxor \XMM2, \T2, \T2 + vmovdqa HashKey_7(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM2, \T4 + vpxor \T4, \T6, \T6 -_eight_cipher_left\@: - GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 + vpclmulqdq $0x00, \T5, \XMM2, \T4 + vpxor \T4, \T7, \T7 + vmovdqa HashKey_7_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 -_zero_cipher_left\@: - cmp $16, arg4 - jl _only_less_than_16\@ + ###################### - mov arg4, %r13 - and $15, %r13 # r13 = (arg4 mod 16) + vpshufd $0b01001110, \XMM3, \T2 + vpxor \XMM3, \T2, \T2 + vmovdqa HashKey_6(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM3, \T4 + vpxor \T4, \T6, \T6 - je _multiple_of_16_bytes\@ + vpclmulqdq $0x00, \T5, \XMM3, \T4 + vpxor \T4, \T7, \T7 - # handle the last <16 Byte block seperately + vmovdqa HashKey_6_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + ###################### - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + vpshufd $0b01001110, \XMM4, \T2 + vpxor \XMM4, \T2, \T2 + vmovdqa HashKey_5(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM4, \T4 + vpxor \T4, \T6, \T6 - sub $16, %r11 - add %r13, %r11 - vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block + vpclmulqdq $0x00, \T5, \XMM4, \T4 + vpxor \T4, \T7, \T7 - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer to be - # able to shift 16-r13 bytes (r13 is the - # number of bytes in plaintext mod 16) - vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask - vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes - jmp _final_ghash_mul\@ + vmovdqa HashKey_5_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 -_only_less_than_16\@: - # check for 0 length - mov arg4, %r13 - and $15, %r13 # r13 = (arg4 mod 16) + ###################### - je _multiple_of_16_bytes\@ + vpshufd $0b01001110, \XMM5, \T2 + vpxor \XMM5, \T2, \T2 + vmovdqa HashKey_4(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM5, \T4 + vpxor \T4, \T6, \T6 - # handle the last <16 Byte block seperately + vpclmulqdq $0x00, \T5, \XMM5, \T4 + vpxor \T4, \T7, \T7 + vmovdqa HashKey_4_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + ###################### + vpshufd $0b01001110, \XMM6, \T2 + vpxor \XMM6, \T2, \T2 + vmovdqa HashKey_3(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM6, \T4 + vpxor \T4, \T6, \T6 - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer to be - # able to shift 16-r13 bytes (r13 is the - # number of bytes in plaintext mod 16) + vpclmulqdq $0x00, \T5, \XMM6, \T4 + vpxor \T4, \T7, \T7 -_get_last_16_byte_loop\@: - movb (arg3, %r11), %al - movb %al, TMP1 (%rsp , %r11) - add $1, %r11 - cmp %r13, %r11 - jne _get_last_16_byte_loop\@ + vmovdqa HashKey_3_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 - vmovdqu TMP1(%rsp), %xmm1 + ###################### - sub $16, %r11 + vpshufd $0b01001110, \XMM7, \T2 + vpxor \XMM7, \T2, \T2 + vmovdqa HashKey_2(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM7, \T4 + vpxor \T4, \T6, \T6 -_final_ghash_mul\@: - .if \ENC_DEC == DEC - vmovdqa %xmm1, %xmm2 - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to - # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm2, %xmm2 - vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm14, %xmm14 - #GHASH computation for the last <16 Byte block - GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - sub %r13, %r11 - add $16, %r11 - .else - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to - # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - vpxor %xmm9, %xmm14, %xmm14 - #GHASH computation for the last <16 Byte block - GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - sub %r13, %r11 - add $16, %r11 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext - .endif + vpclmulqdq $0x00, \T5, \XMM7, \T4 + vpxor \T4, \T7, \T7 + vmovdqa HashKey_2_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 - ############################# - # output r13 Bytes - vmovq %xmm9, %rax - cmp $8, %r13 - jle _less_than_8_bytes_left\@ + ###################### - mov %rax, (arg2 , %r11) - add $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - vmovq %xmm9, %rax - sub $8, %r13 + vpshufd $0b01001110, \XMM8, \T2 + vpxor \XMM8, \T2, \T2 + vmovdqa HashKey(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM8, \T4 + vpxor \T4, \T6, \T6 -_less_than_8_bytes_left\@: - movb %al, (arg2 , %r11) - add $1, %r11 - shr $8, %rax - sub $1, %r13 - jne _less_than_8_bytes_left\@ - ############################# + vpclmulqdq $0x00, \T5, \XMM8, \T4 + vpxor \T4, \T7, \T7 -_multiple_of_16_bytes\@: - mov arg7, %r12 # r12 = aadLen (number of bytes) - shl $3, %r12 # convert into number of bits - vmovd %r12d, %xmm15 # len(A) in xmm15 + vmovdqa HashKey_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 - shl $3, arg4 # len(C) in bits (*128) - vmovq arg4, %xmm1 - vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 - vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) + vpxor \T2, \XMM1, \XMM1 + vpxor \T6, \XMM1, \XMM1 + vpxor \T7, \XMM1, \T2 - vpxor %xmm15, %xmm14, %xmm14 - GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation - vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap - mov arg5, %rax # rax = *Y0 - vmovdqu (%rax), %xmm9 # xmm9 = Y0 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) - vpxor %xmm14, %xmm9, %xmm9 + vpslldq $8, \T2, \T4 + vpsrldq $8, \T2, \T2 + vpxor \T4, \T7, \T7 + vpxor \T2, \T6, \T6 # holds the result of + # the accumulated carry-less multiplications + ####################################################################### + #first phase of the reduction + vpslld $31, \T7, \T2 # packed right shifting << 31 + vpslld $30, \T7, \T3 # packed right shifting shift << 30 + vpslld $25, \T7, \T4 # packed right shifting shift << 25 -_return_T\@: - mov arg8, %r10 # r10 = authTag - mov arg9, %r11 # r11 = auth_tag_len + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 - cmp $16, %r11 - je _T_16\@ + vpsrldq $4, \T2, \T1 # shift-R T1 1 DW - cmp $8, %r11 - jl _T_4\@ + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### -_T_8\@: - vmovq %xmm9, %rax - mov %rax, (%r10) - add $8, %r10 - sub $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - cmp $0, %r11 - je _return_T_done\@ -_T_4\@: - vmovd %xmm9, %eax - mov %eax, (%r10) - add $4, %r10 - sub $4, %r11 - vpsrldq $4, %xmm9, %xmm9 - cmp $0, %r11 - je _return_T_done\@ -_T_123\@: - vmovd %xmm9, %eax - cmp $2, %r11 - jl _T_1\@ - mov %ax, (%r10) - cmp $2, %r11 - je _return_T_done\@ - add $2, %r10 - sar $16, %eax -_T_1\@: - mov %al, (%r10) - jmp _return_T_done\@ -_T_16\@: - vmovdqu %xmm9, (%r10) + #second phase of the reduction + vpsrld $1, \T7, \T2 # packed left shifting >> 1 + vpsrld $2, \T7, \T3 # packed left shifting >> 2 + vpsrld $7, \T7, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 -_return_T_done\@: - mov %r14, %rsp + vpxor \T1, \T2, \T2 + vpxor \T2, \T7, \T7 + vpxor \T7, \T6, \T6 # the result is in T6 - pop %r15 - pop %r14 - pop %r13 - pop %r12 .endm - ############################################################# #void aesni_gcm_precomp_avx_gen2 # (gcm_data *my_ctx_data, @@ -1593,7 +1591,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen2) # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### ENTRY(aesni_gcm_enc_avx_gen2) - GCM_ENC_DEC_AVX ENC + GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX ENC ret ENDPROC(aesni_gcm_enc_avx_gen2) @@ -1614,7 +1612,7 @@ ENDPROC(aesni_gcm_enc_avx_gen2) # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### ENTRY(aesni_gcm_dec_avx_gen2) - GCM_ENC_DEC_AVX DEC + GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX DEC ret ENDPROC(aesni_gcm_dec_avx_gen2) #endif /* CONFIG_AS_AVX */ @@ -2378,477 +2376,164 @@ _initial_blocks_done\@: vmovdqa HashKey_7(arg1), \T5 vpshufd $0b01001110, \XMM2, \T2 vpshufd $0b01001110, \T5, \T3 - vpxor \XMM2, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM2, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM2, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqa HashKey_6(arg1), \T5 - vpshufd $0b01001110, \XMM3, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM3, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM3, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM3, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqa HashKey_5(arg1), \T5 - vpshufd $0b01001110, \XMM4, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM4, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM4, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM4, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqa HashKey_4(arg1), \T5 - vpshufd $0b01001110, \XMM5, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM5, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM5, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM5, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqa HashKey_3(arg1), \T5 - vpshufd $0b01001110, \XMM6, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM6, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM6, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM6, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqa HashKey_2(arg1), \T5 - vpshufd $0b01001110, \XMM7, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM7, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM7, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM7, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqa HashKey(arg1), \T5 - vpshufd $0b01001110, \XMM8, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM8, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM8, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM8, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - vpxor \T6, \XMM1, \XMM1 - vpxor \T7, \XMM1, \T2 - - - - - vpslldq $8, \T2, \T4 - vpsrldq $8, \T2, \T2 - - vpxor \T4, \T7, \T7 - vpxor \T2, \T6, \T6 # holds the result of the - # accumulated carry-less multiplications - - ####################################################################### - #first phase of the reduction - vmovdqa POLY2(%rip), \T3 - - vpclmulqdq $0x01, \T7, \T3, \T2 - vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs - - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - - - #second phase of the reduction - vpclmulqdq $0x00, \T7, \T3, \T2 - vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) - - vpclmulqdq $0x10, \T7, \T3, \T4 - vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) - - vpxor \T2, \T4, \T4 # second phase of the reduction complete - ####################################################################### - vpxor \T4, \T6, \T6 # the result is in T6 -.endm - - - -# combined for GCM encrypt and decrypt functions -# clobbering all xmm registers -# clobbering r10, r11, r12, r13, r14, r15 -.macro GCM_ENC_DEC_AVX2 ENC_DEC - - #the number of pushes must equal STACK_OFFSET - push %r12 - push %r13 - push %r14 - push %r15 - - mov %rsp, %r14 - - - - - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes - - - vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey - - mov arg4, %r13 # save the number of bytes of plaintext/ciphertext - and $-16, %r13 # r13 = r13 - (r13 mod 16) - - mov %r13, %r12 - shr $4, %r12 - and $7, %r12 - jz _initial_num_blocks_is_0\@ - - cmp $7, %r12 - je _initial_num_blocks_is_7\@ - cmp $6, %r12 - je _initial_num_blocks_is_6\@ - cmp $5, %r12 - je _initial_num_blocks_is_5\@ - cmp $4, %r12 - je _initial_num_blocks_is_4\@ - cmp $3, %r12 - je _initial_num_blocks_is_3\@ - cmp $2, %r12 - je _initial_num_blocks_is_2\@ - - jmp _initial_num_blocks_is_1\@ - -_initial_num_blocks_is_7\@: - INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*7, %r13 - jmp _initial_blocks_encrypted\@ - -_initial_num_blocks_is_6\@: - INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*6, %r13 - jmp _initial_blocks_encrypted\@ - -_initial_num_blocks_is_5\@: - INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*5, %r13 - jmp _initial_blocks_encrypted\@ - -_initial_num_blocks_is_4\@: - INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*4, %r13 - jmp _initial_blocks_encrypted\@ - -_initial_num_blocks_is_3\@: - INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*3, %r13 - jmp _initial_blocks_encrypted\@ - -_initial_num_blocks_is_2\@: - INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*2, %r13 - jmp _initial_blocks_encrypted\@ - -_initial_num_blocks_is_1\@: - INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*1, %r13 - jmp _initial_blocks_encrypted\@ - -_initial_num_blocks_is_0\@: - INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - - -_initial_blocks_encrypted\@: - cmp $0, %r13 - je _zero_cipher_left\@ - - sub $128, %r13 - je _eight_cipher_left\@ - - - + vpxor \XMM2, \T2, \T2 + vpxor \T5, \T3, \T3 - vmovd %xmm9, %r15d - and $255, %r15d - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpclmulqdq $0x11, \T5, \XMM2, \T4 + vpxor \T4, \T6, \T6 + vpclmulqdq $0x00, \T5, \XMM2, \T4 + vpxor \T4, \T7, \T7 -_encrypt_by_8_new\@: - cmp $(255-8), %r15d - jg _encrypt_by_8\@ + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + ###################### - add $8, %r15b - GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC - add $128, %r11 - sub $128, %r13 - jne _encrypt_by_8_new\@ + vmovdqa HashKey_6(arg1), \T5 + vpshufd $0b01001110, \XMM3, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM3, \T2, \T2 + vpxor \T5, \T3, \T3 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - jmp _eight_cipher_left\@ + vpclmulqdq $0x11, \T5, \XMM3, \T4 + vpxor \T4, \T6, \T6 -_encrypt_by_8\@: - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $8, %r15b - GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $128, %r11 - sub $128, %r13 - jne _encrypt_by_8_new\@ + vpclmulqdq $0x00, \T5, \XMM3, \T4 + vpxor \T4, \T7, \T7 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + ###################### + vmovdqa HashKey_5(arg1), \T5 + vpshufd $0b01001110, \XMM4, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM4, \T2, \T2 + vpxor \T5, \T3, \T3 -_eight_cipher_left\@: - GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 + vpclmulqdq $0x11, \T5, \XMM4, \T4 + vpxor \T4, \T6, \T6 + vpclmulqdq $0x00, \T5, \XMM4, \T4 + vpxor \T4, \T7, \T7 -_zero_cipher_left\@: - cmp $16, arg4 - jl _only_less_than_16\@ + vpclmulqdq $0x00, \T3, \T2, \T2 - mov arg4, %r13 - and $15, %r13 # r13 = (arg4 mod 16) + vpxor \T2, \XMM1, \XMM1 - je _multiple_of_16_bytes\@ + ###################### - # handle the last <16 Byte block seperately + vmovdqa HashKey_4(arg1), \T5 + vpshufd $0b01001110, \XMM5, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM5, \T2, \T2 + vpxor \T5, \T3, \T3 + vpclmulqdq $0x11, \T5, \XMM5, \T4 + vpxor \T4, \T6, \T6 - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + vpclmulqdq $0x00, \T5, \XMM5, \T4 + vpxor \T4, \T7, \T7 - sub $16, %r11 - add %r13, %r11 - vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block + vpclmulqdq $0x00, \T3, \T2, \T2 - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer - # to be able to shift 16-r13 bytes - # (r13 is the number of bytes in plaintext mod 16) - vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask - vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes - jmp _final_ghash_mul\@ + vpxor \T2, \XMM1, \XMM1 -_only_less_than_16\@: - # check for 0 length - mov arg4, %r13 - and $15, %r13 # r13 = (arg4 mod 16) + ###################### - je _multiple_of_16_bytes\@ + vmovdqa HashKey_3(arg1), \T5 + vpshufd $0b01001110, \XMM6, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM6, \T2, \T2 + vpxor \T5, \T3, \T3 - # handle the last <16 Byte block seperately + vpclmulqdq $0x11, \T5, \XMM6, \T4 + vpxor \T4, \T6, \T6 + vpclmulqdq $0x00, \T5, \XMM6, \T4 + vpxor \T4, \T7, \T7 - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer to be - # able to shift 16-r13 bytes (r13 is the - # number of bytes in plaintext mod 16) + ###################### -_get_last_16_byte_loop\@: - movb (arg3, %r11), %al - movb %al, TMP1 (%rsp , %r11) - add $1, %r11 - cmp %r13, %r11 - jne _get_last_16_byte_loop\@ + vmovdqa HashKey_2(arg1), \T5 + vpshufd $0b01001110, \XMM7, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM7, \T2, \T2 + vpxor \T5, \T3, \T3 - vmovdqu TMP1(%rsp), %xmm1 + vpclmulqdq $0x11, \T5, \XMM7, \T4 + vpxor \T4, \T6, \T6 - sub $16, %r11 + vpclmulqdq $0x00, \T5, \XMM7, \T4 + vpxor \T4, \T7, \T7 -_final_ghash_mul\@: - .if \ENC_DEC == DEC - vmovdqa %xmm1, %xmm2 - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm2, %xmm2 - vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm14, %xmm14 - #GHASH computation for the last <16 Byte block - GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - sub %r13, %r11 - add $16, %r11 - .else - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - vpxor %xmm9, %xmm14, %xmm14 - #GHASH computation for the last <16 Byte block - GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - sub %r13, %r11 - add $16, %r11 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext - .endif + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 - ############################# - # output r13 Bytes - vmovq %xmm9, %rax - cmp $8, %r13 - jle _less_than_8_bytes_left\@ + ###################### - mov %rax, (arg2 , %r11) - add $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - vmovq %xmm9, %rax - sub $8, %r13 + vmovdqa HashKey(arg1), \T5 + vpshufd $0b01001110, \XMM8, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM8, \T2, \T2 + vpxor \T5, \T3, \T3 -_less_than_8_bytes_left\@: - movb %al, (arg2 , %r11) - add $1, %r11 - shr $8, %rax - sub $1, %r13 - jne _less_than_8_bytes_left\@ - ############################# + vpclmulqdq $0x11, \T5, \XMM8, \T4 + vpxor \T4, \T6, \T6 -_multiple_of_16_bytes\@: - mov arg7, %r12 # r12 = aadLen (number of bytes) - shl $3, %r12 # convert into number of bits - vmovd %r12d, %xmm15 # len(A) in xmm15 + vpclmulqdq $0x00, \T5, \XMM8, \T4 + vpxor \T4, \T7, \T7 - shl $3, arg4 # len(C) in bits (*128) - vmovq arg4, %xmm1 - vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 - vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) + vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor %xmm15, %xmm14, %xmm14 - GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation - vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap + vpxor \T2, \XMM1, \XMM1 + vpxor \T6, \XMM1, \XMM1 + vpxor \T7, \XMM1, \T2 - mov arg5, %rax # rax = *Y0 - vmovdqu (%rax), %xmm9 # xmm9 = Y0 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) - vpxor %xmm14, %xmm9, %xmm9 + vpslldq $8, \T2, \T4 + vpsrldq $8, \T2, \T2 + vpxor \T4, \T7, \T7 + vpxor \T2, \T6, \T6 # holds the result of the + # accumulated carry-less multiplications -_return_T\@: - mov arg8, %r10 # r10 = authTag - mov arg9, %r11 # r11 = auth_tag_len + ####################################################################### + #first phase of the reduction + vmovdqa POLY2(%rip), \T3 - cmp $16, %r11 - je _T_16\@ + vpclmulqdq $0x01, \T7, \T3, \T2 + vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs - cmp $8, %r11 - jl _T_4\@ + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### -_T_8\@: - vmovq %xmm9, %rax - mov %rax, (%r10) - add $8, %r10 - sub $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - cmp $0, %r11 - je _return_T_done\@ -_T_4\@: - vmovd %xmm9, %eax - mov %eax, (%r10) - add $4, %r10 - sub $4, %r11 - vpsrldq $4, %xmm9, %xmm9 - cmp $0, %r11 - je _return_T_done\@ -_T_123\@: - vmovd %xmm9, %eax - cmp $2, %r11 - jl _T_1\@ - mov %ax, (%r10) - cmp $2, %r11 - je _return_T_done\@ - add $2, %r10 - sar $16, %eax -_T_1\@: - mov %al, (%r10) - jmp _return_T_done\@ -_T_16\@: - vmovdqu %xmm9, (%r10) + #second phase of the reduction + vpclmulqdq $0x00, \T7, \T3, \T2 + vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) -_return_T_done\@: - mov %r14, %rsp + vpclmulqdq $0x10, \T7, \T3, \T4 + vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) - pop %r15 - pop %r14 - pop %r13 - pop %r12 + vpxor \T2, \T4, \T4 # second phase of the reduction complete + ####################################################################### + vpxor \T4, \T6, \T6 # the result is in T6 .endm + ############################################################# #void aesni_gcm_precomp_avx_gen4 # (gcm_data *my_ctx_data, @@ -2918,7 +2603,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen4) # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### ENTRY(aesni_gcm_enc_avx_gen4) - GCM_ENC_DEC_AVX2 ENC + GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 ENC ret ENDPROC(aesni_gcm_enc_avx_gen4) @@ -2939,7 +2624,7 @@ ENDPROC(aesni_gcm_enc_avx_gen4) # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### ENTRY(aesni_gcm_dec_avx_gen4) - GCM_ENC_DEC_AVX2 DEC + GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 DEC ret ENDPROC(aesni_gcm_dec_avx_gen4) -- cgit v1.2.3-59-g8ed1b From de85fc46b1037099c78d2fec08a662079e568d62 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:57:00 +0000 Subject: crypto: aesni - Introduce gcm_context_data Add the gcm_context_data structure to the avx asm routines. This will be necessary to support both 256 bit keys and scatter/gather. The pre-computed HashKeys are now stored in the gcm_context_data struct, which is expanded to hold the greater number of hashkeys necessary for avx. Loads and stores to the new struct are always done unlaligned to avoid compiler issues, see e5b954e8 "Use unaligned loads from gcm_context_data" Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 378 +++++++++++++++---------------- arch/x86/crypto/aesni-intel_glue.c | 58 +++-- 2 files changed, 215 insertions(+), 221 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 318135a77975..284f1b8b88fc 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -182,43 +182,22 @@ aad_shift_arr: .text -##define the fields of the gcm aes context -#{ -# u8 expanded_keys[16*11] store expanded keys -# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here -# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here -# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here -# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here -# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here -# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here -# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here -# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here -# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes) -#} gcm_ctx# - -HashKey = 16*11 # store HashKey <<1 mod poly here -HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here -HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here -HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here -HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here -HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here -HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here -HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here -HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) -HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) -HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) -HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) -HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) -HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) -HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) -HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) +HashKey = 16*6 # store HashKey <<1 mod poly here +HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here +HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here +HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here +HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here +HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here +HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here +HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here +HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) +HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) +HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) +HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) +HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) +HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) +HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) +HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) #define arg1 %rdi #define arg2 %rsi @@ -229,6 +208,7 @@ HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsu #define arg7 STACK_OFFSET+8*1(%r14) #define arg8 STACK_OFFSET+8*2(%r14) #define arg9 STACK_OFFSET+8*3(%r14) +#define arg10 STACK_OFFSET+8*4(%r14) i = 0 j = 0 @@ -300,9 +280,9 @@ VARIABLE_OFFSET = 16*8 and $~63, %rsp # align rsp to 64 bytes - vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey + vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey - mov arg4, %r13 # save the number of bytes of plaintext/ciphertext + mov arg5, %r13 # save the number of bytes of plaintext/ciphertext and $-16, %r13 # r13 = r13 - (r13 mod 16) mov %r13, %r12 @@ -413,11 +393,11 @@ _eight_cipher_left\@: _zero_cipher_left\@: - cmp $16, arg4 + cmp $16, arg5 jl _only_less_than_16\@ - mov arg4, %r13 - and $15, %r13 # r13 = (arg4 mod 16) + mov arg5, %r13 + and $15, %r13 # r13 = (arg5 mod 16) je _multiple_of_16_bytes\@ @@ -430,7 +410,7 @@ _zero_cipher_left\@: sub $16, %r11 add %r13, %r11 - vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block + vmovdqu (arg4, %r11), %xmm1 # receive the last <16 Byte block lea SHIFT_MASK+16(%rip), %r12 sub %r13, %r12 # adjust the shuffle mask pointer to be @@ -442,8 +422,8 @@ _zero_cipher_left\@: _only_less_than_16\@: # check for 0 length - mov arg4, %r13 - and $15, %r13 # r13 = (arg4 mod 16) + mov arg5, %r13 + and $15, %r13 # r13 = (arg5 mod 16) je _multiple_of_16_bytes\@ @@ -461,7 +441,7 @@ _only_less_than_16\@: # number of bytes in plaintext mod 16) _get_last_16_byte_loop\@: - movb (arg3, %r11), %al + movb (arg4, %r11), %al movb %al, TMP1 (%rsp , %r11) add $1, %r11 cmp %r13, %r11 @@ -506,14 +486,14 @@ _final_ghash_mul\@: cmp $8, %r13 jle _less_than_8_bytes_left\@ - mov %rax, (arg2 , %r11) + mov %rax, (arg3 , %r11) add $8, %r11 vpsrldq $8, %xmm9, %xmm9 vmovq %xmm9, %rax sub $8, %r13 _less_than_8_bytes_left\@: - movb %al, (arg2 , %r11) + movb %al, (arg3 , %r11) add $1, %r11 shr $8, %rax sub $1, %r13 @@ -521,12 +501,12 @@ _less_than_8_bytes_left\@: ############################# _multiple_of_16_bytes\@: - mov arg7, %r12 # r12 = aadLen (number of bytes) + mov arg8, %r12 # r12 = aadLen (number of bytes) shl $3, %r12 # convert into number of bits vmovd %r12d, %xmm15 # len(A) in xmm15 - shl $3, arg4 # len(C) in bits (*128) - vmovq arg4, %xmm1 + shl $3, arg5 # len(C) in bits (*128) + vmovq arg5, %xmm1 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) @@ -534,7 +514,7 @@ _multiple_of_16_bytes\@: \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap - mov arg5, %rax # rax = *Y0 + mov arg6, %rax # rax = *Y0 vmovdqu (%rax), %xmm9 # xmm9 = Y0 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) @@ -544,8 +524,8 @@ _multiple_of_16_bytes\@: _return_T\@: - mov arg8, %r10 # r10 = authTag - mov arg9, %r11 # r11 = auth_tag_len + mov arg9, %r10 # r10 = authTag + mov arg10, %r11 # r11 = auth_tag_len cmp $16, %r11 je _T_16\@ @@ -655,49 +635,49 @@ _return_T_done\@: vpshufd $0b01001110, \T5, \T1 vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_k(arg1) + vmovdqu \T1, HashKey_k(arg2) GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly - vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly + vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly vpshufd $0b01001110, \T5, \T1 vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_2_k(arg1) + vmovdqu \T1, HashKey_2_k(arg2) GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly - vmovdqa \T5, HashKey_3(arg1) + vmovdqu \T5, HashKey_3(arg2) vpshufd $0b01001110, \T5, \T1 vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_3_k(arg1) + vmovdqu \T1, HashKey_3_k(arg2) GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly - vmovdqa \T5, HashKey_4(arg1) + vmovdqu \T5, HashKey_4(arg2) vpshufd $0b01001110, \T5, \T1 vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_4_k(arg1) + vmovdqu \T1, HashKey_4_k(arg2) GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly - vmovdqa \T5, HashKey_5(arg1) + vmovdqu \T5, HashKey_5(arg2) vpshufd $0b01001110, \T5, \T1 vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_5_k(arg1) + vmovdqu \T1, HashKey_5_k(arg2) GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly - vmovdqa \T5, HashKey_6(arg1) + vmovdqu \T5, HashKey_6(arg2) vpshufd $0b01001110, \T5, \T1 vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_6_k(arg1) + vmovdqu \T1, HashKey_6_k(arg2) GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly - vmovdqa \T5, HashKey_7(arg1) + vmovdqu \T5, HashKey_7(arg2) vpshufd $0b01001110, \T5, \T1 vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_7_k(arg1) + vmovdqu \T1, HashKey_7_k(arg2) GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly - vmovdqa \T5, HashKey_8(arg1) + vmovdqu \T5, HashKey_8(arg2) vpshufd $0b01001110, \T5, \T1 vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_8_k(arg1) + vmovdqu \T1, HashKey_8_k(arg2) .endm @@ -706,15 +686,15 @@ _return_T_done\@: ## num_initial_blocks = b mod 4# ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext ## r10, r11, r12, rax are clobbered -## arg1, arg2, arg3, r14 are used as a pointer only, not modified +## arg1, arg3, arg4, r14 are used as a pointer only, not modified .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC i = (8-\num_initial_blocks) j = 0 setreg - mov arg6, %r10 # r10 = AAD - mov arg7, %r12 # r12 = aadLen + mov arg7, %r10 # r10 = AAD + mov arg8, %r12 # r12 = aadLen mov %r12, %r11 @@ -780,7 +760,7 @@ _get_AAD_done\@: xor %r11d, %r11d # start AES for num_initial_blocks blocks - mov arg5, %rax # rax = *Y0 + mov arg6, %rax # rax = *Y0 vmovdqu (%rax), \CTR # CTR = Y0 vpshufb SHUF_MASK(%rip), \CTR, \CTR @@ -833,9 +813,9 @@ _get_AAD_done\@: i = (9-\num_initial_blocks) setreg .rep \num_initial_blocks - vmovdqu (arg3, %r11), \T1 + vmovdqu (arg4, %r11), \T1 vpxor \T1, reg_i, reg_i - vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks + vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks add $16, %r11 .if \ENC_DEC == DEC vmovdqa \T1, reg_i @@ -936,58 +916,58 @@ _get_AAD_done\@: vaesenclast \T_key, \XMM7, \XMM7 vaesenclast \T_key, \XMM8, \XMM8 - vmovdqu (arg3, %r11), \T1 + vmovdqu (arg4, %r11), \T1 vpxor \T1, \XMM1, \XMM1 - vmovdqu \XMM1, (arg2 , %r11) + vmovdqu \XMM1, (arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM1 .endif - vmovdqu 16*1(arg3, %r11), \T1 + vmovdqu 16*1(arg4, %r11), \T1 vpxor \T1, \XMM2, \XMM2 - vmovdqu \XMM2, 16*1(arg2 , %r11) + vmovdqu \XMM2, 16*1(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM2 .endif - vmovdqu 16*2(arg3, %r11), \T1 + vmovdqu 16*2(arg4, %r11), \T1 vpxor \T1, \XMM3, \XMM3 - vmovdqu \XMM3, 16*2(arg2 , %r11) + vmovdqu \XMM3, 16*2(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM3 .endif - vmovdqu 16*3(arg3, %r11), \T1 + vmovdqu 16*3(arg4, %r11), \T1 vpxor \T1, \XMM4, \XMM4 - vmovdqu \XMM4, 16*3(arg2 , %r11) + vmovdqu \XMM4, 16*3(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM4 .endif - vmovdqu 16*4(arg3, %r11), \T1 + vmovdqu 16*4(arg4, %r11), \T1 vpxor \T1, \XMM5, \XMM5 - vmovdqu \XMM5, 16*4(arg2 , %r11) + vmovdqu \XMM5, 16*4(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM5 .endif - vmovdqu 16*5(arg3, %r11), \T1 + vmovdqu 16*5(arg4, %r11), \T1 vpxor \T1, \XMM6, \XMM6 - vmovdqu \XMM6, 16*5(arg2 , %r11) + vmovdqu \XMM6, 16*5(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM6 .endif - vmovdqu 16*6(arg3, %r11), \T1 + vmovdqu 16*6(arg4, %r11), \T1 vpxor \T1, \XMM7, \XMM7 - vmovdqu \XMM7, 16*6(arg2 , %r11) + vmovdqu \XMM7, 16*6(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM7 .endif - vmovdqu 16*7(arg3, %r11), \T1 + vmovdqu 16*7(arg4, %r11), \T1 vpxor \T1, \XMM8, \XMM8 - vmovdqu \XMM8, 16*7(arg2 , %r11) + vmovdqu \XMM8, 16*7(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM8 .endif @@ -1012,7 +992,7 @@ _initial_blocks_done\@: # encrypt 8 blocks at a time # ghash the 8 previously encrypted ciphertext blocks -# arg1, arg2, arg3 are used as pointers only, not modified +# arg1, arg3, arg4 are used as pointers only, not modified # r11 is the data offset value .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC @@ -1098,14 +1078,14 @@ _initial_blocks_done\@: ####################################################################### - vmovdqa HashKey_8(arg1), \T5 + vmovdqu HashKey_8(arg2), \T5 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 vpshufd $0b01001110, \T2, \T6 vpxor \T2, \T6, \T6 - vmovdqa HashKey_8_k(arg1), \T5 + vmovdqu HashKey_8_k(arg2), \T5 vpclmulqdq $0x00, \T5, \T6, \T6 vmovdqu 16*3(arg1), \T1 @@ -1119,7 +1099,7 @@ _initial_blocks_done\@: vaesenc \T1, \XMM8, \XMM8 vmovdqa TMP2(%rsp), \T1 - vmovdqa HashKey_7(arg1), \T5 + vmovdqu HashKey_7(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 vpclmulqdq $0x00, \T5, \T1, \T3 @@ -1127,7 +1107,7 @@ _initial_blocks_done\@: vpshufd $0b01001110, \T1, \T3 vpxor \T1, \T3, \T3 - vmovdqa HashKey_7_k(arg1), \T5 + vmovdqu HashKey_7_k(arg2), \T5 vpclmulqdq $0x10, \T5, \T3, \T3 vpxor \T3, \T6, \T6 @@ -1144,7 +1124,7 @@ _initial_blocks_done\@: ####################################################################### vmovdqa TMP3(%rsp), \T1 - vmovdqa HashKey_6(arg1), \T5 + vmovdqu HashKey_6(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 vpclmulqdq $0x00, \T5, \T1, \T3 @@ -1152,7 +1132,7 @@ _initial_blocks_done\@: vpshufd $0b01001110, \T1, \T3 vpxor \T1, \T3, \T3 - vmovdqa HashKey_6_k(arg1), \T5 + vmovdqu HashKey_6_k(arg2), \T5 vpclmulqdq $0x10, \T5, \T3, \T3 vpxor \T3, \T6, \T6 @@ -1167,7 +1147,7 @@ _initial_blocks_done\@: vaesenc \T1, \XMM8, \XMM8 vmovdqa TMP4(%rsp), \T1 - vmovdqa HashKey_5(arg1), \T5 + vmovdqu HashKey_5(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 vpclmulqdq $0x00, \T5, \T1, \T3 @@ -1175,7 +1155,7 @@ _initial_blocks_done\@: vpshufd $0b01001110, \T1, \T3 vpxor \T1, \T3, \T3 - vmovdqa HashKey_5_k(arg1), \T5 + vmovdqu HashKey_5_k(arg2), \T5 vpclmulqdq $0x10, \T5, \T3, \T3 vpxor \T3, \T6, \T6 @@ -1191,7 +1171,7 @@ _initial_blocks_done\@: vmovdqa TMP5(%rsp), \T1 - vmovdqa HashKey_4(arg1), \T5 + vmovdqu HashKey_4(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 vpclmulqdq $0x00, \T5, \T1, \T3 @@ -1199,7 +1179,7 @@ _initial_blocks_done\@: vpshufd $0b01001110, \T1, \T3 vpxor \T1, \T3, \T3 - vmovdqa HashKey_4_k(arg1), \T5 + vmovdqu HashKey_4_k(arg2), \T5 vpclmulqdq $0x10, \T5, \T3, \T3 vpxor \T3, \T6, \T6 @@ -1214,7 +1194,7 @@ _initial_blocks_done\@: vaesenc \T1, \XMM8, \XMM8 vmovdqa TMP6(%rsp), \T1 - vmovdqa HashKey_3(arg1), \T5 + vmovdqu HashKey_3(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 vpclmulqdq $0x00, \T5, \T1, \T3 @@ -1222,7 +1202,7 @@ _initial_blocks_done\@: vpshufd $0b01001110, \T1, \T3 vpxor \T1, \T3, \T3 - vmovdqa HashKey_3_k(arg1), \T5 + vmovdqu HashKey_3_k(arg2), \T5 vpclmulqdq $0x10, \T5, \T3, \T3 vpxor \T3, \T6, \T6 @@ -1238,7 +1218,7 @@ _initial_blocks_done\@: vaesenc \T1, \XMM8, \XMM8 vmovdqa TMP7(%rsp), \T1 - vmovdqa HashKey_2(arg1), \T5 + vmovdqu HashKey_2(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 vpclmulqdq $0x00, \T5, \T1, \T3 @@ -1246,7 +1226,7 @@ _initial_blocks_done\@: vpshufd $0b01001110, \T1, \T3 vpxor \T1, \T3, \T3 - vmovdqa HashKey_2_k(arg1), \T5 + vmovdqu HashKey_2_k(arg2), \T5 vpclmulqdq $0x10, \T5, \T3, \T3 vpxor \T3, \T6, \T6 @@ -1263,7 +1243,7 @@ _initial_blocks_done\@: vaesenc \T5, \XMM8, \XMM8 vmovdqa TMP8(%rsp), \T1 - vmovdqa HashKey(arg1), \T5 + vmovdqu HashKey(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 vpclmulqdq $0x00, \T5, \T1, \T3 @@ -1271,7 +1251,7 @@ _initial_blocks_done\@: vpshufd $0b01001110, \T1, \T3 vpxor \T1, \T3, \T3 - vmovdqa HashKey_k(arg1), \T5 + vmovdqu HashKey_k(arg2), \T5 vpclmulqdq $0x10, \T5, \T3, \T3 vpxor \T3, \T6, \T6 @@ -1284,13 +1264,13 @@ _initial_blocks_done\@: j = 1 setreg .rep 8 - vpxor 16*i(arg3, %r11), \T5, \T2 + vpxor 16*i(arg4, %r11), \T5, \T2 .if \ENC_DEC == ENC vaesenclast \T2, reg_j, reg_j .else vaesenclast \T2, reg_j, \T3 - vmovdqu 16*i(arg3, %r11), reg_j - vmovdqu \T3, 16*i(arg2, %r11) + vmovdqu 16*i(arg4, %r11), reg_j + vmovdqu \T3, 16*i(arg3, %r11) .endif i = (i+1) j = (j+1) @@ -1322,14 +1302,14 @@ _initial_blocks_done\@: vpxor \T2, \T7, \T7 # first phase of the reduction complete ####################################################################### .if \ENC_DEC == ENC - vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer .endif ####################################################################### @@ -1370,25 +1350,25 @@ _initial_blocks_done\@: vpshufd $0b01001110, \XMM1, \T2 vpxor \XMM1, \T2, \T2 - vmovdqa HashKey_8(arg1), \T5 + vmovdqu HashKey_8(arg2), \T5 vpclmulqdq $0x11, \T5, \XMM1, \T6 vpclmulqdq $0x00, \T5, \XMM1, \T7 - vmovdqa HashKey_8_k(arg1), \T3 + vmovdqu HashKey_8_k(arg2), \T3 vpclmulqdq $0x00, \T3, \T2, \XMM1 ###################### vpshufd $0b01001110, \XMM2, \T2 vpxor \XMM2, \T2, \T2 - vmovdqa HashKey_7(arg1), \T5 + vmovdqu HashKey_7(arg2), \T5 vpclmulqdq $0x11, \T5, \XMM2, \T4 vpxor \T4, \T6, \T6 vpclmulqdq $0x00, \T5, \XMM2, \T4 vpxor \T4, \T7, \T7 - vmovdqa HashKey_7_k(arg1), \T3 + vmovdqu HashKey_7_k(arg2), \T3 vpclmulqdq $0x00, \T3, \T2, \T2 vpxor \T2, \XMM1, \XMM1 @@ -1396,14 +1376,14 @@ _initial_blocks_done\@: vpshufd $0b01001110, \XMM3, \T2 vpxor \XMM3, \T2, \T2 - vmovdqa HashKey_6(arg1), \T5 + vmovdqu HashKey_6(arg2), \T5 vpclmulqdq $0x11, \T5, \XMM3, \T4 vpxor \T4, \T6, \T6 vpclmulqdq $0x00, \T5, \XMM3, \T4 vpxor \T4, \T7, \T7 - vmovdqa HashKey_6_k(arg1), \T3 + vmovdqu HashKey_6_k(arg2), \T3 vpclmulqdq $0x00, \T3, \T2, \T2 vpxor \T2, \XMM1, \XMM1 @@ -1411,14 +1391,14 @@ _initial_blocks_done\@: vpshufd $0b01001110, \XMM4, \T2 vpxor \XMM4, \T2, \T2 - vmovdqa HashKey_5(arg1), \T5 + vmovdqu HashKey_5(arg2), \T5 vpclmulqdq $0x11, \T5, \XMM4, \T4 vpxor \T4, \T6, \T6 vpclmulqdq $0x00, \T5, \XMM4, \T4 vpxor \T4, \T7, \T7 - vmovdqa HashKey_5_k(arg1), \T3 + vmovdqu HashKey_5_k(arg2), \T3 vpclmulqdq $0x00, \T3, \T2, \T2 vpxor \T2, \XMM1, \XMM1 @@ -1426,14 +1406,14 @@ _initial_blocks_done\@: vpshufd $0b01001110, \XMM5, \T2 vpxor \XMM5, \T2, \T2 - vmovdqa HashKey_4(arg1), \T5 + vmovdqu HashKey_4(arg2), \T5 vpclmulqdq $0x11, \T5, \XMM5, \T4 vpxor \T4, \T6, \T6 vpclmulqdq $0x00, \T5, \XMM5, \T4 vpxor \T4, \T7, \T7 - vmovdqa HashKey_4_k(arg1), \T3 + vmovdqu HashKey_4_k(arg2), \T3 vpclmulqdq $0x00, \T3, \T2, \T2 vpxor \T2, \XMM1, \XMM1 @@ -1441,14 +1421,14 @@ _initial_blocks_done\@: vpshufd $0b01001110, \XMM6, \T2 vpxor \XMM6, \T2, \T2 - vmovdqa HashKey_3(arg1), \T5 + vmovdqu HashKey_3(arg2), \T5 vpclmulqdq $0x11, \T5, \XMM6, \T4 vpxor \T4, \T6, \T6 vpclmulqdq $0x00, \T5, \XMM6, \T4 vpxor \T4, \T7, \T7 - vmovdqa HashKey_3_k(arg1), \T3 + vmovdqu HashKey_3_k(arg2), \T3 vpclmulqdq $0x00, \T3, \T2, \T2 vpxor \T2, \XMM1, \XMM1 @@ -1456,14 +1436,14 @@ _initial_blocks_done\@: vpshufd $0b01001110, \XMM7, \T2 vpxor \XMM7, \T2, \T2 - vmovdqa HashKey_2(arg1), \T5 + vmovdqu HashKey_2(arg2), \T5 vpclmulqdq $0x11, \T5, \XMM7, \T4 vpxor \T4, \T6, \T6 vpclmulqdq $0x00, \T5, \XMM7, \T4 vpxor \T4, \T7, \T7 - vmovdqa HashKey_2_k(arg1), \T3 + vmovdqu HashKey_2_k(arg2), \T3 vpclmulqdq $0x00, \T3, \T2, \T2 vpxor \T2, \XMM1, \XMM1 @@ -1471,14 +1451,14 @@ _initial_blocks_done\@: vpshufd $0b01001110, \XMM8, \T2 vpxor \XMM8, \T2, \T2 - vmovdqa HashKey(arg1), \T5 + vmovdqu HashKey(arg2), \T5 vpclmulqdq $0x11, \T5, \XMM8, \T4 vpxor \T4, \T6, \T6 vpclmulqdq $0x00, \T5, \XMM8, \T4 vpxor \T4, \T7, \T7 - vmovdqa HashKey_k(arg1), \T3 + vmovdqu HashKey_k(arg2), \T3 vpclmulqdq $0x00, \T3, \T2, \T2 vpxor \T2, \XMM1, \XMM1 @@ -1527,6 +1507,7 @@ _initial_blocks_done\@: ############################################################# #void aesni_gcm_precomp_avx_gen2 # (gcm_data *my_ctx_data, +# gcm_context_data *data, # u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ ############################################################# ENTRY(aesni_gcm_precomp_avx_gen2) @@ -1543,7 +1524,7 @@ ENTRY(aesni_gcm_precomp_avx_gen2) sub $VARIABLE_OFFSET, %rsp and $~63, %rsp # align rsp to 64 bytes - vmovdqu (arg2), %xmm6 # xmm6 = HashKey + vmovdqu (arg3), %xmm6 # xmm6 = HashKey vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey @@ -1560,7 +1541,7 @@ ENTRY(aesni_gcm_precomp_avx_gen2) vpand POLY(%rip), %xmm2, %xmm2 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly ####################################################################### - vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly + vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 @@ -1577,6 +1558,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen2) ############################################################################### #void aesni_gcm_enc_avx_gen2( # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ # const u8 *in, /* Plaintext input */ # u64 plaintext_len, /* Length of data in Bytes for encryption. */ @@ -1598,6 +1580,7 @@ ENDPROC(aesni_gcm_enc_avx_gen2) ############################################################################### #void aesni_gcm_dec_avx_gen2( # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ # const u8 *in, /* Ciphertext input */ # u64 plaintext_len, /* Length of data in Bytes for encryption. */ @@ -1668,25 +1651,25 @@ ENDPROC(aesni_gcm_dec_avx_gen2) # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i vmovdqa \HK, \T5 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly - vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly + vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly - vmovdqa \T5, HashKey_3(arg1) + vmovdqu \T5, HashKey_3(arg2) GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly - vmovdqa \T5, HashKey_4(arg1) + vmovdqu \T5, HashKey_4(arg2) GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly - vmovdqa \T5, HashKey_5(arg1) + vmovdqu \T5, HashKey_5(arg2) GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly - vmovdqa \T5, HashKey_6(arg1) + vmovdqu \T5, HashKey_6(arg2) GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly - vmovdqa \T5, HashKey_7(arg1) + vmovdqu \T5, HashKey_7(arg2) GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly - vmovdqa \T5, HashKey_8(arg1) + vmovdqu \T5, HashKey_8(arg2) .endm @@ -1696,15 +1679,15 @@ ENDPROC(aesni_gcm_dec_avx_gen2) ## num_initial_blocks = b mod 4# ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext ## r10, r11, r12, rax are clobbered -## arg1, arg2, arg3, r14 are used as a pointer only, not modified +## arg1, arg3, arg4, r14 are used as a pointer only, not modified .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER i = (8-\num_initial_blocks) j = 0 setreg - mov arg6, %r10 # r10 = AAD - mov arg7, %r12 # r12 = aadLen + mov arg7, %r10 # r10 = AAD + mov arg8, %r12 # r12 = aadLen mov %r12, %r11 @@ -1771,7 +1754,7 @@ _get_AAD_done\@: xor %r11d, %r11d # start AES for num_initial_blocks blocks - mov arg5, %rax # rax = *Y0 + mov arg6, %rax # rax = *Y0 vmovdqu (%rax), \CTR # CTR = Y0 vpshufb SHUF_MASK(%rip), \CTR, \CTR @@ -1824,9 +1807,9 @@ _get_AAD_done\@: i = (9-\num_initial_blocks) setreg .rep \num_initial_blocks - vmovdqu (arg3, %r11), \T1 + vmovdqu (arg4, %r11), \T1 vpxor \T1, reg_i, reg_i - vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for + vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for # num_initial_blocks blocks add $16, %r11 .if \ENC_DEC == DEC @@ -1928,58 +1911,58 @@ _get_AAD_done\@: vaesenclast \T_key, \XMM7, \XMM7 vaesenclast \T_key, \XMM8, \XMM8 - vmovdqu (arg3, %r11), \T1 + vmovdqu (arg4, %r11), \T1 vpxor \T1, \XMM1, \XMM1 - vmovdqu \XMM1, (arg2 , %r11) + vmovdqu \XMM1, (arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM1 .endif - vmovdqu 16*1(arg3, %r11), \T1 + vmovdqu 16*1(arg4, %r11), \T1 vpxor \T1, \XMM2, \XMM2 - vmovdqu \XMM2, 16*1(arg2 , %r11) + vmovdqu \XMM2, 16*1(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM2 .endif - vmovdqu 16*2(arg3, %r11), \T1 + vmovdqu 16*2(arg4, %r11), \T1 vpxor \T1, \XMM3, \XMM3 - vmovdqu \XMM3, 16*2(arg2 , %r11) + vmovdqu \XMM3, 16*2(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM3 .endif - vmovdqu 16*3(arg3, %r11), \T1 + vmovdqu 16*3(arg4, %r11), \T1 vpxor \T1, \XMM4, \XMM4 - vmovdqu \XMM4, 16*3(arg2 , %r11) + vmovdqu \XMM4, 16*3(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM4 .endif - vmovdqu 16*4(arg3, %r11), \T1 + vmovdqu 16*4(arg4, %r11), \T1 vpxor \T1, \XMM5, \XMM5 - vmovdqu \XMM5, 16*4(arg2 , %r11) + vmovdqu \XMM5, 16*4(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM5 .endif - vmovdqu 16*5(arg3, %r11), \T1 + vmovdqu 16*5(arg4, %r11), \T1 vpxor \T1, \XMM6, \XMM6 - vmovdqu \XMM6, 16*5(arg2 , %r11) + vmovdqu \XMM6, 16*5(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM6 .endif - vmovdqu 16*6(arg3, %r11), \T1 + vmovdqu 16*6(arg4, %r11), \T1 vpxor \T1, \XMM7, \XMM7 - vmovdqu \XMM7, 16*6(arg2 , %r11) + vmovdqu \XMM7, 16*6(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM7 .endif - vmovdqu 16*7(arg3, %r11), \T1 + vmovdqu 16*7(arg4, %r11), \T1 vpxor \T1, \XMM8, \XMM8 - vmovdqu \XMM8, 16*7(arg2 , %r11) + vmovdqu \XMM8, 16*7(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM8 .endif @@ -2008,7 +1991,7 @@ _initial_blocks_done\@: # encrypt 8 blocks at a time # ghash the 8 previously encrypted ciphertext blocks -# arg1, arg2, arg3 are used as pointers only, not modified +# arg1, arg3, arg4 are used as pointers only, not modified # r11 is the data offset value .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC @@ -2094,7 +2077,7 @@ _initial_blocks_done\@: ####################################################################### - vmovdqa HashKey_8(arg1), \T5 + vmovdqu HashKey_8(arg2), \T5 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 @@ -2112,7 +2095,7 @@ _initial_blocks_done\@: vaesenc \T1, \XMM8, \XMM8 vmovdqa TMP2(%rsp), \T1 - vmovdqa HashKey_7(arg1), \T5 + vmovdqu HashKey_7(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 @@ -2138,7 +2121,7 @@ _initial_blocks_done\@: ####################################################################### vmovdqa TMP3(%rsp), \T1 - vmovdqa HashKey_6(arg1), \T5 + vmovdqu HashKey_6(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 @@ -2162,7 +2145,7 @@ _initial_blocks_done\@: vaesenc \T1, \XMM8, \XMM8 vmovdqa TMP4(%rsp), \T1 - vmovdqa HashKey_5(arg1), \T5 + vmovdqu HashKey_5(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 @@ -2187,7 +2170,7 @@ _initial_blocks_done\@: vmovdqa TMP5(%rsp), \T1 - vmovdqa HashKey_4(arg1), \T5 + vmovdqu HashKey_4(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 @@ -2211,7 +2194,7 @@ _initial_blocks_done\@: vaesenc \T1, \XMM8, \XMM8 vmovdqa TMP6(%rsp), \T1 - vmovdqa HashKey_3(arg1), \T5 + vmovdqu HashKey_3(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 @@ -2235,7 +2218,7 @@ _initial_blocks_done\@: vaesenc \T1, \XMM8, \XMM8 vmovdqa TMP7(%rsp), \T1 - vmovdqa HashKey_2(arg1), \T5 + vmovdqu HashKey_2(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 @@ -2262,7 +2245,7 @@ _initial_blocks_done\@: vaesenc \T5, \XMM8, \XMM8 vmovdqa TMP8(%rsp), \T1 - vmovdqa HashKey(arg1), \T5 + vmovdqu HashKey(arg2), \T5 vpclmulqdq $0x00, \T5, \T1, \T3 vpxor \T3, \T7, \T7 @@ -2283,13 +2266,13 @@ _initial_blocks_done\@: j = 1 setreg .rep 8 - vpxor 16*i(arg3, %r11), \T5, \T2 + vpxor 16*i(arg4, %r11), \T5, \T2 .if \ENC_DEC == ENC vaesenclast \T2, reg_j, reg_j .else vaesenclast \T2, reg_j, \T3 - vmovdqu 16*i(arg3, %r11), reg_j - vmovdqu \T3, 16*i(arg2, %r11) + vmovdqu 16*i(arg4, %r11), reg_j + vmovdqu \T3, 16*i(arg3, %r11) .endif i = (i+1) j = (j+1) @@ -2315,14 +2298,14 @@ _initial_blocks_done\@: vpxor \T2, \T7, \T7 # first phase of the reduction complete ####################################################################### .if \ENC_DEC == ENC - vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer .endif ####################################################################### @@ -2359,7 +2342,7 @@ _initial_blocks_done\@: ## Karatsuba Method - vmovdqa HashKey_8(arg1), \T5 + vmovdqu HashKey_8(arg2), \T5 vpshufd $0b01001110, \XMM1, \T2 vpshufd $0b01001110, \T5, \T3 @@ -2373,7 +2356,7 @@ _initial_blocks_done\@: ###################### - vmovdqa HashKey_7(arg1), \T5 + vmovdqu HashKey_7(arg2), \T5 vpshufd $0b01001110, \XMM2, \T2 vpshufd $0b01001110, \T5, \T3 vpxor \XMM2, \T2, \T2 @@ -2391,7 +2374,7 @@ _initial_blocks_done\@: ###################### - vmovdqa HashKey_6(arg1), \T5 + vmovdqu HashKey_6(arg2), \T5 vpshufd $0b01001110, \XMM3, \T2 vpshufd $0b01001110, \T5, \T3 vpxor \XMM3, \T2, \T2 @@ -2409,7 +2392,7 @@ _initial_blocks_done\@: ###################### - vmovdqa HashKey_5(arg1), \T5 + vmovdqu HashKey_5(arg2), \T5 vpshufd $0b01001110, \XMM4, \T2 vpshufd $0b01001110, \T5, \T3 vpxor \XMM4, \T2, \T2 @@ -2427,7 +2410,7 @@ _initial_blocks_done\@: ###################### - vmovdqa HashKey_4(arg1), \T5 + vmovdqu HashKey_4(arg2), \T5 vpshufd $0b01001110, \XMM5, \T2 vpshufd $0b01001110, \T5, \T3 vpxor \XMM5, \T2, \T2 @@ -2445,7 +2428,7 @@ _initial_blocks_done\@: ###################### - vmovdqa HashKey_3(arg1), \T5 + vmovdqu HashKey_3(arg2), \T5 vpshufd $0b01001110, \XMM6, \T2 vpshufd $0b01001110, \T5, \T3 vpxor \XMM6, \T2, \T2 @@ -2463,7 +2446,7 @@ _initial_blocks_done\@: ###################### - vmovdqa HashKey_2(arg1), \T5 + vmovdqu HashKey_2(arg2), \T5 vpshufd $0b01001110, \XMM7, \T2 vpshufd $0b01001110, \T5, \T3 vpxor \XMM7, \T2, \T2 @@ -2481,7 +2464,7 @@ _initial_blocks_done\@: ###################### - vmovdqa HashKey(arg1), \T5 + vmovdqu HashKey(arg2), \T5 vpshufd $0b01001110, \XMM8, \T2 vpshufd $0b01001110, \T5, \T3 vpxor \XMM8, \T2, \T2 @@ -2537,6 +2520,7 @@ _initial_blocks_done\@: ############################################################# #void aesni_gcm_precomp_avx_gen4 # (gcm_data *my_ctx_data, +# gcm_context_data *data, # u8 *hash_subkey)# /* H, the Hash sub key input. # Data starts on a 16-byte boundary. */ ############################################################# @@ -2554,7 +2538,7 @@ ENTRY(aesni_gcm_precomp_avx_gen4) sub $VARIABLE_OFFSET, %rsp and $~63, %rsp # align rsp to 64 bytes - vmovdqu (arg2), %xmm6 # xmm6 = HashKey + vmovdqu (arg3), %xmm6 # xmm6 = HashKey vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey @@ -2571,7 +2555,7 @@ ENTRY(aesni_gcm_precomp_avx_gen4) vpand POLY(%rip), %xmm2, %xmm2 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly ####################################################################### - vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly + vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 @@ -2589,6 +2573,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen4) ############################################################################### #void aesni_gcm_enc_avx_gen4( # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ # const u8 *in, /* Plaintext input */ # u64 plaintext_len, /* Length of data in Bytes for encryption. */ @@ -2610,6 +2595,7 @@ ENDPROC(aesni_gcm_enc_avx_gen4) ############################################################################### #void aesni_gcm_dec_avx_gen4( # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ # const u8 *in, /* Ciphertext input */ # u64 plaintext_len, /* Length of data in Bytes for encryption. */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 661f7daf43da..d8b8cb33f608 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -84,7 +84,7 @@ struct gcm_context_data { u8 current_counter[GCM_BLOCK_LEN]; u64 partial_block_len; u64 unused; - u8 hash_keys[GCM_BLOCK_LEN * 8]; + u8 hash_keys[GCM_BLOCK_LEN * 16]; }; asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, @@ -187,14 +187,18 @@ asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, * gcm_data *my_ctx_data, context data * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. */ -asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey); +asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, + struct gcm_context_data *gdata, + u8 *hash_subkey); -asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out, +asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, + struct gcm_context_data *gdata, u8 *out, const u8 *in, unsigned long plaintext_len, u8 *iv, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); -asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out, +asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, + struct gcm_context_data *gdata, u8 *out, const u8 *in, unsigned long ciphertext_len, u8 *iv, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); @@ -211,9 +215,9 @@ static void aesni_gcm_enc_avx(void *ctx, plaintext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else { - aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); - aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); + aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey); + aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv, + aad, aad_len, auth_tag, auth_tag_len); } } @@ -229,9 +233,9 @@ static void aesni_gcm_dec_avx(void *ctx, ciphertext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else { - aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); - aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); + aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey); + aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv, + aad, aad_len, auth_tag, auth_tag_len); } } #endif @@ -242,14 +246,18 @@ static void aesni_gcm_dec_avx(void *ctx, * gcm_data *my_ctx_data, context data * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. */ -asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey); +asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, + struct gcm_context_data *gdata, + u8 *hash_subkey); -asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out, +asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, + struct gcm_context_data *gdata, u8 *out, const u8 *in, unsigned long plaintext_len, u8 *iv, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); -asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out, +asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, + struct gcm_context_data *gdata, u8 *out, const u8 *in, unsigned long ciphertext_len, u8 *iv, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); @@ -266,13 +274,13 @@ static void aesni_gcm_enc_avx2(void *ctx, plaintext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else if (plaintext_len < AVX_GEN4_OPTSIZE) { - aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); - aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); + aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey); + aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv, + aad, aad_len, auth_tag, auth_tag_len); } else { - aesni_gcm_precomp_avx_gen4(ctx, hash_subkey); - aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); + aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey); + aesni_gcm_enc_avx_gen4(ctx, data, out, in, plaintext_len, iv, + aad, aad_len, auth_tag, auth_tag_len); } } @@ -288,13 +296,13 @@ static void aesni_gcm_dec_avx2(void *ctx, ciphertext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else if (ciphertext_len < AVX_GEN4_OPTSIZE) { - aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); - aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); + aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey); + aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv, + aad, aad_len, auth_tag, auth_tag_len); } else { - aesni_gcm_precomp_avx_gen4(ctx, hash_subkey); - aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); + aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey); + aesni_gcm_dec_avx_gen4(ctx, data, out, in, ciphertext_len, iv, + aad, aad_len, auth_tag, auth_tag_len); } } #endif -- cgit v1.2.3-59-g8ed1b From 2426f64bc51fc86951b735d2247d6eb89259d580 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:57:12 +0000 Subject: crypto: aesni - Macro-ify func save/restore Macro-ify function save and restore. These will be used in new functions added for scatter/gather update operations. Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 94 ++++++++++++-------------------- 1 file changed, 36 insertions(+), 58 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 284f1b8b88fc..dd895f69399b 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -247,6 +247,30 @@ VARIABLE_OFFSET = 16*8 # Utility Macros ################################ +.macro FUNC_SAVE + #the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, %r14 + + + + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes +.endm + +.macro FUNC_RESTORE + mov %r14, %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 +.endm + # Encryption of a single block .macro ENCRYPT_SINGLE_BLOCK XMM0 vpxor (arg1), \XMM0, \XMM0 @@ -264,22 +288,6 @@ VARIABLE_OFFSET = 16*8 # clobbering all xmm registers # clobbering r10, r11, r12, r13, r14, r15 .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC - - #the number of pushes must equal STACK_OFFSET - push %r12 - push %r13 - push %r14 - push %r15 - - mov %rsp, %r14 - - - - - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes - - vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey mov arg5, %r13 # save the number of bytes of plaintext/ciphertext @@ -566,12 +574,6 @@ _T_16\@: vmovdqu %xmm9, (%r10) _return_T_done\@: - mov %r14, %rsp - - pop %r15 - pop %r14 - pop %r13 - pop %r12 .endm #ifdef CONFIG_AS_AVX @@ -1511,18 +1513,7 @@ _initial_blocks_done\@: # u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ ############################################################# ENTRY(aesni_gcm_precomp_avx_gen2) - #the number of pushes must equal STACK_OFFSET - push %r12 - push %r13 - push %r14 - push %r15 - - mov %rsp, %r14 - - - - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes + FUNC_SAVE vmovdqu (arg3), %xmm6 # xmm6 = HashKey @@ -1546,12 +1537,7 @@ ENTRY(aesni_gcm_precomp_avx_gen2) PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 - mov %r14, %rsp - - pop %r15 - pop %r14 - pop %r13 - pop %r12 + FUNC_RESTORE ret ENDPROC(aesni_gcm_precomp_avx_gen2) @@ -1573,7 +1559,9 @@ ENDPROC(aesni_gcm_precomp_avx_gen2) # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### ENTRY(aesni_gcm_enc_avx_gen2) + FUNC_SAVE GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX ENC + FUNC_RESTORE ret ENDPROC(aesni_gcm_enc_avx_gen2) @@ -1595,7 +1583,9 @@ ENDPROC(aesni_gcm_enc_avx_gen2) # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### ENTRY(aesni_gcm_dec_avx_gen2) + FUNC_SAVE GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX DEC + FUNC_RESTORE ret ENDPROC(aesni_gcm_dec_avx_gen2) #endif /* CONFIG_AS_AVX */ @@ -2525,18 +2515,7 @@ _initial_blocks_done\@: # Data starts on a 16-byte boundary. */ ############################################################# ENTRY(aesni_gcm_precomp_avx_gen4) - #the number of pushes must equal STACK_OFFSET - push %r12 - push %r13 - push %r14 - push %r15 - - mov %rsp, %r14 - - - - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes + FUNC_SAVE vmovdqu (arg3), %xmm6 # xmm6 = HashKey @@ -2560,12 +2539,7 @@ ENTRY(aesni_gcm_precomp_avx_gen4) PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 - mov %r14, %rsp - - pop %r15 - pop %r14 - pop %r13 - pop %r12 + FUNC_RESTORE ret ENDPROC(aesni_gcm_precomp_avx_gen4) @@ -2588,7 +2562,9 @@ ENDPROC(aesni_gcm_precomp_avx_gen4) # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### ENTRY(aesni_gcm_enc_avx_gen4) + FUNC_SAVE GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 ENC + FUNC_RESTORE ret ENDPROC(aesni_gcm_enc_avx_gen4) @@ -2610,7 +2586,9 @@ ENDPROC(aesni_gcm_enc_avx_gen4) # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### ENTRY(aesni_gcm_dec_avx_gen4) + FUNC_SAVE GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 DEC + FUNC_RESTORE ret ENDPROC(aesni_gcm_dec_avx_gen4) -- cgit v1.2.3-59-g8ed1b From 5350b0f563433dacc134214a452fd316b36251d6 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:57:36 +0000 Subject: crypto: aesni - support 256 byte keys in avx asm Add support for 192/256-bit keys using the avx gcm/aes routines. The sse routines were previously updated in e31ac32d3b (Add support for 192 & 256 bit keys to AESNI RFC4106). Instead of adding an additional loop in the hotpath as in e31ac32d3b, this diff instead generates separate versions of the code using macros, and the entry routines choose which version once. This results in a 5% performance improvement vs. adding a loop to the hot path. This is the same strategy chosen by the intel isa-l_crypto library. The key size checks are removed from the c code where appropriate. Note that this diff depends on using gcm_context_data - 256 bit keys require 16 HashKeys + 15 expanded keys, which is larger than struct crypto_aes_ctx, so they are stored in struct gcm_context_data. Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 188 +++++++++++++++++++++++-------- arch/x86/crypto/aesni-intel_glue.c | 18 +-- 2 files changed, 145 insertions(+), 61 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index dd895f69399b..2aa11c503bb9 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -209,6 +209,7 @@ HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsu #define arg8 STACK_OFFSET+8*2(%r14) #define arg9 STACK_OFFSET+8*3(%r14) #define arg10 STACK_OFFSET+8*4(%r14) +#define keysize 2*15*16(arg1) i = 0 j = 0 @@ -272,22 +273,22 @@ VARIABLE_OFFSET = 16*8 .endm # Encryption of a single block -.macro ENCRYPT_SINGLE_BLOCK XMM0 +.macro ENCRYPT_SINGLE_BLOCK REP XMM0 vpxor (arg1), \XMM0, \XMM0 - i = 1 - setreg -.rep 9 + i = 1 + setreg +.rep \REP vaesenc 16*i(arg1), \XMM0, \XMM0 - i = (i+1) - setreg + i = (i+1) + setreg .endr - vaesenclast 16*10(arg1), \XMM0, \XMM0 + vaesenclast 16*i(arg1), \XMM0, \XMM0 .endm # combined for GCM encrypt and decrypt functions # clobbering all xmm registers # clobbering r10, r11, r12, r13, r14, r15 -.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC +.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey mov arg5, %r13 # save the number of bytes of plaintext/ciphertext @@ -314,42 +315,42 @@ VARIABLE_OFFSET = 16*8 jmp _initial_num_blocks_is_1\@ _initial_num_blocks_is_7\@: - \INITIAL_BLOCKS 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*7, %r13 jmp _initial_blocks_encrypted\@ _initial_num_blocks_is_6\@: - \INITIAL_BLOCKS 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*6, %r13 jmp _initial_blocks_encrypted\@ _initial_num_blocks_is_5\@: - \INITIAL_BLOCKS 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*5, %r13 jmp _initial_blocks_encrypted\@ _initial_num_blocks_is_4\@: - \INITIAL_BLOCKS 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*4, %r13 jmp _initial_blocks_encrypted\@ _initial_num_blocks_is_3\@: - \INITIAL_BLOCKS 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*3, %r13 jmp _initial_blocks_encrypted\@ _initial_num_blocks_is_2\@: - \INITIAL_BLOCKS 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*2, %r13 jmp _initial_blocks_encrypted\@ _initial_num_blocks_is_1\@: - \INITIAL_BLOCKS 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*1, %r13 jmp _initial_blocks_encrypted\@ _initial_num_blocks_is_0\@: - \INITIAL_BLOCKS 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC _initial_blocks_encrypted\@: @@ -374,7 +375,7 @@ _encrypt_by_8_new\@: add $8, %r15b - \GHASH_8_ENCRYPT_8_PARALLEL %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC + \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC add $128, %r11 sub $128, %r13 jne _encrypt_by_8_new\@ @@ -385,7 +386,7 @@ _encrypt_by_8_new\@: _encrypt_by_8\@: vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 add $8, %r15b - \GHASH_8_ENCRYPT_8_PARALLEL %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC + \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 add $128, %r11 sub $128, %r13 @@ -414,7 +415,7 @@ _zero_cipher_left\@: vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) sub $16, %r11 add %r13, %r11 @@ -440,7 +441,7 @@ _only_less_than_16\@: vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) lea SHIFT_MASK+16(%rip), %r12 @@ -525,7 +526,7 @@ _multiple_of_16_bytes\@: mov arg6, %rax # rax = *Y0 vmovdqu (%rax), %xmm9 # xmm9 = Y0 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) + ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0) vpxor %xmm14, %xmm9, %xmm9 @@ -690,7 +691,7 @@ _return_T_done\@: ## r10, r11, r12, rax are clobbered ## arg1, arg3, arg4, r14 are used as a pointer only, not modified -.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC +.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC i = (8-\num_initial_blocks) j = 0 setreg @@ -786,10 +787,10 @@ _get_AAD_done\@: setreg .endr - j = 1 - setreg -.rep 9 - vmovdqa 16*j(arg1), \T_key + j = 1 + setreg +.rep \REP + vmovdqa 16*j(arg1), \T_key i = (9-\num_initial_blocks) setreg .rep \num_initial_blocks @@ -798,12 +799,11 @@ _get_AAD_done\@: setreg .endr - j = (j+1) - setreg + j = (j+1) + setreg .endr - - vmovdqa 16*10(arg1), \T_key + vmovdqa 16*j(arg1), \T_key i = (9-\num_initial_blocks) setreg .rep \num_initial_blocks @@ -891,9 +891,9 @@ _get_AAD_done\@: vpxor \T_key, \XMM7, \XMM7 vpxor \T_key, \XMM8, \XMM8 - i = 1 - setreg -.rep 9 # do 9 rounds + i = 1 + setreg +.rep \REP # do REP rounds vmovdqa 16*i(arg1), \T_key vaesenc \T_key, \XMM1, \XMM1 vaesenc \T_key, \XMM2, \XMM2 @@ -903,11 +903,10 @@ _get_AAD_done\@: vaesenc \T_key, \XMM6, \XMM6 vaesenc \T_key, \XMM7, \XMM7 vaesenc \T_key, \XMM8, \XMM8 - i = (i+1) - setreg + i = (i+1) + setreg .endr - vmovdqa 16*i(arg1), \T_key vaesenclast \T_key, \XMM1, \XMM1 vaesenclast \T_key, \XMM2, \XMM2 @@ -996,7 +995,7 @@ _initial_blocks_done\@: # ghash the 8 previously encrypted ciphertext blocks # arg1, arg3, arg4 are used as pointers only, not modified # r11 is the data offset value -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC +.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC vmovdqa \XMM1, \T2 vmovdqa \XMM2, TMP2(%rsp) @@ -1262,6 +1261,24 @@ _initial_blocks_done\@: vmovdqu 16*10(arg1), \T5 + i = 11 + setreg +.rep (\REP-9) + + vaesenc \T5, \XMM1, \XMM1 + vaesenc \T5, \XMM2, \XMM2 + vaesenc \T5, \XMM3, \XMM3 + vaesenc \T5, \XMM4, \XMM4 + vaesenc \T5, \XMM5, \XMM5 + vaesenc \T5, \XMM6, \XMM6 + vaesenc \T5, \XMM7, \XMM7 + vaesenc \T5, \XMM8, \XMM8 + + vmovdqu 16*i(arg1), \T5 + i = i + 1 + setreg +.endr + i = 0 j = 1 setreg @@ -1560,9 +1577,23 @@ ENDPROC(aesni_gcm_precomp_avx_gen2) ############################################################################### ENTRY(aesni_gcm_enc_avx_gen2) FUNC_SAVE - GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX ENC + mov keysize, %eax + cmp $32, %eax + je key_256_enc + cmp $16, %eax + je key_128_enc + # must be 192 + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11 FUNC_RESTORE - ret + ret +key_128_enc: + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9 + FUNC_RESTORE + ret +key_256_enc: + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 + FUNC_RESTORE + ret ENDPROC(aesni_gcm_enc_avx_gen2) ############################################################################### @@ -1584,9 +1615,23 @@ ENDPROC(aesni_gcm_enc_avx_gen2) ############################################################################### ENTRY(aesni_gcm_dec_avx_gen2) FUNC_SAVE - GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX DEC + mov keysize,%eax + cmp $32, %eax + je key_256_dec + cmp $16, %eax + je key_128_dec + # must be 192 + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11 FUNC_RESTORE - ret + ret +key_128_dec: + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9 + FUNC_RESTORE + ret +key_256_dec: + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 + FUNC_RESTORE + ret ENDPROC(aesni_gcm_dec_avx_gen2) #endif /* CONFIG_AS_AVX */ @@ -1671,7 +1716,7 @@ ENDPROC(aesni_gcm_dec_avx_gen2) ## r10, r11, r12, rax are clobbered ## arg1, arg3, arg4, r14 are used as a pointer only, not modified -.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER +.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER i = (8-\num_initial_blocks) j = 0 setreg @@ -1770,7 +1815,7 @@ _get_AAD_done\@: j = 1 setreg -.rep 9 +.rep \REP vmovdqa 16*j(arg1), \T_key i = (9-\num_initial_blocks) setreg @@ -1785,7 +1830,7 @@ _get_AAD_done\@: .endr - vmovdqa 16*10(arg1), \T_key + vmovdqa 16*j(arg1), \T_key i = (9-\num_initial_blocks) setreg .rep \num_initial_blocks @@ -1876,7 +1921,7 @@ _get_AAD_done\@: i = 1 setreg -.rep 9 # do 9 rounds +.rep \REP # do REP rounds vmovdqa 16*i(arg1), \T_key vaesenc \T_key, \XMM1, \XMM1 vaesenc \T_key, \XMM2, \XMM2 @@ -1983,7 +2028,7 @@ _initial_blocks_done\@: # ghash the 8 previously encrypted ciphertext blocks # arg1, arg3, arg4 are used as pointers only, not modified # r11 is the data offset value -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC +.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC vmovdqa \XMM1, \T2 vmovdqa \XMM2, TMP2(%rsp) @@ -2252,6 +2297,23 @@ _initial_blocks_done\@: vmovdqu 16*10(arg1), \T5 + i = 11 + setreg +.rep (\REP-9) + vaesenc \T5, \XMM1, \XMM1 + vaesenc \T5, \XMM2, \XMM2 + vaesenc \T5, \XMM3, \XMM3 + vaesenc \T5, \XMM4, \XMM4 + vaesenc \T5, \XMM5, \XMM5 + vaesenc \T5, \XMM6, \XMM6 + vaesenc \T5, \XMM7, \XMM7 + vaesenc \T5, \XMM8, \XMM8 + + vmovdqu 16*i(arg1), \T5 + i = i + 1 + setreg +.endr + i = 0 j = 1 setreg @@ -2563,7 +2625,21 @@ ENDPROC(aesni_gcm_precomp_avx_gen4) ############################################################################### ENTRY(aesni_gcm_enc_avx_gen4) FUNC_SAVE - GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 ENC + mov keysize,%eax + cmp $32, %eax + je key_256_enc4 + cmp $16, %eax + je key_128_enc4 + # must be 192 + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 + FUNC_RESTORE + ret +key_128_enc4: + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9 + FUNC_RESTORE + ret +key_256_enc4: + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 FUNC_RESTORE ret ENDPROC(aesni_gcm_enc_avx_gen4) @@ -2587,9 +2663,23 @@ ENDPROC(aesni_gcm_enc_avx_gen4) ############################################################################### ENTRY(aesni_gcm_dec_avx_gen4) FUNC_SAVE - GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 DEC + mov keysize,%eax + cmp $32, %eax + je key_256_dec4 + cmp $16, %eax + je key_128_dec4 + # must be 192 + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 FUNC_RESTORE - ret + ret +key_128_dec4: + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9 + FUNC_RESTORE + ret +key_256_dec4: + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 + FUNC_RESTORE + ret ENDPROC(aesni_gcm_dec_avx_gen4) #endif /* CONFIG_AS_AVX2 */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index d8b8cb33f608..7d1259feb0f9 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -209,8 +209,7 @@ static void aesni_gcm_enc_avx(void *ctx, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { - struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; - if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){ + if (plaintext_len < AVX_GEN2_OPTSIZE) { aesni_gcm_enc(ctx, data, out, in, plaintext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); @@ -227,8 +226,7 @@ static void aesni_gcm_dec_avx(void *ctx, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { - struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; - if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { + if (ciphertext_len < AVX_GEN2_OPTSIZE) { aesni_gcm_dec(ctx, data, out, in, ciphertext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); @@ -268,8 +266,7 @@ static void aesni_gcm_enc_avx2(void *ctx, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { - struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; - if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { + if (plaintext_len < AVX_GEN2_OPTSIZE) { aesni_gcm_enc(ctx, data, out, in, plaintext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); @@ -290,8 +287,7 @@ static void aesni_gcm_dec_avx2(void *ctx, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { - struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; - if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { + if (ciphertext_len < AVX_GEN2_OPTSIZE) { aesni_gcm_dec(ctx, data, out, in, ciphertext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); @@ -928,8 +924,7 @@ static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, struct scatter_walk dst_sg_walk = {}; struct gcm_context_data data AESNI_ALIGN_ATTR; - if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 || - aesni_gcm_enc_tfm == aesni_gcm_enc || + if (aesni_gcm_enc_tfm == aesni_gcm_enc || req->cryptlen < AVX_GEN2_OPTSIZE) { return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, aes_ctx); @@ -1000,8 +995,7 @@ static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, struct gcm_context_data data AESNI_ALIGN_ATTR; int retval = 0; - if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 || - aesni_gcm_enc_tfm == aesni_gcm_enc || + if (aesni_gcm_enc_tfm == aesni_gcm_enc || req->cryptlen < AVX_GEN2_OPTSIZE) { return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, aes_ctx); -- cgit v1.2.3-59-g8ed1b From e377bedb09d6970ad27d7714b0a6365ee7e4d732 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:57:49 +0000 Subject: crypto: aesni - Add GCM_COMPLETE macro Merge encode and decode tag calculations in GCM_COMPLETE macro. Scatter/gather routines will call this once at the end of encryption or decryption. Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 2aa11c503bb9..8e9ae4b26118 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -510,6 +510,14 @@ _less_than_8_bytes_left\@: ############################# _multiple_of_16_bytes\@: + GCM_COMPLETE \GHASH_MUL \REP +.endm + + +# GCM_COMPLETE Finishes update of tag of last partial block +# Output: Authorization Tag (AUTH_TAG) +# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 +.macro GCM_COMPLETE GHASH_MUL REP mov arg8, %r12 # r12 = aadLen (number of bytes) shl $3, %r12 # convert into number of bits vmovd %r12d, %xmm15 # len(A) in xmm15 -- cgit v1.2.3-59-g8ed1b From 38003cd26c9f59da77d98927fb9af58732da207a Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:58:19 +0000 Subject: crypto: aesni - Split AAD hash calculation to separate macro AAD hash only needs to be calculated once for each scatter/gather operation. Move it to its own macro, and call it from GCM_INIT instead of INITIAL_BLOCKS. Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 228 +++++++++++++------------------ arch/x86/crypto/aesni-intel_glue.c | 28 ++-- 2 files changed, 115 insertions(+), 141 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 8e9ae4b26118..305abece93ad 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -182,6 +182,14 @@ aad_shift_arr: .text +#define AadHash 16*0 +#define AadLen 16*1 +#define InLen (16*1)+8 +#define PBlockEncKey 16*2 +#define OrigIV 16*3 +#define CurCount 16*4 +#define PBlockLen 16*5 + HashKey = 16*6 # store HashKey <<1 mod poly here HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here @@ -585,6 +593,74 @@ _T_16\@: _return_T_done\@: .endm +.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8 + + mov \AAD, %r10 # r10 = AAD + mov \AADLEN, %r12 # r12 = aadLen + + + mov %r12, %r11 + + vpxor \T8, \T8, \T8 + vpxor \T7, \T7, \T7 + cmp $16, %r11 + jl _get_AAD_rest8\@ +_get_AAD_blocks\@: + vmovdqu (%r10), \T7 + vpshufb SHUF_MASK(%rip), \T7, \T7 + vpxor \T7, \T8, \T8 + \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\@ + vmovdqu \T8, \T7 + cmp $0, %r11 + je _get_AAD_done\@ + + vpxor \T7, \T7, \T7 + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some CT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\@: + cmp $4, %r11 + jle _get_AAD_rest4\@ + movq (%r10), \T1 + add $8, %r10 + sub $8, %r11 + vpslldq $8, \T1, \T1 + vpsrldq $8, \T7, \T7 + vpxor \T1, \T7, \T7 + jmp _get_AAD_rest8\@ +_get_AAD_rest4\@: + cmp $0, %r11 + jle _get_AAD_rest0\@ + mov (%r10), %eax + movq %rax, \T1 + add $4, %r10 + sub $4, %r11 + vpslldq $12, \T1, \T1 + vpsrldq $4, \T7, \T7 + vpxor \T1, \T7, \T7 +_get_AAD_rest0\@: + /* finalize: shift out the extra bytes we read, and align + left. since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + vmovdqu aad_shift_arr(%r11), \T1 + vpshufb \T1, \T7, \T7 +_get_AAD_rest_final\@: + vpshufb SHUF_MASK(%rip), \T7, \T7 + vpxor \T8, \T7, \T7 + \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6 + +_get_AAD_done\@: + vmovdqu \T7, AadHash(arg2) +.endm + #ifdef CONFIG_AS_AVX ############################################################################### # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) @@ -701,72 +777,9 @@ _return_T_done\@: .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC i = (8-\num_initial_blocks) - j = 0 setreg + vmovdqu AadHash(arg2), reg_i - mov arg7, %r10 # r10 = AAD - mov arg8, %r12 # r12 = aadLen - - - mov %r12, %r11 - - vpxor reg_j, reg_j, reg_j - vpxor reg_i, reg_i, reg_i - cmp $16, %r11 - jl _get_AAD_rest8\@ -_get_AAD_blocks\@: - vmovdqu (%r10), reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 - add $16, %r10 - sub $16, %r12 - sub $16, %r11 - cmp $16, %r11 - jge _get_AAD_blocks\@ - vmovdqu reg_j, reg_i - cmp $0, %r11 - je _get_AAD_done\@ - - vpxor reg_i, reg_i, reg_i - - /* read the last <16B of AAD. since we have at least 4B of - data right after the AAD (the ICV, and maybe some CT), we can - read 4B/8B blocks safely, and then get rid of the extra stuff */ -_get_AAD_rest8\@: - cmp $4, %r11 - jle _get_AAD_rest4\@ - movq (%r10), \T1 - add $8, %r10 - sub $8, %r11 - vpslldq $8, \T1, \T1 - vpsrldq $8, reg_i, reg_i - vpxor \T1, reg_i, reg_i - jmp _get_AAD_rest8\@ -_get_AAD_rest4\@: - cmp $0, %r11 - jle _get_AAD_rest0\@ - mov (%r10), %eax - movq %rax, \T1 - add $4, %r10 - sub $4, %r11 - vpslldq $12, \T1, \T1 - vpsrldq $4, reg_i, reg_i - vpxor \T1, reg_i, reg_i -_get_AAD_rest0\@: - /* finalize: shift out the extra bytes we read, and align - left. since pslldq can only shift by an immediate, we use - vpshufb and an array of shuffle masks */ - movq %r12, %r11 - salq $4, %r11 - movdqu aad_shift_arr(%r11), \T1 - vpshufb \T1, reg_i, reg_i -_get_AAD_rest_final\@: - vpshufb SHUF_MASK(%rip), reg_i, reg_i - vpxor reg_j, reg_i, reg_i - GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 - -_get_AAD_done\@: # initialize the data pointer offset as zero xor %r11d, %r11d @@ -1535,7 +1548,13 @@ _initial_blocks_done\@: #void aesni_gcm_precomp_avx_gen2 # (gcm_data *my_ctx_data, # gcm_context_data *data, -# u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ +# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ +# u8 *iv, /* Pre-counter block j0: 4 byte salt +# (from Security Association) concatenated with 8 byte +# Initialisation Vector (from IPSec ESP Payload) +# concatenated with 0x00000001. 16-byte aligned pointer. */ +# const u8 *aad, /* Additional Authentication Data (AAD)*/ +# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ ############################################################# ENTRY(aesni_gcm_precomp_avx_gen2) FUNC_SAVE @@ -1560,6 +1579,8 @@ ENTRY(aesni_gcm_precomp_avx_gen2) vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly + CALC_AAD_HASH GHASH_MUL_AVX, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 + PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 FUNC_RESTORE @@ -1716,7 +1737,6 @@ ENDPROC(aesni_gcm_dec_avx_gen2) .endm - ## if a = number of total plaintext bytes ## b = floor(a/16) ## num_initial_blocks = b mod 4# @@ -1726,73 +1746,9 @@ ENDPROC(aesni_gcm_dec_avx_gen2) .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER i = (8-\num_initial_blocks) - j = 0 setreg + vmovdqu AadHash(arg2), reg_i - mov arg7, %r10 # r10 = AAD - mov arg8, %r12 # r12 = aadLen - - - mov %r12, %r11 - - vpxor reg_j, reg_j, reg_j - vpxor reg_i, reg_i, reg_i - - cmp $16, %r11 - jl _get_AAD_rest8\@ -_get_AAD_blocks\@: - vmovdqu (%r10), reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 - add $16, %r10 - sub $16, %r12 - sub $16, %r11 - cmp $16, %r11 - jge _get_AAD_blocks\@ - vmovdqu reg_j, reg_i - cmp $0, %r11 - je _get_AAD_done\@ - - vpxor reg_i, reg_i, reg_i - - /* read the last <16B of AAD. since we have at least 4B of - data right after the AAD (the ICV, and maybe some CT), we can - read 4B/8B blocks safely, and then get rid of the extra stuff */ -_get_AAD_rest8\@: - cmp $4, %r11 - jle _get_AAD_rest4\@ - movq (%r10), \T1 - add $8, %r10 - sub $8, %r11 - vpslldq $8, \T1, \T1 - vpsrldq $8, reg_i, reg_i - vpxor \T1, reg_i, reg_i - jmp _get_AAD_rest8\@ -_get_AAD_rest4\@: - cmp $0, %r11 - jle _get_AAD_rest0\@ - mov (%r10), %eax - movq %rax, \T1 - add $4, %r10 - sub $4, %r11 - vpslldq $12, \T1, \T1 - vpsrldq $4, reg_i, reg_i - vpxor \T1, reg_i, reg_i -_get_AAD_rest0\@: - /* finalize: shift out the extra bytes we read, and align - left. since pslldq can only shift by an immediate, we use - vpshufb and an array of shuffle masks */ - movq %r12, %r11 - salq $4, %r11 - movdqu aad_shift_arr(%r11), \T1 - vpshufb \T1, reg_i, reg_i -_get_AAD_rest_final\@: - vpshufb SHUF_MASK(%rip), reg_i, reg_i - vpxor reg_j, reg_i, reg_i - GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 - -_get_AAD_done\@: # initialize the data pointer offset as zero xor %r11d, %r11d @@ -2581,8 +2537,13 @@ _initial_blocks_done\@: #void aesni_gcm_precomp_avx_gen4 # (gcm_data *my_ctx_data, # gcm_context_data *data, -# u8 *hash_subkey)# /* H, the Hash sub key input. -# Data starts on a 16-byte boundary. */ +# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ +# u8 *iv, /* Pre-counter block j0: 4 byte salt +# (from Security Association) concatenated with 8 byte +# Initialisation Vector (from IPSec ESP Payload) +# concatenated with 0x00000001. 16-byte aligned pointer. */ +# const u8 *aad, /* Additional Authentication Data (AAD)*/ +# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ ############################################################# ENTRY(aesni_gcm_precomp_avx_gen4) FUNC_SAVE @@ -2606,6 +2567,7 @@ ENTRY(aesni_gcm_precomp_avx_gen4) ####################################################################### vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly + CALC_AAD_HASH GHASH_MUL_AVX2, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 7d1259feb0f9..2648842f1c3f 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -189,7 +189,10 @@ asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, */ asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, struct gcm_context_data *gdata, - u8 *hash_subkey); + u8 *hash_subkey, + u8 *iv, + const u8 *aad, + unsigned long aad_len); asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, struct gcm_context_data *gdata, u8 *out, @@ -214,7 +217,8 @@ static void aesni_gcm_enc_avx(void *ctx, plaintext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else { - aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey); + aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv, + aad, aad_len); aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv, aad, aad_len, auth_tag, auth_tag_len); } @@ -231,7 +235,8 @@ static void aesni_gcm_dec_avx(void *ctx, ciphertext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else { - aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey); + aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv, + aad, aad_len); aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv, aad, aad_len, auth_tag, auth_tag_len); } @@ -246,7 +251,10 @@ static void aesni_gcm_dec_avx(void *ctx, */ asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, struct gcm_context_data *gdata, - u8 *hash_subkey); + u8 *hash_subkey, + u8 *iv, + const u8 *aad, + unsigned long aad_len); asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, struct gcm_context_data *gdata, u8 *out, @@ -271,11 +279,13 @@ static void aesni_gcm_enc_avx2(void *ctx, plaintext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else if (plaintext_len < AVX_GEN4_OPTSIZE) { - aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey); + aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv, + aad, aad_len); aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv, aad, aad_len, auth_tag, auth_tag_len); } else { - aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey); + aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey, iv, + aad, aad_len); aesni_gcm_enc_avx_gen4(ctx, data, out, in, plaintext_len, iv, aad, aad_len, auth_tag, auth_tag_len); } @@ -292,11 +302,13 @@ static void aesni_gcm_dec_avx2(void *ctx, ciphertext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else if (ciphertext_len < AVX_GEN4_OPTSIZE) { - aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey); + aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv, + aad, aad_len); aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv, aad, aad_len, auth_tag, auth_tag_len); } else { - aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey); + aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey, iv, + aad, aad_len); aesni_gcm_dec_avx_gen4(ctx, data, out, in, ciphertext_len, iv, aad, aad_len, auth_tag, auth_tag_len); } -- cgit v1.2.3-59-g8ed1b From 1cb1bcbb567d10bdd9de9d03964c5e2958f0d111 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:58:38 +0000 Subject: crypto: aesni - Merge avx precompute functions The precompute functions differ only by the sub-macros they call, merge them to a single macro. Later diffs add more code to fill in the gcm_context_data structure, this allows changes in a single place. Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 76 ++++++++++++-------------------- 1 file changed, 27 insertions(+), 49 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 305abece93ad..e347ba61db65 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -661,6 +661,31 @@ _get_AAD_done\@: vmovdqu \T7, AadHash(arg2) .endm +.macro INIT GHASH_MUL PRECOMPUTE + vmovdqu (arg3), %xmm6 # xmm6 = HashKey + + vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 + ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey + vmovdqa %xmm6, %xmm2 + vpsllq $1, %xmm6, %xmm6 + vpsrlq $63, %xmm2, %xmm2 + vmovdqa %xmm2, %xmm1 + vpslldq $8, %xmm2, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + #reduction + vpshufd $0b00100100, %xmm1, %xmm2 + vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 + vpand POLY(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly + ####################################################################### + vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly + + CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 + + \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 +.endm + #ifdef CONFIG_AS_AVX ############################################################################### # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) @@ -1558,31 +1583,7 @@ _initial_blocks_done\@: ############################################################# ENTRY(aesni_gcm_precomp_avx_gen2) FUNC_SAVE - - vmovdqu (arg3), %xmm6 # xmm6 = HashKey - - vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 - ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey - vmovdqa %xmm6, %xmm2 - vpsllq $1, %xmm6, %xmm6 - vpsrlq $63, %xmm2, %xmm2 - vmovdqa %xmm2, %xmm1 - vpslldq $8, %xmm2, %xmm2 - vpsrldq $8, %xmm1, %xmm1 - vpor %xmm2, %xmm6, %xmm6 - #reduction - vpshufd $0b00100100, %xmm1, %xmm2 - vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 - vpand POLY(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly - ####################################################################### - vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly - - - CALC_AAD_HASH GHASH_MUL_AVX, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 - - PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 - + INIT GHASH_MUL_AVX, PRECOMPUTE_AVX FUNC_RESTORE ret ENDPROC(aesni_gcm_precomp_avx_gen2) @@ -2547,30 +2548,7 @@ _initial_blocks_done\@: ############################################################# ENTRY(aesni_gcm_precomp_avx_gen4) FUNC_SAVE - - vmovdqu (arg3), %xmm6 # xmm6 = HashKey - - vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 - ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey - vmovdqa %xmm6, %xmm2 - vpsllq $1, %xmm6, %xmm6 - vpsrlq $63, %xmm2, %xmm2 - vmovdqa %xmm2, %xmm1 - vpslldq $8, %xmm2, %xmm2 - vpsrldq $8, %xmm1, %xmm1 - vpor %xmm2, %xmm6, %xmm6 - #reduction - vpshufd $0b00100100, %xmm1, %xmm2 - vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 - vpand POLY(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly - ####################################################################### - vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly - - CALC_AAD_HASH GHASH_MUL_AVX2, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 - - PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 - + INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 FUNC_RESTORE ret ENDPROC(aesni_gcm_precomp_avx_gen4) -- cgit v1.2.3-59-g8ed1b From a44b419fe5aee3bfe12c88698a023cb5c6067985 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:58:56 +0000 Subject: crypto: aesni - Fill in new context data structures Fill in aadhash, aadlen, pblocklen, curcount with appropriate values. pblocklen, aadhash, and pblockenckey are also updated at the end of each scatter/gather operation, to be carried over to the next operation. Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 51 +++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index e347ba61db65..0a9cdcfdd987 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -297,7 +297,9 @@ VARIABLE_OFFSET = 16*8 # clobbering all xmm registers # clobbering r10, r11, r12, r13, r14, r15 .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP + vmovdqu AadHash(arg2), %xmm8 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey + add arg5, InLen(arg2) mov arg5, %r13 # save the number of bytes of plaintext/ciphertext and $-16, %r13 # r13 = r13 - (r13 mod 16) @@ -410,6 +412,9 @@ _eight_cipher_left\@: _zero_cipher_left\@: + vmovdqu %xmm14, AadHash(arg2) + vmovdqu %xmm9, CurCount(arg2) + cmp $16, arg5 jl _only_less_than_16\@ @@ -420,10 +425,14 @@ _zero_cipher_left\@: # handle the last <16 Byte block seperately + mov %r13, PBlockLen(arg2) vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vmovdqu %xmm9, CurCount(arg2) vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) + vmovdqu %xmm9, PBlockEncKey(arg2) sub $16, %r11 add %r13, %r11 @@ -451,6 +460,7 @@ _only_less_than_16\@: vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) + vmovdqu %xmm9, PBlockEncKey(arg2) lea SHIFT_MASK+16(%rip), %r12 sub %r13, %r12 # adjust the shuffle mask pointer to be @@ -480,6 +490,7 @@ _final_ghash_mul\@: vpxor %xmm2, %xmm14, %xmm14 #GHASH computation for the last <16 Byte block \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + vmovdqu %xmm14, AadHash(arg2) sub %r13, %r11 add $16, %r11 .else @@ -491,6 +502,7 @@ _final_ghash_mul\@: vpxor %xmm9, %xmm14, %xmm14 #GHASH computation for the last <16 Byte block \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + vmovdqu %xmm14, AadHash(arg2) sub %r13, %r11 add $16, %r11 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext @@ -526,12 +538,16 @@ _multiple_of_16_bytes\@: # Output: Authorization Tag (AUTH_TAG) # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 .macro GCM_COMPLETE GHASH_MUL REP - mov arg8, %r12 # r12 = aadLen (number of bytes) + vmovdqu AadHash(arg2), %xmm14 + vmovdqu HashKey(arg2), %xmm13 + + mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) shl $3, %r12 # convert into number of bits vmovd %r12d, %xmm15 # len(A) in xmm15 - shl $3, arg5 # len(C) in bits (*128) - vmovq arg5, %xmm1 + mov InLen(arg2), %r12 + shl $3, %r12 # len(C) in bits (*128) + vmovq %r12, %xmm1 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) @@ -539,8 +555,7 @@ _multiple_of_16_bytes\@: \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap - mov arg6, %rax # rax = *Y0 - vmovdqu (%rax), %xmm9 # xmm9 = Y0 + vmovdqu OrigIV(arg2), %xmm9 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0) @@ -662,6 +677,20 @@ _get_AAD_done\@: .endm .macro INIT GHASH_MUL PRECOMPUTE + mov arg6, %r11 + mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length + xor %r11d, %r11d + mov %r11, InLen(arg2) # ctx_data.in_length = 0 + + mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0 + mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0 + mov arg4, %rax + movdqu (%rax), %xmm0 + movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv + + vpshufb SHUF_MASK(%rip), %xmm0, %xmm0 + movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv + vmovdqu (arg3), %xmm6 # xmm6 = HashKey vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 @@ -809,10 +838,7 @@ _get_AAD_done\@: xor %r11d, %r11d # start AES for num_initial_blocks blocks - mov arg6, %rax # rax = *Y0 - vmovdqu (%rax), \CTR # CTR = Y0 - vpshufb SHUF_MASK(%rip), \CTR, \CTR - + vmovdqu CurCount(arg2), \CTR i = (9-\num_initial_blocks) setreg @@ -1748,16 +1774,13 @@ ENDPROC(aesni_gcm_dec_avx_gen2) .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER i = (8-\num_initial_blocks) setreg - vmovdqu AadHash(arg2), reg_i + vmovdqu AadHash(arg2), reg_i # initialize the data pointer offset as zero xor %r11d, %r11d # start AES for num_initial_blocks blocks - mov arg6, %rax # rax = *Y0 - vmovdqu (%rax), \CTR # CTR = Y0 - vpshufb SHUF_MASK(%rip), \CTR, \CTR - + vmovdqu CurCount(arg2), \CTR i = (9-\num_initial_blocks) setreg -- cgit v1.2.3-59-g8ed1b From 517a448e09846732d46e983a2195002d05857919 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:59:11 +0000 Subject: crypto: aesni - Move ghash_mul to GCM_COMPLETE Prepare to handle partial blocks between scatter/gather calls. For the last partial block, we only want to calculate the aadhash in GCM_COMPLETE, and a new partial block macro will handle both aadhash update and encrypting partial blocks between calls. Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 0a9cdcfdd987..44a4a8b43ca4 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -488,8 +488,7 @@ _final_ghash_mul\@: vpand %xmm1, %xmm2, %xmm2 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 vpxor %xmm2, %xmm14, %xmm14 - #GHASH computation for the last <16 Byte block - \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + vmovdqu %xmm14, AadHash(arg2) sub %r13, %r11 add $16, %r11 @@ -500,8 +499,7 @@ _final_ghash_mul\@: vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 vpxor %xmm9, %xmm14, %xmm14 - #GHASH computation for the last <16 Byte block - \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + vmovdqu %xmm14, AadHash(arg2) sub %r13, %r11 add $16, %r11 @@ -541,6 +539,14 @@ _multiple_of_16_bytes\@: vmovdqu AadHash(arg2), %xmm14 vmovdqu HashKey(arg2), %xmm13 + mov PBlockLen(arg2), %r12 + cmp $0, %r12 + je _partial_done\@ + + #GHASH computation for the last <16 Byte block + \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + +_partial_done\@: mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) shl $3, %r12 # convert into number of bits vmovd %r12d, %xmm15 # len(A) in xmm15 -- cgit v1.2.3-59-g8ed1b From ec8c02d9a30b8324d7aae9e4e7a08973a8eaa8b4 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:59:26 +0000 Subject: crypto: aesni - Introduce READ_PARTIAL_BLOCK macro Introduce READ_PARTIAL_BLOCK macro, and use it in the two existing partial block cases: AAD and the end of ENC_DEC. In particular, the ENC_DEC case should be faster, since we read by 8/4 bytes if possible. This macro will also be used to read partial blocks between enc_update and dec_update calls. Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 102 ++++++++++++++++++------------- 1 file changed, 59 insertions(+), 43 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 44a4a8b43ca4..ff00ad19064d 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -415,68 +415,56 @@ _zero_cipher_left\@: vmovdqu %xmm14, AadHash(arg2) vmovdqu %xmm9, CurCount(arg2) - cmp $16, arg5 - jl _only_less_than_16\@ - + # check for 0 length mov arg5, %r13 and $15, %r13 # r13 = (arg5 mod 16) je _multiple_of_16_bytes\@ - # handle the last <16 Byte block seperately + # handle the last <16 Byte block separately mov %r13, PBlockLen(arg2) - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn vmovdqu %xmm9, CurCount(arg2) vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) vmovdqu %xmm9, PBlockEncKey(arg2) - sub $16, %r11 - add %r13, %r11 - vmovdqu (arg4, %r11), %xmm1 # receive the last <16 Byte block - - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer to be - # able to shift 16-r13 bytes (r13 is the - # number of bytes in plaintext mod 16) - vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask - vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes - jmp _final_ghash_mul\@ - -_only_less_than_16\@: - # check for 0 length - mov arg5, %r13 - and $15, %r13 # r13 = (arg5 mod 16) + cmp $16, arg5 + jge _large_enough_update\@ - je _multiple_of_16_bytes\@ + lea (arg4,%r11,1), %r10 + mov %r13, %r12 - # handle the last <16 Byte block separately - - - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) - - vmovdqu %xmm9, PBlockEncKey(arg2) + READ_PARTIAL_BLOCK %r10 %r12 %xmm1 lea SHIFT_MASK+16(%rip), %r12 sub %r13, %r12 # adjust the shuffle mask pointer to be # able to shift 16-r13 bytes (r13 is the - # number of bytes in plaintext mod 16) + # number of bytes in plaintext mod 16) -_get_last_16_byte_loop\@: - movb (arg4, %r11), %al - movb %al, TMP1 (%rsp , %r11) - add $1, %r11 - cmp %r13, %r11 - jne _get_last_16_byte_loop\@ + jmp _final_ghash_mul\@ + +_large_enough_update\@: + sub $16, %r11 + add %r13, %r11 + + # receive the last <16 Byte block + vmovdqu (arg4, %r11, 1), %xmm1 - vmovdqu TMP1(%rsp), %xmm1 + sub %r13, %r11 + add $16, %r11 - sub $16, %r11 + lea SHIFT_MASK+16(%rip), %r12 + # adjust the shuffle mask pointer to be able to shift 16-r13 bytes + # (r13 is the number of bytes in plaintext mod 16) + sub %r13, %r12 + # get the appropriate shuffle mask + vmovdqu (%r12), %xmm2 + # shift right 16-r13 bytes + vpshufb %xmm2, %xmm1, %xmm1 _final_ghash_mul\@: .if \ENC_DEC == DEC @@ -490,8 +478,6 @@ _final_ghash_mul\@: vpxor %xmm2, %xmm14, %xmm14 vmovdqu %xmm14, AadHash(arg2) - sub %r13, %r11 - add $16, %r11 .else vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to @@ -501,8 +487,6 @@ _final_ghash_mul\@: vpxor %xmm9, %xmm14, %xmm14 vmovdqu %xmm14, AadHash(arg2) - sub %r13, %r11 - add $16, %r11 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext .endif @@ -721,6 +705,38 @@ _get_AAD_done\@: \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 .endm + +# Reads DLEN bytes starting at DPTR and stores in XMMDst +# where 0 < DLEN < 16 +# Clobbers %rax, DLEN +.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst + vpxor \XMMDst, \XMMDst, \XMMDst + + cmp $8, \DLEN + jl _read_lt8_\@ + mov (\DPTR), %rax + vpinsrq $0, %rax, \XMMDst, \XMMDst + sub $8, \DLEN + jz _done_read_partial_block_\@ + xor %eax, %eax +_read_next_byte_\@: + shl $8, %rax + mov 7(\DPTR, \DLEN, 1), %al + dec \DLEN + jnz _read_next_byte_\@ + vpinsrq $1, %rax, \XMMDst, \XMMDst + jmp _done_read_partial_block_\@ +_read_lt8_\@: + xor %eax, %eax +_read_next_byte_lt8_\@: + shl $8, %rax + mov -1(\DPTR, \DLEN, 1), %al + dec \DLEN + jnz _read_next_byte_lt8_\@ + vpinsrq $0, %rax, \XMMDst, \XMMDst +_done_read_partial_block_\@: +.endm + #ifdef CONFIG_AS_AVX ############################################################################### # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -- cgit v1.2.3-59-g8ed1b From e044d5056396029cc12ed5354aa2a073b747195a Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:59:38 +0000 Subject: crypto: aesni - Introduce partial block macro Before this diff, multiple calls to GCM_ENC_DEC will succeed, but only if all calls are a multiple of 16 bytes. Handle partial blocks at the start of GCM_ENC_DEC, and update aadhash as appropriate. The data offset %r11 is also updated after the partial block. Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 156 +++++++++++++++++++++++++++++-- 1 file changed, 150 insertions(+), 6 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index ff00ad19064d..af45fc57db90 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -301,6 +301,12 @@ VARIABLE_OFFSET = 16*8 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey add arg5, InLen(arg2) + # initialize the data pointer offset as zero + xor %r11d, %r11d + + PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC + sub %r11, arg5 + mov arg5, %r13 # save the number of bytes of plaintext/ciphertext and $-16, %r13 # r13 = r13 - (r13 mod 16) @@ -737,6 +743,150 @@ _read_next_byte_lt8_\@: _done_read_partial_block_\@: .endm +# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks +# between update calls. +# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK +# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context +# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 +.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ + AAD_HASH ENC_DEC + mov PBlockLen(arg2), %r13 + cmp $0, %r13 + je _partial_block_done_\@ # Leave Macro if no partial blocks + # Read in input data without over reading + cmp $16, \PLAIN_CYPH_LEN + jl _fewer_than_16_bytes_\@ + vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm + jmp _data_read_\@ + +_fewer_than_16_bytes_\@: + lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 + mov \PLAIN_CYPH_LEN, %r12 + READ_PARTIAL_BLOCK %r10 %r12 %xmm1 + + mov PBlockLen(arg2), %r13 + +_data_read_\@: # Finished reading in data + + vmovdqu PBlockEncKey(arg2), %xmm9 + vmovdqu HashKey(arg2), %xmm13 + + lea SHIFT_MASK(%rip), %r12 + + # adjust the shuffle mask pointer to be able to shift r13 bytes + # r16-r13 is the number of bytes in plaintext mod 16) + add %r13, %r12 + vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask + vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes + +.if \ENC_DEC == DEC + vmovdqa %xmm1, %xmm3 + pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn) + + mov \PLAIN_CYPH_LEN, %r10 + add %r13, %r10 + # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling + sub $16, %r10 + # Determine if if partial block is not being filled and + # shift mask accordingly + jge _no_extra_mask_1_\@ + sub %r10, %r12 +_no_extra_mask_1_\@: + + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 + # get the appropriate mask to mask out bottom r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9 + + vpand %xmm1, %xmm3, %xmm3 + vmovdqa SHUF_MASK(%rip), %xmm10 + vpshufb %xmm10, %xmm3, %xmm3 + vpshufb %xmm2, %xmm3, %xmm3 + vpxor %xmm3, \AAD_HASH, \AAD_HASH + + cmp $0, %r10 + jl _partial_incomplete_1_\@ + + # GHASH computation for the last <16 Byte block + \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + xor %eax,%eax + + mov %rax, PBlockLen(arg2) + jmp _dec_done_\@ +_partial_incomplete_1_\@: + add \PLAIN_CYPH_LEN, PBlockLen(arg2) +_dec_done_\@: + vmovdqu \AAD_HASH, AadHash(arg2) +.else + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + + mov \PLAIN_CYPH_LEN, %r10 + add %r13, %r10 + # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling + sub $16, %r10 + # Determine if if partial block is not being filled and + # shift mask accordingly + jge _no_extra_mask_2_\@ + sub %r10, %r12 +_no_extra_mask_2_\@: + + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 + # get the appropriate mask to mask out bottom r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 + + vmovdqa SHUF_MASK(%rip), %xmm1 + vpshufb %xmm1, %xmm9, %xmm9 + vpshufb %xmm2, %xmm9, %xmm9 + vpxor %xmm9, \AAD_HASH, \AAD_HASH + + cmp $0, %r10 + jl _partial_incomplete_2_\@ + + # GHASH computation for the last <16 Byte block + \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + xor %eax,%eax + + mov %rax, PBlockLen(arg2) + jmp _encode_done_\@ +_partial_incomplete_2_\@: + add \PLAIN_CYPH_LEN, PBlockLen(arg2) +_encode_done_\@: + vmovdqu \AAD_HASH, AadHash(arg2) + + vmovdqa SHUF_MASK(%rip), %xmm10 + # shuffle xmm9 back to output as ciphertext + vpshufb %xmm10, %xmm9, %xmm9 + vpshufb %xmm2, %xmm9, %xmm9 +.endif + # output encrypted Bytes + cmp $0, %r10 + jl _partial_fill_\@ + mov %r13, %r12 + mov $16, %r13 + # Set r13 to be the number of bytes to write out + sub %r12, %r13 + jmp _count_set_\@ +_partial_fill_\@: + mov \PLAIN_CYPH_LEN, %r13 +_count_set_\@: + vmovdqa %xmm9, %xmm0 + vmovq %xmm0, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left_\@ + + mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) + add $8, \DATA_OFFSET + psrldq $8, %xmm0 + vmovq %xmm0, %rax + sub $8, %r13 +_less_than_8_bytes_left_\@: + movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) + add $1, \DATA_OFFSET + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left_\@ +_partial_block_done_\@: +.endm # PARTIAL_BLOCK + #ifdef CONFIG_AS_AVX ############################################################################### # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) @@ -856,9 +1006,6 @@ _done_read_partial_block_\@: setreg vmovdqu AadHash(arg2), reg_i - # initialize the data pointer offset as zero - xor %r11d, %r11d - # start AES for num_initial_blocks blocks vmovdqu CurCount(arg2), \CTR @@ -1798,9 +1945,6 @@ ENDPROC(aesni_gcm_dec_avx_gen2) setreg vmovdqu AadHash(arg2), reg_i - # initialize the data pointer offset as zero - xor %r11d, %r11d - # start AES for num_initial_blocks blocks vmovdqu CurCount(arg2), \CTR -- cgit v1.2.3-59-g8ed1b From 603f8c3b0dbbe21fabb7e005f57883b21aaadd82 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:59:59 +0000 Subject: crypto: aesni - Add scatter/gather avx stubs, and use them in C Add the appropriate scatter/gather stubs to the avx asm. In the C code, we can now always use crypt_by_sg, since both sse and asm code now support scatter/gather. Introduce a new struct, aesni_gcm_tfm, that is initialized on startup to point to either the SSE, AVX, or AVX2 versions of the four necessary encryption/decryption routines. GENX_OPTSIZE is still checked at the start of crypt_by_sg. The total size of the data is checked, since the additional overhead is in the init function, calculating additional HashKeys. Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 181 +++++++++------- arch/x86/crypto/aesni-intel_glue.c | 349 +++++++++---------------------- 2 files changed, 198 insertions(+), 332 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index af45fc57db90..91c039ab5699 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -518,14 +518,13 @@ _less_than_8_bytes_left\@: ############################# _multiple_of_16_bytes\@: - GCM_COMPLETE \GHASH_MUL \REP .endm # GCM_COMPLETE Finishes update of tag of last partial block # Output: Authorization Tag (AUTH_TAG) # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 -.macro GCM_COMPLETE GHASH_MUL REP +.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN vmovdqu AadHash(arg2), %xmm14 vmovdqu HashKey(arg2), %xmm13 @@ -560,8 +559,8 @@ _partial_done\@: _return_T\@: - mov arg9, %r10 # r10 = authTag - mov arg10, %r11 # r11 = auth_tag_len + mov \AUTH_TAG, %r10 # r10 = authTag + mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len cmp $16, %r11 je _T_16\@ @@ -680,14 +679,14 @@ _get_AAD_done\@: mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0 - mov arg4, %rax + mov arg3, %rax movdqu (%rax), %xmm0 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv vpshufb SHUF_MASK(%rip), %xmm0, %xmm0 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv - vmovdqu (arg3), %xmm6 # xmm6 = HashKey + vmovdqu (arg4), %xmm6 # xmm6 = HashKey vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey @@ -1776,88 +1775,100 @@ _initial_blocks_done\@: # const u8 *aad, /* Additional Authentication Data (AAD)*/ # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ ############################################################# -ENTRY(aesni_gcm_precomp_avx_gen2) +ENTRY(aesni_gcm_init_avx_gen2) FUNC_SAVE INIT GHASH_MUL_AVX, PRECOMPUTE_AVX FUNC_RESTORE ret -ENDPROC(aesni_gcm_precomp_avx_gen2) +ENDPROC(aesni_gcm_init_avx_gen2) ############################################################################### -#void aesni_gcm_enc_avx_gen2( +#void aesni_gcm_enc_update_avx_gen2( # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ # gcm_context_data *data, # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ # const u8 *in, /* Plaintext input */ -# u64 plaintext_len, /* Length of data in Bytes for encryption. */ -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ -# u8 *auth_tag, /* Authenticated Tag output. */ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ +# u64 plaintext_len) /* Length of data in Bytes for encryption. */ ############################################################################### -ENTRY(aesni_gcm_enc_avx_gen2) +ENTRY(aesni_gcm_enc_update_avx_gen2) FUNC_SAVE mov keysize, %eax cmp $32, %eax - je key_256_enc + je key_256_enc_update cmp $16, %eax - je key_128_enc + je key_128_enc_update # must be 192 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11 FUNC_RESTORE ret -key_128_enc: +key_128_enc_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9 FUNC_RESTORE ret -key_256_enc: +key_256_enc_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 FUNC_RESTORE ret -ENDPROC(aesni_gcm_enc_avx_gen2) +ENDPROC(aesni_gcm_enc_update_avx_gen2) ############################################################################### -#void aesni_gcm_dec_avx_gen2( +#void aesni_gcm_dec_update_avx_gen2( # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ # gcm_context_data *data, # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ # const u8 *in, /* Ciphertext input */ -# u64 plaintext_len, /* Length of data in Bytes for encryption. */ -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ -# u8 *auth_tag, /* Authenticated Tag output. */ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ +# u64 plaintext_len) /* Length of data in Bytes for encryption. */ ############################################################################### -ENTRY(aesni_gcm_dec_avx_gen2) +ENTRY(aesni_gcm_dec_update_avx_gen2) FUNC_SAVE mov keysize,%eax cmp $32, %eax - je key_256_dec + je key_256_dec_update cmp $16, %eax - je key_128_dec + je key_128_dec_update # must be 192 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11 FUNC_RESTORE ret -key_128_dec: +key_128_dec_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9 FUNC_RESTORE ret -key_256_dec: +key_256_dec_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 FUNC_RESTORE ret -ENDPROC(aesni_gcm_dec_avx_gen2) +ENDPROC(aesni_gcm_dec_update_avx_gen2) + +############################################################################### +#void aesni_gcm_finalize_avx_gen2( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, +# u8 *auth_tag, /* Authenticated Tag output. */ +# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +# Valid values are 16 (most likely), 12 or 8. */ +############################################################################### +ENTRY(aesni_gcm_finalize_avx_gen2) + FUNC_SAVE + mov keysize,%eax + cmp $32, %eax + je key_256_finalize + cmp $16, %eax + je key_128_finalize + # must be 192 + GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4 + FUNC_RESTORE + ret +key_128_finalize: + GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4 + FUNC_RESTORE + ret +key_256_finalize: + GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4 + FUNC_RESTORE + ret +ENDPROC(aesni_gcm_finalize_avx_gen2) + #endif /* CONFIG_AS_AVX */ #ifdef CONFIG_AS_AVX2 @@ -2724,24 +2735,23 @@ _initial_blocks_done\@: ############################################################# -#void aesni_gcm_precomp_avx_gen4 +#void aesni_gcm_init_avx_gen4 # (gcm_data *my_ctx_data, # gcm_context_data *data, -# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ # u8 *iv, /* Pre-counter block j0: 4 byte salt # (from Security Association) concatenated with 8 byte # Initialisation Vector (from IPSec ESP Payload) # concatenated with 0x00000001. 16-byte aligned pointer. */ +# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ # const u8 *aad, /* Additional Authentication Data (AAD)*/ # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ ############################################################# -ENTRY(aesni_gcm_precomp_avx_gen4) +ENTRY(aesni_gcm_init_avx_gen4) FUNC_SAVE INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 FUNC_RESTORE ret -ENDPROC(aesni_gcm_precomp_avx_gen4) - +ENDPROC(aesni_gcm_init_avx_gen4) ############################################################################### #void aesni_gcm_enc_avx_gen4( @@ -2749,74 +2759,85 @@ ENDPROC(aesni_gcm_precomp_avx_gen4) # gcm_context_data *data, # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ # const u8 *in, /* Plaintext input */ -# u64 plaintext_len, /* Length of data in Bytes for encryption. */ -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ -# u8 *auth_tag, /* Authenticated Tag output. */ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ +# u64 plaintext_len) /* Length of data in Bytes for encryption. */ ############################################################################### -ENTRY(aesni_gcm_enc_avx_gen4) +ENTRY(aesni_gcm_enc_update_avx_gen4) FUNC_SAVE mov keysize,%eax cmp $32, %eax - je key_256_enc4 + je key_256_enc_update4 cmp $16, %eax - je key_128_enc4 + je key_128_enc_update4 # must be 192 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 FUNC_RESTORE ret -key_128_enc4: +key_128_enc_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9 FUNC_RESTORE ret -key_256_enc4: +key_256_enc_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 FUNC_RESTORE ret -ENDPROC(aesni_gcm_enc_avx_gen4) +ENDPROC(aesni_gcm_enc_update_avx_gen4) ############################################################################### -#void aesni_gcm_dec_avx_gen4( +#void aesni_gcm_dec_update_avx_gen4( # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ # gcm_context_data *data, # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ # const u8 *in, /* Ciphertext input */ -# u64 plaintext_len, /* Length of data in Bytes for encryption. */ -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ -# u8 *auth_tag, /* Authenticated Tag output. */ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ +# u64 plaintext_len) /* Length of data in Bytes for encryption. */ ############################################################################### -ENTRY(aesni_gcm_dec_avx_gen4) +ENTRY(aesni_gcm_dec_update_avx_gen4) FUNC_SAVE mov keysize,%eax cmp $32, %eax - je key_256_dec4 + je key_256_dec_update4 cmp $16, %eax - je key_128_dec4 + je key_128_dec_update4 # must be 192 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 FUNC_RESTORE ret -key_128_dec4: +key_128_dec_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9 FUNC_RESTORE ret -key_256_dec4: +key_256_dec_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 FUNC_RESTORE ret -ENDPROC(aesni_gcm_dec_avx_gen4) +ENDPROC(aesni_gcm_dec_update_avx_gen4) + +############################################################################### +#void aesni_gcm_finalize_avx_gen4( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, +# u8 *auth_tag, /* Authenticated Tag output. */ +# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +# Valid values are 16 (most likely), 12 or 8. */ +############################################################################### +ENTRY(aesni_gcm_finalize_avx_gen4) + FUNC_SAVE + mov keysize,%eax + cmp $32, %eax + je key_256_finalize4 + cmp $16, %eax + je key_128_finalize4 + # must be 192 + GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4 + FUNC_RESTORE + ret +key_128_finalize4: + GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4 + FUNC_RESTORE + ret +key_256_finalize4: + GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4 + FUNC_RESTORE + ret +ENDPROC(aesni_gcm_finalize_avx_gen4) #endif /* CONFIG_AS_AVX2 */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 2648842f1c3f..1321700d6647 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -175,6 +175,32 @@ asmlinkage void aesni_gcm_finalize(void *ctx, struct gcm_context_data *gdata, u8 *auth_tag, unsigned long auth_tag_len); +static struct aesni_gcm_tfm_s { +void (*init)(void *ctx, + struct gcm_context_data *gdata, + u8 *iv, + u8 *hash_subkey, const u8 *aad, + unsigned long aad_len); +void (*enc_update)(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, + unsigned long plaintext_len); +void (*dec_update)(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, + unsigned long ciphertext_len); +void (*finalize)(void *ctx, + struct gcm_context_data *gdata, + u8 *auth_tag, unsigned long auth_tag_len); +} *aesni_gcm_tfm; + +struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = { + .init = &aesni_gcm_init, + .enc_update = &aesni_gcm_enc_update, + .dec_update = &aesni_gcm_dec_update, + .finalize = &aesni_gcm_finalize, +}; + #ifdef CONFIG_AS_AVX asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); @@ -183,16 +209,27 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); /* - * asmlinkage void aesni_gcm_precomp_avx_gen2() + * asmlinkage void aesni_gcm_init_avx_gen2() * gcm_data *my_ctx_data, context data * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. */ -asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, - struct gcm_context_data *gdata, - u8 *hash_subkey, - u8 *iv, - const u8 *aad, - unsigned long aad_len); +asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data, + struct gcm_context_data *gdata, + u8 *iv, + u8 *hash_subkey, + const u8 *aad, + unsigned long aad_len); + +asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, unsigned long plaintext_len); +asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, + unsigned long ciphertext_len); +asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx, + struct gcm_context_data *gdata, + u8 *auth_tag, unsigned long auth_tag_len); asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, struct gcm_context_data *gdata, u8 *out, @@ -206,55 +243,38 @@ asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); -static void aesni_gcm_enc_avx(void *ctx, - struct gcm_context_data *data, u8 *out, - const u8 *in, unsigned long plaintext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len) -{ - if (plaintext_len < AVX_GEN2_OPTSIZE) { - aesni_gcm_enc(ctx, data, out, in, - plaintext_len, iv, hash_subkey, aad, - aad_len, auth_tag, auth_tag_len); - } else { - aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv, - aad, aad_len); - aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv, - aad, aad_len, auth_tag, auth_tag_len); - } -} +struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = { + .init = &aesni_gcm_init_avx_gen2, + .enc_update = &aesni_gcm_enc_update_avx_gen2, + .dec_update = &aesni_gcm_dec_update_avx_gen2, + .finalize = &aesni_gcm_finalize_avx_gen2, +}; -static void aesni_gcm_dec_avx(void *ctx, - struct gcm_context_data *data, u8 *out, - const u8 *in, unsigned long ciphertext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len) -{ - if (ciphertext_len < AVX_GEN2_OPTSIZE) { - aesni_gcm_dec(ctx, data, out, in, - ciphertext_len, iv, hash_subkey, aad, - aad_len, auth_tag, auth_tag_len); - } else { - aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv, - aad, aad_len); - aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv, - aad, aad_len, auth_tag, auth_tag_len); - } -} #endif #ifdef CONFIG_AS_AVX2 /* - * asmlinkage void aesni_gcm_precomp_avx_gen4() + * asmlinkage void aesni_gcm_init_avx_gen4() * gcm_data *my_ctx_data, context data * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. */ -asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, - struct gcm_context_data *gdata, - u8 *hash_subkey, - u8 *iv, - const u8 *aad, - unsigned long aad_len); +asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data, + struct gcm_context_data *gdata, + u8 *iv, + u8 *hash_subkey, + const u8 *aad, + unsigned long aad_len); + +asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, unsigned long plaintext_len); +asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, + unsigned long ciphertext_len); +asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx, + struct gcm_context_data *gdata, + u8 *auth_tag, unsigned long auth_tag_len); asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, struct gcm_context_data *gdata, u8 *out, @@ -268,67 +288,15 @@ asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); -static void aesni_gcm_enc_avx2(void *ctx, - struct gcm_context_data *data, u8 *out, - const u8 *in, unsigned long plaintext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len) -{ - if (plaintext_len < AVX_GEN2_OPTSIZE) { - aesni_gcm_enc(ctx, data, out, in, - plaintext_len, iv, hash_subkey, aad, - aad_len, auth_tag, auth_tag_len); - } else if (plaintext_len < AVX_GEN4_OPTSIZE) { - aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv, - aad, aad_len); - aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv, - aad, aad_len, auth_tag, auth_tag_len); - } else { - aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey, iv, - aad, aad_len); - aesni_gcm_enc_avx_gen4(ctx, data, out, in, plaintext_len, iv, - aad, aad_len, auth_tag, auth_tag_len); - } -} +struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = { + .init = &aesni_gcm_init_avx_gen4, + .enc_update = &aesni_gcm_enc_update_avx_gen4, + .dec_update = &aesni_gcm_dec_update_avx_gen4, + .finalize = &aesni_gcm_finalize_avx_gen4, +}; -static void aesni_gcm_dec_avx2(void *ctx, - struct gcm_context_data *data, u8 *out, - const u8 *in, unsigned long ciphertext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len) -{ - if (ciphertext_len < AVX_GEN2_OPTSIZE) { - aesni_gcm_dec(ctx, data, out, in, - ciphertext_len, iv, hash_subkey, - aad, aad_len, auth_tag, auth_tag_len); - } else if (ciphertext_len < AVX_GEN4_OPTSIZE) { - aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv, - aad, aad_len); - aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv, - aad, aad_len, auth_tag, auth_tag_len); - } else { - aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey, iv, - aad, aad_len); - aesni_gcm_dec_avx_gen4(ctx, data, out, in, ciphertext_len, iv, - aad, aad_len, auth_tag, auth_tag_len); - } -} #endif -static void (*aesni_gcm_enc_tfm)(void *ctx, - struct gcm_context_data *data, u8 *out, - const u8 *in, unsigned long plaintext_len, - u8 *iv, u8 *hash_subkey, const u8 *aad, - unsigned long aad_len, u8 *auth_tag, - unsigned long auth_tag_len); - -static void (*aesni_gcm_dec_tfm)(void *ctx, - struct gcm_context_data *data, u8 *out, - const u8 *in, unsigned long ciphertext_len, - u8 *iv, u8 *hash_subkey, const u8 *aad, - unsigned long aad_len, u8 *auth_tag, - unsigned long auth_tag_len); - static inline struct aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) { @@ -810,6 +778,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, { struct crypto_aead *tfm = crypto_aead_reqtfm(req); unsigned long auth_tag_len = crypto_aead_authsize(tfm); + struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm; struct gcm_context_data data AESNI_ALIGN_ATTR; struct scatter_walk dst_sg_walk = {}; unsigned long left = req->cryptlen; @@ -827,6 +796,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, if (!enc) left -= auth_tag_len; +#ifdef CONFIG_AS_AVX2 + if (left < AVX_GEN4_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen4) + gcm_tfm = &aesni_gcm_tfm_avx_gen2; +#endif +#ifdef CONFIG_AS_AVX + if (left < AVX_GEN2_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen2) + gcm_tfm = &aesni_gcm_tfm_sse; +#endif + /* Linearize assoc, if not already linear */ if (req->src->length >= assoclen && req->src->length && (!PageHighMem(sg_page(req->src)) || @@ -851,7 +829,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, } kernel_fpu_begin(); - aesni_gcm_init(aes_ctx, &data, iv, + gcm_tfm->init(aes_ctx, &data, iv, hash_subkey, assoc, assoclen); if (req->src != req->dst) { while (left) { @@ -862,10 +840,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, len = min(srclen, dstlen); if (len) { if (enc) - aesni_gcm_enc_update(aes_ctx, &data, + gcm_tfm->enc_update(aes_ctx, &data, dst, src, len); else - aesni_gcm_dec_update(aes_ctx, &data, + gcm_tfm->dec_update(aes_ctx, &data, dst, src, len); } left -= len; @@ -883,10 +861,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, len = scatterwalk_clamp(&src_sg_walk, left); if (len) { if (enc) - aesni_gcm_enc_update(aes_ctx, &data, + gcm_tfm->enc_update(aes_ctx, &data, src, src, len); else - aesni_gcm_dec_update(aes_ctx, &data, + gcm_tfm->dec_update(aes_ctx, &data, src, src, len); } left -= len; @@ -895,7 +873,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, scatterwalk_done(&src_sg_walk, 1, left); } } - aesni_gcm_finalize(aes_ctx, &data, authTag, auth_tag_len); + gcm_tfm->finalize(aes_ctx, &data, authTag, auth_tag_len); kernel_fpu_end(); if (!assocmem) @@ -928,145 +906,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, u8 *hash_subkey, u8 *iv, void *aes_ctx) { - u8 one_entry_in_sg = 0; - u8 *src, *dst, *assoc; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - unsigned long auth_tag_len = crypto_aead_authsize(tfm); - struct scatter_walk src_sg_walk; - struct scatter_walk dst_sg_walk = {}; - struct gcm_context_data data AESNI_ALIGN_ATTR; - - if (aesni_gcm_enc_tfm == aesni_gcm_enc || - req->cryptlen < AVX_GEN2_OPTSIZE) { - return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, - aes_ctx); - } - if (sg_is_last(req->src) && - (!PageHighMem(sg_page(req->src)) || - req->src->offset + req->src->length <= PAGE_SIZE) && - sg_is_last(req->dst) && - (!PageHighMem(sg_page(req->dst)) || - req->dst->offset + req->dst->length <= PAGE_SIZE)) { - one_entry_in_sg = 1; - scatterwalk_start(&src_sg_walk, req->src); - assoc = scatterwalk_map(&src_sg_walk); - src = assoc + req->assoclen; - dst = src; - if (unlikely(req->src != req->dst)) { - scatterwalk_start(&dst_sg_walk, req->dst); - dst = scatterwalk_map(&dst_sg_walk) + req->assoclen; - } - } else { - /* Allocate memory for src, dst, assoc */ - assoc = kmalloc(req->cryptlen + auth_tag_len + req->assoclen, - GFP_ATOMIC); - if (unlikely(!assoc)) - return -ENOMEM; - scatterwalk_map_and_copy(assoc, req->src, 0, - req->assoclen + req->cryptlen, 0); - src = assoc + req->assoclen; - dst = src; - } - - kernel_fpu_begin(); - aesni_gcm_enc_tfm(aes_ctx, &data, dst, src, req->cryptlen, iv, - hash_subkey, assoc, assoclen, - dst + req->cryptlen, auth_tag_len); - kernel_fpu_end(); - - /* The authTag (aka the Integrity Check Value) needs to be written - * back to the packet. */ - if (one_entry_in_sg) { - if (unlikely(req->src != req->dst)) { - scatterwalk_unmap(dst - req->assoclen); - scatterwalk_advance(&dst_sg_walk, req->dst->length); - scatterwalk_done(&dst_sg_walk, 1, 0); - } - scatterwalk_unmap(assoc); - scatterwalk_advance(&src_sg_walk, req->src->length); - scatterwalk_done(&src_sg_walk, req->src == req->dst, 0); - } else { - scatterwalk_map_and_copy(dst, req->dst, req->assoclen, - req->cryptlen + auth_tag_len, 1); - kfree(assoc); - } - return 0; + return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, + aes_ctx); } static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, u8 *hash_subkey, u8 *iv, void *aes_ctx) { - u8 one_entry_in_sg = 0; - u8 *src, *dst, *assoc; - unsigned long tempCipherLen = 0; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 authTag[16]; - struct scatter_walk src_sg_walk; - struct scatter_walk dst_sg_walk = {}; - struct gcm_context_data data AESNI_ALIGN_ATTR; - int retval = 0; - - if (aesni_gcm_enc_tfm == aesni_gcm_enc || - req->cryptlen < AVX_GEN2_OPTSIZE) { - return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, - aes_ctx); - } - tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len); - - if (sg_is_last(req->src) && - (!PageHighMem(sg_page(req->src)) || - req->src->offset + req->src->length <= PAGE_SIZE) && - sg_is_last(req->dst) && req->dst->length && - (!PageHighMem(sg_page(req->dst)) || - req->dst->offset + req->dst->length <= PAGE_SIZE)) { - one_entry_in_sg = 1; - scatterwalk_start(&src_sg_walk, req->src); - assoc = scatterwalk_map(&src_sg_walk); - src = assoc + req->assoclen; - dst = src; - if (unlikely(req->src != req->dst)) { - scatterwalk_start(&dst_sg_walk, req->dst); - dst = scatterwalk_map(&dst_sg_walk) + req->assoclen; - } - } else { - /* Allocate memory for src, dst, assoc */ - assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); - if (!assoc) - return -ENOMEM; - scatterwalk_map_and_copy(assoc, req->src, 0, - req->assoclen + req->cryptlen, 0); - src = assoc + req->assoclen; - dst = src; - } - - - kernel_fpu_begin(); - aesni_gcm_dec_tfm(aes_ctx, &data, dst, src, tempCipherLen, iv, - hash_subkey, assoc, assoclen, - authTag, auth_tag_len); - kernel_fpu_end(); - - /* Compare generated tag with passed in tag. */ - retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ? - -EBADMSG : 0; - - if (one_entry_in_sg) { - if (unlikely(req->src != req->dst)) { - scatterwalk_unmap(dst - req->assoclen); - scatterwalk_advance(&dst_sg_walk, req->dst->length); - scatterwalk_done(&dst_sg_walk, 1, 0); - } - scatterwalk_unmap(assoc); - scatterwalk_advance(&src_sg_walk, req->src->length); - scatterwalk_done(&src_sg_walk, req->src == req->dst, 0); - } else { - scatterwalk_map_and_copy(dst, req->dst, req->assoclen, - tempCipherLen, 1); - kfree(assoc); - } - return retval; - + return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, + aes_ctx); } static int helper_rfc4106_encrypt(struct aead_request *req) @@ -1434,21 +1282,18 @@ static int __init aesni_init(void) #ifdef CONFIG_AS_AVX2 if (boot_cpu_has(X86_FEATURE_AVX2)) { pr_info("AVX2 version of gcm_enc/dec engaged.\n"); - aesni_gcm_enc_tfm = aesni_gcm_enc_avx2; - aesni_gcm_dec_tfm = aesni_gcm_dec_avx2; + aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4; } else #endif #ifdef CONFIG_AS_AVX if (boot_cpu_has(X86_FEATURE_AVX)) { pr_info("AVX version of gcm_enc/dec engaged.\n"); - aesni_gcm_enc_tfm = aesni_gcm_enc_avx; - aesni_gcm_dec_tfm = aesni_gcm_dec_avx; + aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen2; } else #endif { pr_info("SSE version of gcm_enc/dec engaged.\n"); - aesni_gcm_enc_tfm = aesni_gcm_enc; - aesni_gcm_dec_tfm = aesni_gcm_dec; + aesni_gcm_tfm = &aesni_gcm_tfm_sse; } aesni_ctr_enc_tfm = aesni_ctr_enc; #ifdef CONFIG_AS_AVX -- cgit v1.2.3-59-g8ed1b From 9d880c5945c748d8edcac30965f3349a602158c4 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 10 Dec 2018 16:49:29 -0700 Subject: crypto: ux500 - Use proper enum in cryp_set_dma_transfer Clang warns when one enumerated type is implicitly converted to another: drivers/crypto/ux500/cryp/cryp_core.c:559:5: warning: implicit conversion from enumeration type 'enum dma_data_direction' to different enumeration type 'enum dma_transfer_direction' [-Wenum-conversion] direction, DMA_CTRL_ACK); ^~~~~~~~~ drivers/crypto/ux500/cryp/cryp_core.c:583:5: warning: implicit conversion from enumeration type 'enum dma_data_direction' to different enumeration type 'enum dma_transfer_direction' [-Wenum-conversion] direction, ^~~~~~~~~ 2 warnings generated. dmaengine_prep_slave_sg expects an enum from dma_transfer_direction. Because we know the value of the dma_data_direction enum from the switch statement, we can just use the proper value from dma_transfer_direction so there is no more conversion. DMA_TO_DEVICE = DMA_MEM_TO_DEV = 1 DMA_FROM_DEVICE = DMA_DEV_TO_MEM = 2 Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Signed-off-by: Herbert Xu --- drivers/crypto/ux500/cryp/cryp_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/ux500/cryp/cryp_core.c b/drivers/crypto/ux500/cryp/cryp_core.c index d2663a4e1f5e..a92a66b1ff46 100644 --- a/drivers/crypto/ux500/cryp/cryp_core.c +++ b/drivers/crypto/ux500/cryp/cryp_core.c @@ -556,7 +556,7 @@ static int cryp_set_dma_transfer(struct cryp_ctx *ctx, desc = dmaengine_prep_slave_sg(channel, ctx->device->dma.sg_src, ctx->device->dma.sg_src_len, - direction, DMA_CTRL_ACK); + DMA_MEM_TO_DEV, DMA_CTRL_ACK); break; case DMA_FROM_DEVICE: @@ -580,7 +580,7 @@ static int cryp_set_dma_transfer(struct cryp_ctx *ctx, desc = dmaengine_prep_slave_sg(channel, ctx->device->dma.sg_dst, ctx->device->dma.sg_dst_len, - direction, + DMA_DEV_TO_MEM, DMA_CTRL_ACK | DMA_PREP_INTERRUPT); -- cgit v1.2.3-59-g8ed1b From 5ac93f808338f4dd465402e91869702eb87db241 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 10 Dec 2018 16:49:54 -0700 Subject: crypto: ux500 - Use proper enum in hash_set_dma_transfer Clang warns when one enumerated type is implicitly converted to another: drivers/crypto/ux500/hash/hash_core.c:169:4: warning: implicit conversion from enumeration type 'enum dma_data_direction' to different enumeration type 'enum dma_transfer_direction' [-Wenum-conversion] direction, DMA_CTRL_ACK | DMA_PREP_INTERRUPT); ^~~~~~~~~ 1 warning generated. dmaengine_prep_slave_sg expects an enum from dma_transfer_direction. We know that the only direction supported by this function is DMA_TO_DEVICE because of the check at the top of this function so we can just use the equivalent value from dma_transfer_direction. DMA_TO_DEVICE = DMA_MEM_TO_DEV = 1 Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Signed-off-by: Herbert Xu --- drivers/crypto/ux500/hash/hash_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/ux500/hash/hash_core.c b/drivers/crypto/ux500/hash/hash_core.c index 633321a8dd03..a0bb8a6eec3f 100644 --- a/drivers/crypto/ux500/hash/hash_core.c +++ b/drivers/crypto/ux500/hash/hash_core.c @@ -166,7 +166,7 @@ static int hash_set_dma_transfer(struct hash_ctx *ctx, struct scatterlist *sg, __func__); desc = dmaengine_prep_slave_sg(channel, ctx->device->dma.sg, ctx->device->dma.sg_len, - direction, DMA_CTRL_ACK | DMA_PREP_INTERRUPT); + DMA_MEM_TO_DEV, DMA_CTRL_ACK | DMA_PREP_INTERRUPT); if (!desc) { dev_err(ctx->device->dev, "%s: dmaengine_prep_slave_sg() failed!\n", __func__); -- cgit v1.2.3-59-g8ed1b From 3cc04c160208ec55940db652343d236515d88af5 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 11 Dec 2018 08:11:59 +0000 Subject: crypto: chelsio - remove set but not used variable 'kctx_len' Fixes gcc '-Wunused-but-set-variable' warning: drivers/crypto/chelsio/chcr_ipsec.c: In function 'chcr_ipsec_xmit': drivers/crypto/chelsio/chcr_ipsec.c:674:33: warning: variable 'kctx_len' set but not used [-Wunused-but-set-variable] unsigned int flits = 0, ndesc, kctx_len; It not used since commit 8362ea16f69f ("crypto: chcr - ESN for Inline IPSec Tx") Signed-off-by: YueHaibing Signed-off-by: Herbert Xu --- drivers/crypto/chelsio/chcr_ipsec.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_ipsec.c b/drivers/crypto/chelsio/chcr_ipsec.c index 9321d2b1a5ab..82ca33126786 100644 --- a/drivers/crypto/chelsio/chcr_ipsec.c +++ b/drivers/crypto/chelsio/chcr_ipsec.c @@ -671,7 +671,7 @@ int chcr_ipsec_xmit(struct sk_buff *skb, struct net_device *dev) struct ipsec_sa_entry *sa_entry; u64 *pos, *end, *before, *sgl; int qidx, left, credits; - unsigned int flits = 0, ndesc, kctx_len; + unsigned int flits = 0, ndesc; struct adapter *adap; struct sge_eth_txq *q; struct port_info *pi; @@ -682,7 +682,6 @@ int chcr_ipsec_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_BUSY; sa_entry = (struct ipsec_sa_entry *)x->xso.offload_handle; - kctx_len = sa_entry->kctx_len; if (skb->sp->len != 1) { out_free: dev_kfree_skb_any(skb); -- cgit v1.2.3-59-g8ed1b From 1f479e4cfd08f20e48dfde07b27e3180e0901252 Mon Sep 17 00:00:00 2001 From: Harsh Jain Date: Tue, 11 Dec 2018 16:21:37 +0530 Subject: crypto: chelsio - Swap location of AAD and IV sent in WR Send input as IV | AAD | Data. It will allow sending IV as Immediate Data and Creates space in Work request to add more dma mapped entries. Signed-off-by: Harsh Jain Signed-off-by: Herbert Xu --- drivers/crypto/chelsio/chcr_algo.c | 212 +++++++++++++++++------------------ drivers/crypto/chelsio/chcr_algo.h | 2 +- drivers/crypto/chelsio/chcr_crypto.h | 10 +- 3 files changed, 104 insertions(+), 120 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index 497c57803de7..2b26735fd122 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -2215,10 +2215,7 @@ static int chcr_aead_common_init(struct aead_request *req) error = -ENOMEM; goto err; } - reqctx->aad_nents = sg_nents_xlen(req->src, req->assoclen, - CHCR_SRC_SG_SIZE, 0); - reqctx->src_nents = sg_nents_xlen(req->src, req->cryptlen, - CHCR_SRC_SG_SIZE, req->assoclen); + return 0; err: return error; @@ -2268,10 +2265,10 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req, struct ulptx_sgl *ulptx; unsigned int transhdr_len; unsigned int dst_size = 0, temp, subtype = get_aead_subtype(tfm); - unsigned int kctx_len = 0, dnents; - unsigned int assoclen = req->assoclen; + unsigned int kctx_len = 0, dnents, snents; unsigned int authsize = crypto_aead_authsize(tfm); int error = -EINVAL; + u8 *ivptr; int null = 0; gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL : GFP_ATOMIC; @@ -2288,24 +2285,20 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req, if (subtype == CRYPTO_ALG_SUB_TYPE_CBC_NULL || subtype == CRYPTO_ALG_SUB_TYPE_CTR_NULL) { null = 1; - assoclen = 0; - reqctx->aad_nents = 0; } - dnents = sg_nents_xlen(req->dst, assoclen, CHCR_DST_SG_SIZE, 0); - dnents += sg_nents_xlen(req->dst, req->cryptlen + - (reqctx->op ? -authsize : authsize), CHCR_DST_SG_SIZE, - req->assoclen); + dnents = sg_nents_xlen(req->dst, req->assoclen + req->cryptlen + + (reqctx->op ? -authsize : authsize), CHCR_DST_SG_SIZE, 0); dnents += MIN_AUTH_SG; // For IV - + snents = sg_nents_xlen(req->src, req->assoclen + req->cryptlen, + CHCR_SRC_SG_SIZE, 0); dst_size = get_space_for_phys_dsgl(dnents); kctx_len = (ntohl(KEY_CONTEXT_CTX_LEN_V(aeadctx->key_ctx_hdr)) << 4) - sizeof(chcr_req->key_ctx); transhdr_len = CIPHER_TRANSHDR_SIZE(kctx_len, dst_size); - reqctx->imm = (transhdr_len + assoclen + IV + req->cryptlen) < + reqctx->imm = (transhdr_len + req->assoclen + req->cryptlen) < SGE_MAX_WR_LEN; - temp = reqctx->imm ? roundup(assoclen + IV + req->cryptlen, 16) - : (sgl_len(reqctx->src_nents + reqctx->aad_nents - + MIN_GCM_SG) * 8); + temp = reqctx->imm ? roundup(req->assoclen + req->cryptlen, 16) + : (sgl_len(snents) * 8); transhdr_len += temp; transhdr_len = roundup(transhdr_len, 16); @@ -2315,7 +2308,7 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req, chcr_aead_common_exit(req); return ERR_PTR(chcr_aead_fallback(req, reqctx->op)); } - skb = alloc_skb(SGE_MAX_WR_LEN, flags); + skb = alloc_skb(transhdr_len, flags); if (!skb) { error = -ENOMEM; goto err; @@ -2331,16 +2324,16 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req, * to the hardware spec */ chcr_req->sec_cpl.op_ivinsrtofst = - FILL_SEC_CPL_OP_IVINSR(a_ctx(tfm)->dev->rx_channel_id, 2, - assoclen + 1); - chcr_req->sec_cpl.pldlen = htonl(assoclen + IV + req->cryptlen); + FILL_SEC_CPL_OP_IVINSR(a_ctx(tfm)->dev->rx_channel_id, 2, 1); + chcr_req->sec_cpl.pldlen = htonl(req->assoclen + IV + req->cryptlen); chcr_req->sec_cpl.aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI( - assoclen ? 1 : 0, assoclen, - assoclen + IV + 1, + null ? 0 : 1 + IV, + null ? 0 : IV + req->assoclen, + req->assoclen + IV + 1, (temp & 0x1F0) >> 4); chcr_req->sec_cpl.cipherstop_lo_authinsert = FILL_SEC_CPL_AUTHINSERT( temp & 0xF, - null ? 0 : assoclen + IV + 1, + null ? 0 : req->assoclen + IV + 1, temp, temp); if (subtype == CRYPTO_ALG_SUB_TYPE_CTR_NULL || subtype == CRYPTO_ALG_SUB_TYPE_CTR_SHA) @@ -2367,23 +2360,24 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req, memcpy(chcr_req->key_ctx.key + roundup(aeadctx->enckey_len, 16), actx->h_iopad, kctx_len - roundup(aeadctx->enckey_len, 16)); + phys_cpl = (struct cpl_rx_phys_dsgl *)((u8 *)(chcr_req + 1) + kctx_len); + ivptr = (u8 *)(phys_cpl + 1) + dst_size; + ulptx = (struct ulptx_sgl *)(ivptr + IV); if (subtype == CRYPTO_ALG_SUB_TYPE_CTR_SHA || subtype == CRYPTO_ALG_SUB_TYPE_CTR_NULL) { - memcpy(reqctx->iv, aeadctx->nonce, CTR_RFC3686_NONCE_SIZE); - memcpy(reqctx->iv + CTR_RFC3686_NONCE_SIZE, req->iv, + memcpy(ivptr, aeadctx->nonce, CTR_RFC3686_NONCE_SIZE); + memcpy(ivptr + CTR_RFC3686_NONCE_SIZE, req->iv, CTR_RFC3686_IV_SIZE); - *(__be32 *)(reqctx->iv + CTR_RFC3686_NONCE_SIZE + + *(__be32 *)(ivptr + CTR_RFC3686_NONCE_SIZE + CTR_RFC3686_IV_SIZE) = cpu_to_be32(1); } else { - memcpy(reqctx->iv, req->iv, IV); + memcpy(ivptr, req->iv, IV); } - phys_cpl = (struct cpl_rx_phys_dsgl *)((u8 *)(chcr_req + 1) + kctx_len); - ulptx = (struct ulptx_sgl *)((u8 *)(phys_cpl + 1) + dst_size); - chcr_add_aead_dst_ent(req, phys_cpl, assoclen, qid); - chcr_add_aead_src_ent(req, ulptx, assoclen); + chcr_add_aead_dst_ent(req, phys_cpl, qid); + chcr_add_aead_src_ent(req, ulptx); atomic_inc(&adap->chcr_stats.cipher_rqst); - temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size + - kctx_len + (reqctx->imm ? (assoclen + IV + req->cryptlen) : 0); + temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size + IV + + kctx_len + (reqctx->imm ? (req->assoclen + req->cryptlen) : 0); create_wreq(a_ctx(tfm), chcr_req, &req->base, reqctx->imm, size, transhdr_len, temp, 0); reqctx->skb = skb; @@ -2470,8 +2464,7 @@ void chcr_aead_dma_unmap(struct device *dev, } void chcr_add_aead_src_ent(struct aead_request *req, - struct ulptx_sgl *ulptx, - unsigned int assoclen) + struct ulptx_sgl *ulptx) { struct ulptx_walk ulp_walk; struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); @@ -2484,28 +2477,20 @@ void chcr_add_aead_src_ent(struct aead_request *req, buf += reqctx->b0_len; } sg_pcopy_to_buffer(req->src, sg_nents(req->src), - buf, assoclen, 0); - buf += assoclen; - memcpy(buf, reqctx->iv, IV); - buf += IV; - sg_pcopy_to_buffer(req->src, sg_nents(req->src), - buf, req->cryptlen, req->assoclen); + buf, req->cryptlen + req->assoclen, 0); } else { ulptx_walk_init(&ulp_walk, ulptx); if (reqctx->b0_len) ulptx_walk_add_page(&ulp_walk, reqctx->b0_len, &reqctx->b0_dma); - ulptx_walk_add_sg(&ulp_walk, req->src, assoclen, 0); - ulptx_walk_add_page(&ulp_walk, IV, &reqctx->iv_dma); - ulptx_walk_add_sg(&ulp_walk, req->src, req->cryptlen, - req->assoclen); + ulptx_walk_add_sg(&ulp_walk, req->src, req->cryptlen + + req->assoclen, 0); ulptx_walk_end(&ulp_walk); } } void chcr_add_aead_dst_ent(struct aead_request *req, struct cpl_rx_phys_dsgl *phys_cpl, - unsigned int assoclen, unsigned short qid) { struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); @@ -2516,12 +2501,10 @@ void chcr_add_aead_dst_ent(struct aead_request *req, u32 temp; dsgl_walk_init(&dsgl_walk, phys_cpl); - if (reqctx->b0_len) - dsgl_walk_add_page(&dsgl_walk, reqctx->b0_len, &reqctx->b0_dma); - dsgl_walk_add_sg(&dsgl_walk, req->dst, assoclen, 0); - dsgl_walk_add_page(&dsgl_walk, IV, &reqctx->iv_dma); - temp = req->cryptlen + (reqctx->op ? -authsize : authsize); - dsgl_walk_add_sg(&dsgl_walk, req->dst, temp, req->assoclen); + dsgl_walk_add_page(&dsgl_walk, IV + reqctx->b0_len, &reqctx->iv_dma); + temp = req->assoclen + req->cryptlen + + (reqctx->op ? -authsize : authsize); + dsgl_walk_add_sg(&dsgl_walk, req->dst, temp, 0); dsgl_walk_end(&dsgl_walk, qid, ctx->pci_chan_id); } @@ -2689,8 +2672,7 @@ static int set_msg_len(u8 *block, unsigned int msglen, int csize) return 0; } -static void generate_b0(struct aead_request *req, - struct chcr_aead_ctx *aeadctx, +static void generate_b0(struct aead_request *req, u8 *ivptr, unsigned short op_type) { unsigned int l, lp, m; @@ -2701,7 +2683,7 @@ static void generate_b0(struct aead_request *req, m = crypto_aead_authsize(aead); - memcpy(b0, reqctx->iv, 16); + memcpy(b0, ivptr, 16); lp = b0[0]; l = lp + 1; @@ -2727,29 +2709,31 @@ static inline int crypto_ccm_check_iv(const u8 *iv) } static int ccm_format_packet(struct aead_request *req, - struct chcr_aead_ctx *aeadctx, + u8 *ivptr, unsigned int sub_type, unsigned short op_type, unsigned int assoclen) { struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct chcr_aead_ctx *aeadctx = AEAD_CTX(a_ctx(tfm)); int rc = 0; if (sub_type == CRYPTO_ALG_SUB_TYPE_AEAD_RFC4309) { - reqctx->iv[0] = 3; - memcpy(reqctx->iv + 1, &aeadctx->salt[0], 3); - memcpy(reqctx->iv + 4, req->iv, 8); - memset(reqctx->iv + 12, 0, 4); + ivptr[0] = 3; + memcpy(ivptr + 1, &aeadctx->salt[0], 3); + memcpy(ivptr + 4, req->iv, 8); + memset(ivptr + 12, 0, 4); } else { - memcpy(reqctx->iv, req->iv, 16); + memcpy(ivptr, req->iv, 16); } if (assoclen) *((unsigned short *)(reqctx->scratch_pad + 16)) = htons(assoclen); - generate_b0(req, aeadctx, op_type); + generate_b0(req, ivptr, op_type); /* zero the ctr value */ - memset(reqctx->iv + 15 - reqctx->iv[0], 0, reqctx->iv[0] + 1); + memset(ivptr + 15 - ivptr[0], 0, ivptr[0] + 1); return rc; } @@ -2775,7 +2759,7 @@ static void fill_sec_cpl_for_aead(struct cpl_tx_sec_pdu *sec_cpl, ((assoclen) ? CCM_AAD_FIELD_SIZE : 0); auth_offset = req->cryptlen ? - (assoclen + IV + 1 + ccm_xtra) : 0; + (req->assoclen + IV + 1 + ccm_xtra) : 0; if (op_type == CHCR_DECRYPT_OP) { if (crypto_aead_authsize(tfm) != req->cryptlen) tag_offset = crypto_aead_authsize(tfm); @@ -2785,13 +2769,13 @@ static void fill_sec_cpl_for_aead(struct cpl_tx_sec_pdu *sec_cpl, sec_cpl->op_ivinsrtofst = FILL_SEC_CPL_OP_IVINSR(c_id, - 2, assoclen + 1 + ccm_xtra); + 2, 1); sec_cpl->pldlen = - htonl(assoclen + IV + req->cryptlen + ccm_xtra); + htonl(req->assoclen + IV + req->cryptlen + ccm_xtra); /* For CCM there wil be b0 always. So AAD start will be 1 always */ sec_cpl->aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI( - 1, assoclen + ccm_xtra, assoclen - + IV + 1 + ccm_xtra, 0); + 1 + IV, IV + assoclen + ccm_xtra, + req->assoclen + IV + 1 + ccm_xtra, 0); sec_cpl->cipherstop_lo_authinsert = FILL_SEC_CPL_AUTHINSERT(0, auth_offset, tag_offset, @@ -2838,10 +2822,11 @@ static struct sk_buff *create_aead_ccm_wr(struct aead_request *req, struct cpl_rx_phys_dsgl *phys_cpl; struct ulptx_sgl *ulptx; unsigned int transhdr_len; - unsigned int dst_size = 0, kctx_len, dnents, temp; + unsigned int dst_size = 0, kctx_len, dnents, temp, snents; unsigned int sub_type, assoclen = req->assoclen; unsigned int authsize = crypto_aead_authsize(tfm); int error = -EINVAL; + u8 *ivptr; gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL : GFP_ATOMIC; struct adapter *adap = padap(a_ctx(tfm)->dev); @@ -2857,37 +2842,38 @@ static struct sk_buff *create_aead_ccm_wr(struct aead_request *req, error = aead_ccm_validate_input(reqctx->op, req, aeadctx, sub_type); if (error) goto err; - dnents = sg_nents_xlen(req->dst, assoclen, CHCR_DST_SG_SIZE, 0); - dnents += sg_nents_xlen(req->dst, req->cryptlen + dnents = sg_nents_xlen(req->dst, req->assoclen + req->cryptlen + (reqctx->op ? -authsize : authsize), - CHCR_DST_SG_SIZE, req->assoclen); + CHCR_DST_SG_SIZE, 0); dnents += MIN_CCM_SG; // For IV and B0 dst_size = get_space_for_phys_dsgl(dnents); + snents = sg_nents_xlen(req->src, req->assoclen + req->cryptlen, + CHCR_SRC_SG_SIZE, 0); + snents += MIN_CCM_SG; //For B0 kctx_len = roundup(aeadctx->enckey_len, 16) * 2; transhdr_len = CIPHER_TRANSHDR_SIZE(kctx_len, dst_size); - reqctx->imm = (transhdr_len + assoclen + IV + req->cryptlen + + reqctx->imm = (transhdr_len + req->assoclen + req->cryptlen + reqctx->b0_len) <= SGE_MAX_WR_LEN; - temp = reqctx->imm ? roundup(assoclen + IV + req->cryptlen + + temp = reqctx->imm ? roundup(req->assoclen + req->cryptlen + reqctx->b0_len, 16) : - (sgl_len(reqctx->src_nents + reqctx->aad_nents + - MIN_CCM_SG) * 8); + (sgl_len(snents) * 8); transhdr_len += temp; transhdr_len = roundup(transhdr_len, 16); if (chcr_aead_need_fallback(req, dnents, T6_MAX_AAD_SIZE - - reqctx->b0_len, transhdr_len, reqctx->op)) { + reqctx->b0_len, transhdr_len, reqctx->op)) { atomic_inc(&adap->chcr_stats.fallback); chcr_aead_common_exit(req); return ERR_PTR(chcr_aead_fallback(req, reqctx->op)); } - skb = alloc_skb(SGE_MAX_WR_LEN, flags); + skb = alloc_skb(transhdr_len, flags); if (!skb) { error = -ENOMEM; goto err; } - chcr_req = (struct chcr_wr *) __skb_put_zero(skb, transhdr_len); + chcr_req = __skb_put_zero(skb, transhdr_len); fill_sec_cpl_for_aead(&chcr_req->sec_cpl, dst_size, req, reqctx->op); @@ -2897,16 +2883,17 @@ static struct sk_buff *create_aead_ccm_wr(struct aead_request *req, aeadctx->key, aeadctx->enckey_len); phys_cpl = (struct cpl_rx_phys_dsgl *)((u8 *)(chcr_req + 1) + kctx_len); - ulptx = (struct ulptx_sgl *)((u8 *)(phys_cpl + 1) + dst_size); - error = ccm_format_packet(req, aeadctx, sub_type, reqctx->op, assoclen); + ivptr = (u8 *)(phys_cpl + 1) + dst_size; + ulptx = (struct ulptx_sgl *)(ivptr + IV); + error = ccm_format_packet(req, ivptr, sub_type, reqctx->op, assoclen); if (error) goto dstmap_fail; - chcr_add_aead_dst_ent(req, phys_cpl, assoclen, qid); - chcr_add_aead_src_ent(req, ulptx, assoclen); + chcr_add_aead_dst_ent(req, phys_cpl, qid); + chcr_add_aead_src_ent(req, ulptx); atomic_inc(&adap->chcr_stats.aead_rqst); - temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size + - kctx_len + (reqctx->imm ? (assoclen + IV + req->cryptlen + + temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size + IV + + kctx_len + (reqctx->imm ? (req->assoclen + req->cryptlen + reqctx->b0_len) : 0); create_wreq(a_ctx(tfm), chcr_req, &req->base, reqctx->imm, 0, transhdr_len, temp, 0); @@ -2931,10 +2918,11 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req, struct chcr_wr *chcr_req; struct cpl_rx_phys_dsgl *phys_cpl; struct ulptx_sgl *ulptx; - unsigned int transhdr_len, dnents = 0; + unsigned int transhdr_len, dnents = 0, snents; unsigned int dst_size = 0, temp = 0, kctx_len, assoclen = req->assoclen; unsigned int authsize = crypto_aead_authsize(tfm); int error = -EINVAL; + u8 *ivptr; gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL : GFP_ATOMIC; struct adapter *adap = padap(a_ctx(tfm)->dev); @@ -2946,19 +2934,19 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req, error = chcr_aead_common_init(req); if (error) return ERR_PTR(error); - dnents = sg_nents_xlen(req->dst, assoclen, CHCR_DST_SG_SIZE, 0); - dnents += sg_nents_xlen(req->dst, req->cryptlen + + dnents = sg_nents_xlen(req->dst, req->assoclen + req->cryptlen + (reqctx->op ? -authsize : authsize), - CHCR_DST_SG_SIZE, req->assoclen); + CHCR_DST_SG_SIZE, 0); + snents = sg_nents_xlen(req->src, req->assoclen + req->cryptlen, + CHCR_SRC_SG_SIZE, 0); dnents += MIN_GCM_SG; // For IV dst_size = get_space_for_phys_dsgl(dnents); kctx_len = roundup(aeadctx->enckey_len, 16) + AEAD_H_SIZE; transhdr_len = CIPHER_TRANSHDR_SIZE(kctx_len, dst_size); - reqctx->imm = (transhdr_len + assoclen + IV + req->cryptlen) <= + reqctx->imm = (transhdr_len + req->assoclen + req->cryptlen) <= SGE_MAX_WR_LEN; - temp = reqctx->imm ? roundup(assoclen + IV + req->cryptlen, 16) : - (sgl_len(reqctx->src_nents + - reqctx->aad_nents + MIN_GCM_SG) * 8); + temp = reqctx->imm ? roundup(req->assoclen + req->cryptlen, 16) : + (sgl_len(snents) * 8); transhdr_len += temp; transhdr_len = roundup(transhdr_len, 16); if (chcr_aead_need_fallback(req, dnents, T6_MAX_AAD_SIZE, @@ -2968,7 +2956,7 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req, chcr_aead_common_exit(req); return ERR_PTR(chcr_aead_fallback(req, reqctx->op)); } - skb = alloc_skb(SGE_MAX_WR_LEN, flags); + skb = alloc_skb(transhdr_len, flags); if (!skb) { error = -ENOMEM; goto err; @@ -2979,15 +2967,15 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req, //Offset of tag from end temp = (reqctx->op == CHCR_ENCRYPT_OP) ? 0 : authsize; chcr_req->sec_cpl.op_ivinsrtofst = FILL_SEC_CPL_OP_IVINSR( - a_ctx(tfm)->dev->rx_channel_id, 2, - (assoclen + 1)); + a_ctx(tfm)->dev->rx_channel_id, 2, 1); chcr_req->sec_cpl.pldlen = - htonl(assoclen + IV + req->cryptlen); + htonl(req->assoclen + IV + req->cryptlen); chcr_req->sec_cpl.aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI( - assoclen ? 1 : 0, assoclen, - assoclen + IV + 1, 0); + assoclen ? 1 + IV : 0, + assoclen ? IV + assoclen : 0, + req->assoclen + IV + 1, 0); chcr_req->sec_cpl.cipherstop_lo_authinsert = - FILL_SEC_CPL_AUTHINSERT(0, assoclen + IV + 1, + FILL_SEC_CPL_AUTHINSERT(0, req->assoclen + IV + 1, temp, temp); chcr_req->sec_cpl.seqno_numivs = FILL_SEC_CPL_SCMD0_SEQNO(reqctx->op, (reqctx->op == @@ -3002,25 +2990,26 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req, memcpy(chcr_req->key_ctx.key + roundup(aeadctx->enckey_len, 16), GCM_CTX(aeadctx)->ghash_h, AEAD_H_SIZE); + phys_cpl = (struct cpl_rx_phys_dsgl *)((u8 *)(chcr_req + 1) + kctx_len); + ivptr = (u8 *)(phys_cpl + 1) + dst_size; /* prepare a 16 byte iv */ /* S A L T | IV | 0x00000001 */ if (get_aead_subtype(tfm) == CRYPTO_ALG_SUB_TYPE_AEAD_RFC4106) { - memcpy(reqctx->iv, aeadctx->salt, 4); - memcpy(reqctx->iv + 4, req->iv, GCM_RFC4106_IV_SIZE); + memcpy(ivptr, aeadctx->salt, 4); + memcpy(ivptr + 4, req->iv, GCM_RFC4106_IV_SIZE); } else { - memcpy(reqctx->iv, req->iv, GCM_AES_IV_SIZE); + memcpy(ivptr, req->iv, GCM_AES_IV_SIZE); } - *((unsigned int *)(reqctx->iv + 12)) = htonl(0x01); + *((unsigned int *)(ivptr + 12)) = htonl(0x01); - phys_cpl = (struct cpl_rx_phys_dsgl *)((u8 *)(chcr_req + 1) + kctx_len); - ulptx = (struct ulptx_sgl *)((u8 *)(phys_cpl + 1) + dst_size); + ulptx = (struct ulptx_sgl *)(ivptr + 16); - chcr_add_aead_dst_ent(req, phys_cpl, assoclen, qid); - chcr_add_aead_src_ent(req, ulptx, assoclen); + chcr_add_aead_dst_ent(req, phys_cpl, qid); + chcr_add_aead_src_ent(req, ulptx); atomic_inc(&adap->chcr_stats.aead_rqst); - temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size + - kctx_len + (reqctx->imm ? (assoclen + IV + req->cryptlen) : 0); + temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size + IV + + kctx_len + (reqctx->imm ? (req->assoclen + req->cryptlen) : 0); create_wreq(a_ctx(tfm), chcr_req, &req->base, reqctx->imm, size, transhdr_len, temp, reqctx->verify); reqctx->skb = skb; @@ -4178,7 +4167,6 @@ static struct chcr_alg_template driver_algs[] = { .setauthsize = chcr_authenc_null_setauthsize, } }, - }; /* diff --git a/drivers/crypto/chelsio/chcr_algo.h b/drivers/crypto/chelsio/chcr_algo.h index 1871500309e2..ee20dd899e83 100644 --- a/drivers/crypto/chelsio/chcr_algo.h +++ b/drivers/crypto/chelsio/chcr_algo.h @@ -262,7 +262,7 @@ #define MIN_AUTH_SG 1 /* IV */ #define MIN_GCM_SG 1 /* IV */ #define MIN_DIGEST_SG 1 /*Partial Buffer*/ -#define MIN_CCM_SG 2 /*IV+B0*/ +#define MIN_CCM_SG 1 /*IV+B0*/ #define CIP_SPACE_LEFT(len) \ ((SGE_MAX_WR_LEN - CIP_WR_MIN_LEN - (len))) #define HASH_SPACE_LEFT(len) \ diff --git a/drivers/crypto/chelsio/chcr_crypto.h b/drivers/crypto/chelsio/chcr_crypto.h index d37ef41f9ebe..655606f2e4d0 100644 --- a/drivers/crypto/chelsio/chcr_crypto.h +++ b/drivers/crypto/chelsio/chcr_crypto.h @@ -41,7 +41,8 @@ #define CCM_B0_SIZE 16 #define CCM_AAD_FIELD_SIZE 2 -#define T6_MAX_AAD_SIZE 511 +// 511 - 16(For IV) +#define T6_MAX_AAD_SIZE 495 /* Define following if h/w is not dropping the AAD and IV data before @@ -185,9 +186,6 @@ struct chcr_aead_reqctx { dma_addr_t b0_dma; unsigned int b0_len; unsigned int op; - short int aad_nents; - short int src_nents; - short int dst_nents; u16 imm; u16 verify; u8 iv[CHCR_MAX_CRYPTO_IV_LEN + MAX_SCRATCH_PAD_SIZE]; @@ -322,10 +320,8 @@ void chcr_aead_dma_unmap(struct device *dev, struct aead_request *req, unsigned short op_type); void chcr_add_aead_dst_ent(struct aead_request *req, struct cpl_rx_phys_dsgl *phys_cpl, - unsigned int assoclen, unsigned short qid); -void chcr_add_aead_src_ent(struct aead_request *req, struct ulptx_sgl *ulptx, - unsigned int assoclen); +void chcr_add_aead_src_ent(struct aead_request *req, struct ulptx_sgl *ulptx); void chcr_add_cipher_src_ent(struct ablkcipher_request *req, void *ulptx, struct cipher_wr_param *wrparam); -- cgit v1.2.3-59-g8ed1b From d5a4dfbdaf54cbd845755a5415cff57688bb983c Mon Sep 17 00:00:00 2001 From: Harsh Jain Date: Tue, 11 Dec 2018 16:21:38 +0530 Subject: crypto: chelsio - Use same value for both channel in single WR Use tx_channel_id instead of rx_channel_id. Signed-off-by: Harsh Jain Signed-off-by: Herbert Xu --- drivers/crypto/chelsio/chcr_algo.c | 13 ++++++------- drivers/crypto/chelsio/chcr_core.h | 1 - 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index 2b26735fd122..ee985ad69039 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -717,7 +717,7 @@ static inline void create_wreq(struct chcr_context *ctx, htonl(FW_CRYPTO_LOOKASIDE_WR_LEN16_V(DIV_ROUND_UP(len16, 16))); chcr_req->wreq.cookie = cpu_to_be64((uintptr_t)req); chcr_req->wreq.rx_chid_to_rx_q_id = - FILL_WR_RX_Q_ID(ctx->dev->rx_channel_id, qid, + FILL_WR_RX_Q_ID(ctx->tx_chan_id, qid, !!lcb, ctx->tx_qidx); chcr_req->ulptx.cmd_dest = FILL_ULPTX_CMD_DEST(ctx->tx_chan_id, @@ -773,7 +773,7 @@ static struct sk_buff *create_cipher_wr(struct cipher_wr_param *wrparam) } chcr_req = __skb_put_zero(skb, transhdr_len); chcr_req->sec_cpl.op_ivinsrtofst = - FILL_SEC_CPL_OP_IVINSR(c_ctx(tfm)->dev->rx_channel_id, 2, 1); + FILL_SEC_CPL_OP_IVINSR(c_ctx(tfm)->tx_chan_id, 2, 1); chcr_req->sec_cpl.pldlen = htonl(IV + wrparam->bytes); chcr_req->sec_cpl.aadstart_cipherstop_hi = @@ -1344,7 +1344,6 @@ static int chcr_device_init(struct chcr_context *ctx) spin_lock(&ctx->dev->lock_chcr_dev); ctx->tx_chan_id = ctx->dev->tx_channel_id; ctx->dev->tx_channel_id = !ctx->dev->tx_channel_id; - ctx->dev->rx_channel_id = 0; spin_unlock(&ctx->dev->lock_chcr_dev); rxq_idx = ctx->tx_chan_id * rxq_perchan; rxq_idx += id % rxq_perchan; @@ -1498,7 +1497,7 @@ static struct sk_buff *create_hash_wr(struct ahash_request *req, chcr_req = __skb_put_zero(skb, transhdr_len); chcr_req->sec_cpl.op_ivinsrtofst = - FILL_SEC_CPL_OP_IVINSR(h_ctx(tfm)->dev->rx_channel_id, 2, 0); + FILL_SEC_CPL_OP_IVINSR(h_ctx(tfm)->tx_chan_id, 2, 0); chcr_req->sec_cpl.pldlen = htonl(param->bfr_len + param->sg_len); chcr_req->sec_cpl.aadstart_cipherstop_hi = @@ -2324,7 +2323,7 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req, * to the hardware spec */ chcr_req->sec_cpl.op_ivinsrtofst = - FILL_SEC_CPL_OP_IVINSR(a_ctx(tfm)->dev->rx_channel_id, 2, 1); + FILL_SEC_CPL_OP_IVINSR(a_ctx(tfm)->tx_chan_id, 2, 1); chcr_req->sec_cpl.pldlen = htonl(req->assoclen + IV + req->cryptlen); chcr_req->sec_cpl.aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI( null ? 0 : 1 + IV, @@ -2746,7 +2745,7 @@ static void fill_sec_cpl_for_aead(struct cpl_tx_sec_pdu *sec_cpl, struct chcr_aead_ctx *aeadctx = AEAD_CTX(a_ctx(tfm)); unsigned int cipher_mode = CHCR_SCMD_CIPHER_MODE_AES_CCM; unsigned int mac_mode = CHCR_SCMD_AUTH_MODE_CBCMAC; - unsigned int c_id = a_ctx(tfm)->dev->rx_channel_id; + unsigned int c_id = a_ctx(tfm)->tx_chan_id; unsigned int ccm_xtra; unsigned char tag_offset = 0, auth_offset = 0; unsigned int assoclen; @@ -2967,7 +2966,7 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req, //Offset of tag from end temp = (reqctx->op == CHCR_ENCRYPT_OP) ? 0 : authsize; chcr_req->sec_cpl.op_ivinsrtofst = FILL_SEC_CPL_OP_IVINSR( - a_ctx(tfm)->dev->rx_channel_id, 2, 1); + a_ctx(tfm)->tx_chan_id, 2, 1); chcr_req->sec_cpl.pldlen = htonl(req->assoclen + IV + req->cryptlen); chcr_req->sec_cpl.aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI( diff --git a/drivers/crypto/chelsio/chcr_core.h b/drivers/crypto/chelsio/chcr_core.h index 4616663e25d7..a50a24f39607 100644 --- a/drivers/crypto/chelsio/chcr_core.h +++ b/drivers/crypto/chelsio/chcr_core.h @@ -133,7 +133,6 @@ struct chcr_dev { spinlock_t lock_chcr_dev; struct uld_ctx *u_ctx; unsigned char tx_channel_id; - unsigned char rx_channel_id; }; struct uld_ctx { -- cgit v1.2.3-59-g8ed1b From c4f6d44d774eff382b6fc79a9fe1ff376b5ac6d7 Mon Sep 17 00:00:00 2001 From: Harsh Jain Date: Tue, 11 Dec 2018 16:21:39 +0530 Subject: crypto: chelsio - cleanup:send addr as value in function argument Send dma address as value to function arguments instead of pointer. Signed-off-by: Harsh Jain Signed-off-by: Herbert Xu --- drivers/crypto/chelsio/chcr_algo.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index ee985ad69039..f94364b097dc 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -391,7 +391,7 @@ static inline void dsgl_walk_end(struct dsgl_walk *walk, unsigned short qid, static inline void dsgl_walk_add_page(struct dsgl_walk *walk, size_t size, - dma_addr_t *addr) + dma_addr_t addr) { int j; @@ -399,7 +399,7 @@ static inline void dsgl_walk_add_page(struct dsgl_walk *walk, return; j = walk->nents; walk->to->len[j % 8] = htons(size); - walk->to->addr[j % 8] = cpu_to_be64(*addr); + walk->to->addr[j % 8] = cpu_to_be64(addr); j++; if ((j % 8) == 0) walk->to++; @@ -473,16 +473,16 @@ static inline void ulptx_walk_end(struct ulptx_walk *walk) static inline void ulptx_walk_add_page(struct ulptx_walk *walk, size_t size, - dma_addr_t *addr) + dma_addr_t addr) { if (!size) return; if (walk->nents == 0) { walk->sgl->len0 = cpu_to_be32(size); - walk->sgl->addr0 = cpu_to_be64(*addr); + walk->sgl->addr0 = cpu_to_be64(addr); } else { - walk->pair->addr[walk->pair_idx] = cpu_to_be64(*addr); + walk->pair->addr[walk->pair_idx] = cpu_to_be64(addr); walk->pair->len[walk->pair_idx] = cpu_to_be32(size); walk->pair_idx = !walk->pair_idx; if (!walk->pair_idx) @@ -2481,7 +2481,7 @@ void chcr_add_aead_src_ent(struct aead_request *req, ulptx_walk_init(&ulp_walk, ulptx); if (reqctx->b0_len) ulptx_walk_add_page(&ulp_walk, reqctx->b0_len, - &reqctx->b0_dma); + reqctx->b0_dma); ulptx_walk_add_sg(&ulp_walk, req->src, req->cryptlen + req->assoclen, 0); ulptx_walk_end(&ulp_walk); @@ -2500,7 +2500,7 @@ void chcr_add_aead_dst_ent(struct aead_request *req, u32 temp; dsgl_walk_init(&dsgl_walk, phys_cpl); - dsgl_walk_add_page(&dsgl_walk, IV + reqctx->b0_len, &reqctx->iv_dma); + dsgl_walk_add_page(&dsgl_walk, IV + reqctx->b0_len, reqctx->iv_dma); temp = req->assoclen + req->cryptlen + (reqctx->op ? -authsize : authsize); dsgl_walk_add_sg(&dsgl_walk, req->dst, temp, 0); @@ -2571,7 +2571,7 @@ void chcr_add_hash_src_ent(struct ahash_request *req, ulptx_walk_init(&ulp_walk, ulptx); if (param->bfr_len) ulptx_walk_add_page(&ulp_walk, param->bfr_len, - &reqctx->hctx_wr.dma_addr); + reqctx->hctx_wr.dma_addr); ulptx_walk_add_sg(&ulp_walk, reqctx->hctx_wr.srcsg, param->sg_len, reqctx->hctx_wr.src_ofst); reqctx->hctx_wr.srcsg = ulp_walk.last_sg; -- cgit v1.2.3-59-g8ed1b From fef4912b66d6246d958d97382d20d0dd23bcf0bc Mon Sep 17 00:00:00 2001 From: Harsh Jain Date: Tue, 11 Dec 2018 16:21:40 +0530 Subject: crypto: chelsio - Handle PCI shutdown event chcr receives "CXGB4_STATE_DETACH" event on PCI Shutdown. Wait for processing of inflight request and Mark the device unavailable. Signed-off-by: Harsh Jain Signed-off-by: Herbert Xu --- drivers/crypto/chelsio/chcr_algo.c | 157 ++++++++++++++++++++++++++------ drivers/crypto/chelsio/chcr_core.c | 180 ++++++++++++++++++++++++------------- drivers/crypto/chelsio/chcr_core.h | 34 +++++-- 3 files changed, 278 insertions(+), 93 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index f94364b097dc..df526414f03f 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -123,7 +123,7 @@ static inline struct chcr_authenc_ctx *AUTHENC_CTX(struct chcr_aead_ctx *gctx) static inline struct uld_ctx *ULD_CTX(struct chcr_context *ctx) { - return ctx->dev->u_ctx; + return container_of(ctx->dev, struct uld_ctx, dev); } static inline int is_ofld_imm(const struct sk_buff *skb) @@ -198,17 +198,40 @@ void chcr_verify_tag(struct aead_request *req, u8 *input, int *err) *err = 0; } +static int chcr_inc_wrcount(struct chcr_dev *dev) +{ + int err = 0; + + spin_lock_bh(&dev->lock_chcr_dev); + if (dev->state == CHCR_DETACH) + err = 1; + else + atomic_inc(&dev->inflight); + + spin_unlock_bh(&dev->lock_chcr_dev); + + return err; +} + +static inline void chcr_dec_wrcount(struct chcr_dev *dev) +{ + atomic_dec(&dev->inflight); +} + static inline void chcr_handle_aead_resp(struct aead_request *req, unsigned char *input, int err) { struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct chcr_dev *dev = a_ctx(tfm)->dev; chcr_aead_common_exit(req); if (reqctx->verify == VERIFY_SW) { chcr_verify_tag(req, input, &err); reqctx->verify = VERIFY_HW; } + chcr_dec_wrcount(dev); req->base.complete(&req->base, err); } @@ -1100,6 +1123,7 @@ static int chcr_handle_cipher_resp(struct ablkcipher_request *req, struct cpl_fw6_pld *fw6_pld = (struct cpl_fw6_pld *)input; struct chcr_blkcipher_req_ctx *reqctx = ablkcipher_request_ctx(req); struct cipher_wr_param wrparam; + struct chcr_dev *dev = c_ctx(tfm)->dev; int bytes; if (err) @@ -1161,6 +1185,7 @@ static int chcr_handle_cipher_resp(struct ablkcipher_request *req, unmap: chcr_cipher_dma_unmap(&ULD_CTX(c_ctx(tfm))->lldi.pdev->dev, req); complete: + chcr_dec_wrcount(dev); req->base.complete(&req->base, err); return err; } @@ -1187,7 +1212,10 @@ static int process_cipher(struct ablkcipher_request *req, ablkctx->enckey_len, req->nbytes, ivsize); goto error; } - chcr_cipher_dma_map(&ULD_CTX(c_ctx(tfm))->lldi.pdev->dev, req); + + err = chcr_cipher_dma_map(&ULD_CTX(c_ctx(tfm))->lldi.pdev->dev, req); + if (err) + goto error; if (req->nbytes < (SGE_MAX_WR_LEN - (sizeof(struct chcr_wr) + AES_MIN_KEY_SIZE + sizeof(struct cpl_rx_phys_dsgl) + @@ -1276,15 +1304,21 @@ error: static int chcr_aes_encrypt(struct ablkcipher_request *req) { struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct chcr_dev *dev = c_ctx(tfm)->dev; struct sk_buff *skb = NULL; int err, isfull = 0; struct uld_ctx *u_ctx = ULD_CTX(c_ctx(tfm)); + err = chcr_inc_wrcount(dev); + if (err) + return -ENXIO; if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0], c_ctx(tfm)->tx_qidx))) { isfull = 1; - if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) - return -ENOSPC; + if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) { + err = -ENOSPC; + goto error; + } } err = process_cipher(req, u_ctx->lldi.rxq_ids[c_ctx(tfm)->rx_qidx], @@ -1295,15 +1329,23 @@ static int chcr_aes_encrypt(struct ablkcipher_request *req) set_wr_txq(skb, CPL_PRIORITY_DATA, c_ctx(tfm)->tx_qidx); chcr_send_wr(skb); return isfull ? -EBUSY : -EINPROGRESS; +error: + chcr_dec_wrcount(dev); + return err; } static int chcr_aes_decrypt(struct ablkcipher_request *req) { struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); struct uld_ctx *u_ctx = ULD_CTX(c_ctx(tfm)); + struct chcr_dev *dev = c_ctx(tfm)->dev; struct sk_buff *skb = NULL; int err, isfull = 0; + err = chcr_inc_wrcount(dev); + if (err) + return -ENXIO; + if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0], c_ctx(tfm)->tx_qidx))) { isfull = 1; @@ -1333,10 +1375,11 @@ static int chcr_device_init(struct chcr_context *ctx) if (!ctx->dev) { u_ctx = assign_chcr_device(); if (!u_ctx) { + err = -ENXIO; pr_err("chcr device assignment fails\n"); goto out; } - ctx->dev = u_ctx->dev; + ctx->dev = &u_ctx->dev; adap = padap(ctx->dev); ntxq = u_ctx->lldi.ntxq; rxq_perchan = u_ctx->lldi.nrxq / u_ctx->lldi.nchan; @@ -1561,6 +1604,7 @@ static int chcr_ahash_update(struct ahash_request *req) struct chcr_ahash_req_ctx *req_ctx = ahash_request_ctx(req); struct crypto_ahash *rtfm = crypto_ahash_reqtfm(req); struct uld_ctx *u_ctx = NULL; + struct chcr_dev *dev = h_ctx(rtfm)->dev; struct sk_buff *skb; u8 remainder = 0, bs; unsigned int nbytes = req->nbytes; @@ -1569,12 +1613,6 @@ static int chcr_ahash_update(struct ahash_request *req) bs = crypto_tfm_alg_blocksize(crypto_ahash_tfm(rtfm)); u_ctx = ULD_CTX(h_ctx(rtfm)); - if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0], - h_ctx(rtfm)->tx_qidx))) { - isfull = 1; - if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) - return -ENOSPC; - } if (nbytes + req_ctx->reqlen >= bs) { remainder = (nbytes + req_ctx->reqlen) % bs; @@ -1585,10 +1623,27 @@ static int chcr_ahash_update(struct ahash_request *req) req_ctx->reqlen += nbytes; return 0; } + error = chcr_inc_wrcount(dev); + if (error) + return -ENXIO; + /* Detach state for CHCR means lldi or padap is freed. Increasing + * inflight count for dev guarantees that lldi and padap is valid + */ + if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0], + h_ctx(rtfm)->tx_qidx))) { + isfull = 1; + if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) { + error = -ENOSPC; + goto err; + } + } + chcr_init_hctx_per_wr(req_ctx); error = chcr_hash_dma_map(&u_ctx->lldi.pdev->dev, req); - if (error) - return -ENOMEM; + if (error) { + error = -ENOMEM; + goto err; + } get_alg_config(¶ms.alg_prm, crypto_ahash_digestsize(rtfm)); params.kctx_len = roundup(params.alg_prm.result_size, 16); params.sg_len = chcr_hash_ent_in_wr(req->src, !!req_ctx->reqlen, @@ -1628,6 +1683,8 @@ static int chcr_ahash_update(struct ahash_request *req) return isfull ? -EBUSY : -EINPROGRESS; unmap: chcr_hash_dma_unmap(&u_ctx->lldi.pdev->dev, req); +err: + chcr_dec_wrcount(dev); return error; } @@ -1645,10 +1702,16 @@ static int chcr_ahash_final(struct ahash_request *req) { struct chcr_ahash_req_ctx *req_ctx = ahash_request_ctx(req); struct crypto_ahash *rtfm = crypto_ahash_reqtfm(req); + struct chcr_dev *dev = h_ctx(rtfm)->dev; struct hash_wr_param params; struct sk_buff *skb; struct uld_ctx *u_ctx = NULL; u8 bs = crypto_tfm_alg_blocksize(crypto_ahash_tfm(rtfm)); + int error = -EINVAL; + + error = chcr_inc_wrcount(dev); + if (error) + return -ENXIO; chcr_init_hctx_per_wr(req_ctx); u_ctx = ULD_CTX(h_ctx(rtfm)); @@ -1685,19 +1748,25 @@ static int chcr_ahash_final(struct ahash_request *req) } params.hash_size = crypto_ahash_digestsize(rtfm); skb = create_hash_wr(req, ¶ms); - if (IS_ERR(skb)) - return PTR_ERR(skb); + if (IS_ERR(skb)) { + error = PTR_ERR(skb); + goto err; + } req_ctx->reqlen = 0; skb->dev = u_ctx->lldi.ports[0]; set_wr_txq(skb, CPL_PRIORITY_DATA, h_ctx(rtfm)->tx_qidx); chcr_send_wr(skb); return -EINPROGRESS; +err: + chcr_dec_wrcount(dev); + return error; } static int chcr_ahash_finup(struct ahash_request *req) { struct chcr_ahash_req_ctx *req_ctx = ahash_request_ctx(req); struct crypto_ahash *rtfm = crypto_ahash_reqtfm(req); + struct chcr_dev *dev = h_ctx(rtfm)->dev; struct uld_ctx *u_ctx = NULL; struct sk_buff *skb; struct hash_wr_param params; @@ -1706,17 +1775,24 @@ static int chcr_ahash_finup(struct ahash_request *req) bs = crypto_tfm_alg_blocksize(crypto_ahash_tfm(rtfm)); u_ctx = ULD_CTX(h_ctx(rtfm)); + error = chcr_inc_wrcount(dev); + if (error) + return -ENXIO; if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0], h_ctx(rtfm)->tx_qidx))) { isfull = 1; - if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) - return -ENOSPC; + if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) { + error = -ENOSPC; + goto err; + } } chcr_init_hctx_per_wr(req_ctx); error = chcr_hash_dma_map(&u_ctx->lldi.pdev->dev, req); - if (error) - return -ENOMEM; + if (error) { + error = -ENOMEM; + goto err; + } get_alg_config(¶ms.alg_prm, crypto_ahash_digestsize(rtfm)); params.kctx_len = roundup(params.alg_prm.result_size, 16); @@ -1773,6 +1849,8 @@ static int chcr_ahash_finup(struct ahash_request *req) return isfull ? -EBUSY : -EINPROGRESS; unmap: chcr_hash_dma_unmap(&u_ctx->lldi.pdev->dev, req); +err: + chcr_dec_wrcount(dev); return error; } @@ -1780,6 +1858,7 @@ static int chcr_ahash_digest(struct ahash_request *req) { struct chcr_ahash_req_ctx *req_ctx = ahash_request_ctx(req); struct crypto_ahash *rtfm = crypto_ahash_reqtfm(req); + struct chcr_dev *dev = h_ctx(rtfm)->dev; struct uld_ctx *u_ctx = NULL; struct sk_buff *skb; struct hash_wr_param params; @@ -1788,19 +1867,26 @@ static int chcr_ahash_digest(struct ahash_request *req) rtfm->init(req); bs = crypto_tfm_alg_blocksize(crypto_ahash_tfm(rtfm)); + error = chcr_inc_wrcount(dev); + if (error) + return -ENXIO; u_ctx = ULD_CTX(h_ctx(rtfm)); if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0], h_ctx(rtfm)->tx_qidx))) { isfull = 1; - if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) - return -ENOSPC; + if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) { + error = -ENOSPC; + goto err; + } } chcr_init_hctx_per_wr(req_ctx); error = chcr_hash_dma_map(&u_ctx->lldi.pdev->dev, req); - if (error) - return -ENOMEM; + if (error) { + error = -ENOMEM; + goto err; + } get_alg_config(¶ms.alg_prm, crypto_ahash_digestsize(rtfm)); params.kctx_len = roundup(params.alg_prm.result_size, 16); @@ -1853,6 +1939,8 @@ static int chcr_ahash_digest(struct ahash_request *req) return isfull ? -EBUSY : -EINPROGRESS; unmap: chcr_hash_dma_unmap(&u_ctx->lldi.pdev->dev, req); +err: + chcr_dec_wrcount(dev); return error; } @@ -1924,6 +2012,7 @@ static inline void chcr_handle_ahash_resp(struct ahash_request *req, int digestsize, updated_digestsize; struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct uld_ctx *u_ctx = ULD_CTX(h_ctx(tfm)); + struct chcr_dev *dev = h_ctx(tfm)->dev; if (input == NULL) goto out; @@ -1966,6 +2055,7 @@ unmap: out: + chcr_dec_wrcount(dev); req->base.complete(&req->base, err); } @@ -3553,27 +3643,42 @@ static int chcr_aead_op(struct aead_request *req, create_wr_t create_wr_fn) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); struct uld_ctx *u_ctx; struct sk_buff *skb; int isfull = 0; + struct chcr_dev *cdev; - if (!a_ctx(tfm)->dev) { + cdev = a_ctx(tfm)->dev; + if (!cdev) { pr_err("chcr : %s : No crypto device.\n", __func__); return -ENXIO; } + + if (chcr_inc_wrcount(cdev)) { + /* Detach state for CHCR means lldi or padap is freed. + * We cannot increment fallback here. + */ + return chcr_aead_fallback(req, reqctx->op); + } + u_ctx = ULD_CTX(a_ctx(tfm)); if (cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0], a_ctx(tfm)->tx_qidx)) { isfull = 1; - if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) + if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) { + chcr_dec_wrcount(cdev); return -ENOSPC; + } } /* Form a WR from req */ skb = create_wr_fn(req, u_ctx->lldi.rxq_ids[a_ctx(tfm)->rx_qidx], size); - if (IS_ERR(skb) || !skb) + if (IS_ERR(skb) || !skb) { + chcr_dec_wrcount(cdev); return PTR_ERR(skb); + } skb->dev = u_ctx->lldi.ports[0]; set_wr_txq(skb, CPL_PRIORITY_DATA, a_ctx(tfm)->tx_qidx); diff --git a/drivers/crypto/chelsio/chcr_core.c b/drivers/crypto/chelsio/chcr_core.c index 2c472e3c6aeb..f71a97939418 100644 --- a/drivers/crypto/chelsio/chcr_core.c +++ b/drivers/crypto/chelsio/chcr_core.c @@ -26,10 +26,7 @@ #include "chcr_core.h" #include "cxgb4_uld.h" -static LIST_HEAD(uld_ctx_list); -static DEFINE_MUTEX(dev_mutex); -static atomic_t dev_count; -static struct uld_ctx *ctx_rr; +static struct chcr_driver_data drv_data; typedef int (*chcr_handler_func)(struct chcr_dev *dev, unsigned char *input); static int cpl_fw6_pld_handler(struct chcr_dev *dev, unsigned char *input); @@ -53,6 +50,29 @@ static struct cxgb4_uld_info chcr_uld_info = { #endif /* CONFIG_CHELSIO_IPSEC_INLINE */ }; +static void detach_work_fn(struct work_struct *work) +{ + struct chcr_dev *dev; + + dev = container_of(work, struct chcr_dev, detach_work.work); + + if (atomic_read(&dev->inflight)) { + dev->wqretry--; + if (dev->wqretry) { + pr_debug("Request Inflight Count %d\n", + atomic_read(&dev->inflight)); + + schedule_delayed_work(&dev->detach_work, WQ_DETACH_TM); + } else { + WARN(1, "CHCR:%d request Still Pending\n", + atomic_read(&dev->inflight)); + complete(&dev->detach_comp); + } + } else { + complete(&dev->detach_comp); + } +} + struct uld_ctx *assign_chcr_device(void) { struct uld_ctx *u_ctx = NULL; @@ -63,56 +83,70 @@ struct uld_ctx *assign_chcr_device(void) * Although One session must use the same device to * maintain request-response ordering. */ - mutex_lock(&dev_mutex); - if (!list_empty(&uld_ctx_list)) { - u_ctx = ctx_rr; - if (list_is_last(&ctx_rr->entry, &uld_ctx_list)) - ctx_rr = list_first_entry(&uld_ctx_list, - struct uld_ctx, - entry); + mutex_lock(&drv_data.drv_mutex); + if (!list_empty(&drv_data.act_dev)) { + u_ctx = drv_data.last_dev; + if (list_is_last(&drv_data.last_dev->entry, &drv_data.act_dev)) + drv_data.last_dev = list_first_entry(&drv_data.act_dev, + struct uld_ctx, entry); else - ctx_rr = list_next_entry(ctx_rr, entry); + drv_data.last_dev = + list_next_entry(drv_data.last_dev, entry); } - mutex_unlock(&dev_mutex); + mutex_unlock(&drv_data.drv_mutex); return u_ctx; } -static int chcr_dev_add(struct uld_ctx *u_ctx) +static void chcr_dev_add(struct uld_ctx *u_ctx) { struct chcr_dev *dev; - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return -ENXIO; + dev = &u_ctx->dev; + dev->state = CHCR_ATTACH; + atomic_set(&dev->inflight, 0); + mutex_lock(&drv_data.drv_mutex); + list_move(&u_ctx->entry, &drv_data.act_dev); + if (!drv_data.last_dev) + drv_data.last_dev = u_ctx; + mutex_unlock(&drv_data.drv_mutex); +} +static void chcr_dev_init(struct uld_ctx *u_ctx) +{ + struct chcr_dev *dev; + + dev = &u_ctx->dev; spin_lock_init(&dev->lock_chcr_dev); - u_ctx->dev = dev; - dev->u_ctx = u_ctx; - atomic_inc(&dev_count); - mutex_lock(&dev_mutex); - list_add_tail(&u_ctx->entry, &uld_ctx_list); - if (!ctx_rr) - ctx_rr = u_ctx; - mutex_unlock(&dev_mutex); - return 0; + INIT_DELAYED_WORK(&dev->detach_work, detach_work_fn); + init_completion(&dev->detach_comp); + dev->state = CHCR_INIT; + dev->wqretry = WQ_RETRY; + atomic_inc(&drv_data.dev_count); + atomic_set(&dev->inflight, 0); + mutex_lock(&drv_data.drv_mutex); + list_add_tail(&u_ctx->entry, &drv_data.inact_dev); + if (!drv_data.last_dev) + drv_data.last_dev = u_ctx; + mutex_unlock(&drv_data.drv_mutex); } -static int chcr_dev_remove(struct uld_ctx *u_ctx) +static int chcr_dev_move(struct uld_ctx *u_ctx) { - if (ctx_rr == u_ctx) { - if (list_is_last(&ctx_rr->entry, &uld_ctx_list)) - ctx_rr = list_first_entry(&uld_ctx_list, - struct uld_ctx, - entry); + mutex_lock(&drv_data.drv_mutex); + if (drv_data.last_dev == u_ctx) { + if (list_is_last(&drv_data.last_dev->entry, &drv_data.act_dev)) + drv_data.last_dev = list_first_entry(&drv_data.act_dev, + struct uld_ctx, entry); else - ctx_rr = list_next_entry(ctx_rr, entry); + drv_data.last_dev = + list_next_entry(drv_data.last_dev, entry); } - list_del(&u_ctx->entry); - if (list_empty(&uld_ctx_list)) - ctx_rr = NULL; - kfree(u_ctx->dev); - u_ctx->dev = NULL; - atomic_dec(&dev_count); + list_move(&u_ctx->entry, &drv_data.inact_dev); + if (list_empty(&drv_data.act_dev)) + drv_data.last_dev = NULL; + atomic_dec(&drv_data.dev_count); + mutex_unlock(&drv_data.drv_mutex); + return 0; } @@ -167,6 +201,7 @@ static void *chcr_uld_add(const struct cxgb4_lld_info *lld) goto out; } u_ctx->lldi = *lld; + chcr_dev_init(u_ctx); #ifdef CONFIG_CHELSIO_IPSEC_INLINE if (lld->crypto & ULP_CRYPTO_IPSEC_INLINE) chcr_add_xfrmops(lld); @@ -179,7 +214,7 @@ int chcr_uld_rx_handler(void *handle, const __be64 *rsp, const struct pkt_gl *pgl) { struct uld_ctx *u_ctx = (struct uld_ctx *)handle; - struct chcr_dev *dev = u_ctx->dev; + struct chcr_dev *dev = &u_ctx->dev; const struct cpl_fw6_pld *rpl = (struct cpl_fw6_pld *)rsp; if (rpl->opcode != CPL_FW6_PLD) { @@ -201,6 +236,28 @@ int chcr_uld_tx_handler(struct sk_buff *skb, struct net_device *dev) } #endif /* CONFIG_CHELSIO_IPSEC_INLINE */ +static void chcr_detach_device(struct uld_ctx *u_ctx) +{ + struct chcr_dev *dev = &u_ctx->dev; + + spin_lock_bh(&dev->lock_chcr_dev); + if (dev->state == CHCR_DETACH) { + spin_unlock_bh(&dev->lock_chcr_dev); + pr_debug("Detached Event received for already detach device\n"); + return; + } + dev->state = CHCR_DETACH; + spin_unlock_bh(&dev->lock_chcr_dev); + + if (atomic_read(&dev->inflight) != 0) { + schedule_delayed_work(&dev->detach_work, WQ_DETACH_TM); + wait_for_completion(&dev->detach_comp); + } + + // Move u_ctx to inactive_dev list + chcr_dev_move(u_ctx); +} + static int chcr_uld_state_change(void *handle, enum cxgb4_state state) { struct uld_ctx *u_ctx = handle; @@ -208,23 +265,16 @@ static int chcr_uld_state_change(void *handle, enum cxgb4_state state) switch (state) { case CXGB4_STATE_UP: - if (!u_ctx->dev) { - ret = chcr_dev_add(u_ctx); - if (ret != 0) - return ret; + if (u_ctx->dev.state != CHCR_INIT) { + // ALready Initialised. + return 0; } - if (atomic_read(&dev_count) == 1) - ret = start_crypto(); + chcr_dev_add(u_ctx); + ret = start_crypto(); break; case CXGB4_STATE_DETACH: - if (u_ctx->dev) { - mutex_lock(&dev_mutex); - chcr_dev_remove(u_ctx); - mutex_unlock(&dev_mutex); - } - if (!atomic_read(&dev_count)) - stop_crypto(); + chcr_detach_device(u_ctx); break; case CXGB4_STATE_START_RECOVERY: @@ -237,7 +287,13 @@ static int chcr_uld_state_change(void *handle, enum cxgb4_state state) static int __init chcr_crypto_init(void) { + INIT_LIST_HEAD(&drv_data.act_dev); + INIT_LIST_HEAD(&drv_data.inact_dev); + atomic_set(&drv_data.dev_count, 0); + mutex_init(&drv_data.drv_mutex); + drv_data.last_dev = NULL; cxgb4_register_uld(CXGB4_ULD_CRYPTO, &chcr_uld_info); + return 0; } @@ -245,18 +301,20 @@ static void __exit chcr_crypto_exit(void) { struct uld_ctx *u_ctx, *tmp; - if (atomic_read(&dev_count)) - stop_crypto(); + stop_crypto(); + cxgb4_unregister_uld(CXGB4_ULD_CRYPTO); /* Remove all devices from list */ - mutex_lock(&dev_mutex); - list_for_each_entry_safe(u_ctx, tmp, &uld_ctx_list, entry) { - if (u_ctx->dev) - chcr_dev_remove(u_ctx); + mutex_lock(&drv_data.drv_mutex); + list_for_each_entry_safe(u_ctx, tmp, &drv_data.act_dev, entry) { + list_del(&u_ctx->entry); kfree(u_ctx); } - mutex_unlock(&dev_mutex); - cxgb4_unregister_uld(CXGB4_ULD_CRYPTO); + list_for_each_entry_safe(u_ctx, tmp, &drv_data.inact_dev, entry) { + list_del(&u_ctx->entry); + kfree(u_ctx); + } + mutex_unlock(&drv_data.drv_mutex); } module_init(chcr_crypto_init); diff --git a/drivers/crypto/chelsio/chcr_core.h b/drivers/crypto/chelsio/chcr_core.h index a50a24f39607..1159dee964ed 100644 --- a/drivers/crypto/chelsio/chcr_core.h +++ b/drivers/crypto/chelsio/chcr_core.h @@ -47,7 +47,7 @@ #define MAX_PENDING_REQ_TO_HW 20 #define CHCR_TEST_RESPONSE_TIMEOUT 1000 - +#define WQ_DETACH_TM (msecs_to_jiffies(50)) #define PAD_ERROR_BIT 1 #define CHK_PAD_ERR_BIT(x) (((x) >> PAD_ERROR_BIT) & 1) @@ -61,9 +61,6 @@ #define HASH_WR_MIN_LEN (sizeof(struct chcr_wr) + \ DUMMY_BYTES + \ sizeof(struct ulptx_sgl)) - -#define padap(dev) pci_get_drvdata(dev->u_ctx->lldi.pdev) - struct uld_ctx; struct _key_ctx { @@ -121,6 +118,20 @@ struct _key_ctx { #define KEYCTX_TX_WR_AUTHIN_G(x) \ (((x) >> KEYCTX_TX_WR_AUTHIN_S) & KEYCTX_TX_WR_AUTHIN_M) +#define WQ_RETRY 5 +struct chcr_driver_data { + struct list_head act_dev; + struct list_head inact_dev; + atomic_t dev_count; + struct mutex drv_mutex; + struct uld_ctx *last_dev; +}; + +enum chcr_state { + CHCR_INIT = 0, + CHCR_ATTACH, + CHCR_DETACH, +}; struct chcr_wr { struct fw_crypto_lookaside_wr wreq; struct ulp_txpkt ulptx; @@ -131,14 +142,18 @@ struct chcr_wr { struct chcr_dev { spinlock_t lock_chcr_dev; - struct uld_ctx *u_ctx; + enum chcr_state state; + atomic_t inflight; + int wqretry; + struct delayed_work detach_work; + struct completion detach_comp; unsigned char tx_channel_id; }; struct uld_ctx { struct list_head entry; struct cxgb4_lld_info lldi; - struct chcr_dev *dev; + struct chcr_dev dev; }; struct sge_opaque_hdr { @@ -189,6 +204,13 @@ static inline unsigned int sgl_len(unsigned int n) return (3 * n) / 2 + (n & 1) + 2; } +static inline void *padap(struct chcr_dev *dev) +{ + struct uld_ctx *u_ctx = container_of(dev, struct uld_ctx, dev); + + return pci_get_drvdata(u_ctx->lldi.pdev); +} + struct uld_ctx *assign_chcr_device(void); int chcr_send_wr(struct sk_buff *skb); int start_crypto(void); -- cgit v1.2.3-59-g8ed1b From 6501ab5ed4d925cce4c2a1c49b63583c42e65bd8 Mon Sep 17 00:00:00 2001 From: Harsh Jain Date: Tue, 11 Dec 2018 16:21:41 +0530 Subject: crypto: chelsio - Reset counters on cxgb4 Detach Reset the counters on receiving detach from Cxgb4. Signed-off-by: Atul Gupta Signed-off-by: Herbert Xu --- drivers/crypto/chelsio/chcr_core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/crypto/chelsio/chcr_core.c b/drivers/crypto/chelsio/chcr_core.c index f71a97939418..e04b3e8fa623 100644 --- a/drivers/crypto/chelsio/chcr_core.c +++ b/drivers/crypto/chelsio/chcr_core.c @@ -132,6 +132,8 @@ static void chcr_dev_init(struct uld_ctx *u_ctx) static int chcr_dev_move(struct uld_ctx *u_ctx) { + struct adapter *adap; + mutex_lock(&drv_data.drv_mutex); if (drv_data.last_dev == u_ctx) { if (list_is_last(&drv_data.last_dev->entry, &drv_data.act_dev)) @@ -144,6 +146,8 @@ static int chcr_dev_move(struct uld_ctx *u_ctx) list_move(&u_ctx->entry, &drv_data.inact_dev); if (list_empty(&drv_data.act_dev)) drv_data.last_dev = NULL; + adap = padap(&u_ctx->dev); + memset(&adap->chcr_stats, 0, sizeof(adap->chcr_stats)); atomic_dec(&drv_data.dev_count); mutex_unlock(&drv_data.drv_mutex); -- cgit v1.2.3-59-g8ed1b From f31ba0f95f1998118098978dbfb25ecbec6b0891 Mon Sep 17 00:00:00 2001 From: Harsh Jain Date: Tue, 11 Dec 2018 16:21:42 +0530 Subject: crypto: chelsio - Fix wrong error counter increments Fix error counter increment in AEAD decrypt operation when validation of tag is done in Driver instead of H/W. Signed-off-by: Harsh Jain Signed-off-by: Herbert Xu --- drivers/crypto/chelsio/chcr_algo.c | 9 +++++---- drivers/crypto/chelsio/chcr_core.c | 11 +++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index df526414f03f..eedc33128da4 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -218,7 +218,7 @@ static inline void chcr_dec_wrcount(struct chcr_dev *dev) atomic_dec(&dev->inflight); } -static inline void chcr_handle_aead_resp(struct aead_request *req, +static inline int chcr_handle_aead_resp(struct aead_request *req, unsigned char *input, int err) { @@ -233,6 +233,8 @@ static inline void chcr_handle_aead_resp(struct aead_request *req, } chcr_dec_wrcount(dev); req->base.complete(&req->base, err); + + return err; } static void get_aes_decrypt_key(unsigned char *dec_key, @@ -2072,14 +2074,13 @@ int chcr_handle_resp(struct crypto_async_request *req, unsigned char *input, switch (tfm->__crt_alg->cra_flags & CRYPTO_ALG_TYPE_MASK) { case CRYPTO_ALG_TYPE_AEAD: - chcr_handle_aead_resp(aead_request_cast(req), input, err); + err = chcr_handle_aead_resp(aead_request_cast(req), input, err); break; case CRYPTO_ALG_TYPE_ABLKCIPHER: - err = chcr_handle_cipher_resp(ablkcipher_request_cast(req), + chcr_handle_cipher_resp(ablkcipher_request_cast(req), input, err); break; - case CRYPTO_ALG_TYPE_AHASH: chcr_handle_ahash_resp(ahash_request_cast(req), input, err); } diff --git a/drivers/crypto/chelsio/chcr_core.c b/drivers/crypto/chelsio/chcr_core.c index e04b3e8fa623..239b933d6df6 100644 --- a/drivers/crypto/chelsio/chcr_core.c +++ b/drivers/crypto/chelsio/chcr_core.c @@ -169,12 +169,8 @@ static int cpl_fw6_pld_handler(struct chcr_dev *dev, ack_err_status = ntohl(*(__be32 *)((unsigned char *)&fw6_pld->data[0] + 4)); - if (ack_err_status) { - if (CHK_MAC_ERR_BIT(ack_err_status) || - CHK_PAD_ERR_BIT(ack_err_status)) - error_status = -EBADMSG; - atomic_inc(&adap->chcr_stats.error); - } + if (CHK_MAC_ERR_BIT(ack_err_status) || CHK_PAD_ERR_BIT(ack_err_status)) + error_status = -EBADMSG; /* call completion callback with failure status */ if (req) { error_status = chcr_handle_resp(req, input, error_status); @@ -182,6 +178,9 @@ static int cpl_fw6_pld_handler(struct chcr_dev *dev, pr_err("Incorrect request address from the firmware\n"); return -EFAULT; } + if (error_status) + atomic_inc(&adap->chcr_stats.error); + return 0; } -- cgit v1.2.3-59-g8ed1b From 0c99c2a087c60b71e1fc90d070e2e16ca52defbe Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 13 Dec 2018 08:36:37 +0000 Subject: crypto: user - remove unused dump functions This patch removes unused dump functions for crypto_user_stats. There are remains of the copy/paste of crypto_user_base to crypto_user_stat and I forgot to remove them. Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- crypto/crypto_user_base.c | 4 +--- crypto/crypto_user_stat.c | 33 --------------------------------- include/crypto/internal/cryptouser.h | 12 ------------ 3 files changed, 1 insertion(+), 48 deletions(-) diff --git a/crypto/crypto_user_base.c b/crypto/crypto_user_base.c index 5311fd7fae34..f25d3f32c9c2 100644 --- a/crypto/crypto_user_base.c +++ b/crypto/crypto_user_base.c @@ -423,9 +423,7 @@ static const struct crypto_link { .dump = crypto_dump_report, .done = crypto_dump_report_done}, [CRYPTO_MSG_DELRNG - CRYPTO_MSG_BASE] = { .doit = crypto_del_rng }, - [CRYPTO_MSG_GETSTAT - CRYPTO_MSG_BASE] = { .doit = crypto_reportstat, - .dump = crypto_dump_reportstat, - .done = crypto_dump_reportstat_done}, + [CRYPTO_MSG_GETSTAT - CRYPTO_MSG_BASE] = { .doit = crypto_reportstat}, }; static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c index 0ba00aaeb810..3e9a53233d80 100644 --- a/crypto/crypto_user_stat.c +++ b/crypto/crypto_user_stat.c @@ -336,37 +336,4 @@ drop_alg: return nlmsg_unicast(crypto_nlsk, skb, NETLINK_CB(in_skb).portid); } -int crypto_dump_reportstat(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct crypto_alg *alg; - struct crypto_dump_info info; - int err; - - if (cb->args[0]) - goto out; - - cb->args[0] = 1; - - info.in_skb = cb->skb; - info.out_skb = skb; - info.nlmsg_seq = cb->nlh->nlmsg_seq; - info.nlmsg_flags = NLM_F_MULTI; - - list_for_each_entry(alg, &crypto_alg_list, cra_list) { - err = crypto_reportstat_alg(alg, &info); - if (err) - goto out_err; - } - -out: - return skb->len; -out_err: - return err; -} - -int crypto_dump_reportstat_done(struct netlink_callback *cb) -{ - return 0; -} - MODULE_LICENSE("GPL"); diff --git a/include/crypto/internal/cryptouser.h b/include/crypto/internal/cryptouser.h index 3492ab42eefb..40623f4457df 100644 --- a/include/crypto/internal/cryptouser.h +++ b/include/crypto/internal/cryptouser.h @@ -4,22 +4,10 @@ struct crypto_alg *crypto_alg_match(struct crypto_user_alg *p, int exact); #ifdef CONFIG_CRYPTO_STATS -int crypto_dump_reportstat(struct sk_buff *skb, struct netlink_callback *cb); int crypto_reportstat(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct nlattr **attrs); -int crypto_dump_reportstat_done(struct netlink_callback *cb); #else -static int crypto_dump_reportstat(struct sk_buff *skb, struct netlink_callback *cb) -{ - return -ENOTSUPP; -} - static int crypto_reportstat(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct nlattr **attrs) { return -ENOTSUPP; } - -static int crypto_dump_reportstat_done(struct netlink_callback *cb) -{ - return -ENOTSUPP; -} #endif -- cgit v1.2.3-59-g8ed1b From bfad6cb3f8295559216690e1eb9c99003a79b3a0 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 13 Dec 2018 08:36:38 +0000 Subject: crypto: api - document missing stats member This patchs adds missing member of stats documentation. Signed-off-by: Corentin Labbe Signed-off-by: Herbert Xu --- include/linux/crypto.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 9850b41e38ae..81e178fb9ed8 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -564,6 +564,13 @@ struct crypto_istat_rng { * @cra_destroy: internally used * * @stats: union of all possible crypto_istat_xxx structures + * @stats.aead: statistics for AEAD algorithm + * @stats.akcipher: statistics for akcipher algorithm + * @stats.cipher: statistics for cipher algorithm + * @stats.compress: statistics for compress algorithm + * @stats.hash: statistics for hash algorithm + * @stats.rng: statistics for rng algorithm + * @stats.kpp: statistics for KPP algorithm * * The struct crypto_alg describes a generic Crypto API algorithm and is common * for all of the transformations. Any variable not documented here shall not -- cgit v1.2.3-59-g8ed1b From 2326828ee40357b3d2b1359b8ca7526af201495b Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 13 Dec 2018 07:52:32 -0200 Subject: crypto: mxc-scc - fix build warnings on ARM64 The following build warnings are seen when building for ARM64 allmodconfig: drivers/crypto/mxc-scc.c:181:20: warning: format '%d' expects argument of type 'int', but argument 5 has type 'size_t' {aka 'long unsigned int'} [-Wformat=] drivers/crypto/mxc-scc.c:186:21: warning: format '%d' expects argument of type 'int', but argument 4 has type 'size_t' {aka 'long unsigned int'} [-Wformat=] drivers/crypto/mxc-scc.c:277:21: warning: format '%d' expects argument of type 'int', but argument 4 has type 'size_t' {aka 'long unsigned int'} [-Wformat=] drivers/crypto/mxc-scc.c:339:3: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] drivers/crypto/mxc-scc.c:340:3: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] Fix them by using the %zu specifier to print a size_t variable and using a plain %x to print the result of a readl(). Signed-off-by: Fabio Estevam Signed-off-by: Herbert Xu --- drivers/crypto/mxc-scc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/crypto/mxc-scc.c b/drivers/crypto/mxc-scc.c index e01c46387df8..519086730791 100644 --- a/drivers/crypto/mxc-scc.c +++ b/drivers/crypto/mxc-scc.c @@ -178,12 +178,12 @@ static int mxc_scc_get_data(struct mxc_scc_ctx *ctx, else from = scc->black_memory; - dev_dbg(scc->dev, "pcopy: from 0x%p %d bytes\n", from, + dev_dbg(scc->dev, "pcopy: from 0x%p %zu bytes\n", from, ctx->dst_nents * 8); len = sg_pcopy_from_buffer(ablkreq->dst, ctx->dst_nents, from, ctx->size, ctx->offset); if (!len) { - dev_err(scc->dev, "pcopy err from 0x%p (len=%d)\n", from, len); + dev_err(scc->dev, "pcopy err from 0x%p (len=%zu)\n", from, len); return -EINVAL; } @@ -274,7 +274,7 @@ static int mxc_scc_put_data(struct mxc_scc_ctx *ctx, len = sg_pcopy_to_buffer(req->src, ctx->src_nents, to, len, ctx->offset); if (!len) { - dev_err(scc->dev, "pcopy err to 0x%p (len=%d)\n", to, len); + dev_err(scc->dev, "pcopy err to 0x%p (len=%zu)\n", to, len); return -EINVAL; } @@ -335,9 +335,9 @@ static void mxc_scc_ablkcipher_next(struct mxc_scc_ctx *ctx, return; } - dev_dbg(scc->dev, "Start encryption (0x%p/0x%p)\n", - (void *)readl(scc->base + SCC_SCM_RED_START), - (void *)readl(scc->base + SCC_SCM_BLACK_START)); + dev_dbg(scc->dev, "Start encryption (0x%x/0x%x)\n", + readl(scc->base + SCC_SCM_RED_START), + readl(scc->base + SCC_SCM_BLACK_START)); /* clear interrupt control registers */ writel(SCC_SCM_INTR_CTRL_CLR_INTR, -- cgit v1.2.3-59-g8ed1b From c9613335bf4fe259a654aa0e9701f0c4cddc12ba Mon Sep 17 00:00:00 2001 From: Nagadheeraj Rottela Date: Fri, 14 Dec 2018 11:19:51 +0000 Subject: crypto: cavium/nitrox - Added AEAD cipher support Added support to offload AEAD ciphers to NITROX. Currently supported AEAD cipher is 'gcm(aes)'. Signed-off-by: Nagadheeraj Rottela Reviewed-by: Srikanth Jampala Signed-off-by: Herbert Xu --- drivers/crypto/cavium/nitrox/Makefile | 4 +- drivers/crypto/cavium/nitrox/nitrox_aead.c | 364 ++++++++++++++++ drivers/crypto/cavium/nitrox/nitrox_algs.c | 559 +------------------------ drivers/crypto/cavium/nitrox/nitrox_common.h | 6 +- drivers/crypto/cavium/nitrox/nitrox_req.h | 239 +++++++++-- drivers/crypto/cavium/nitrox/nitrox_reqmgr.c | 38 +- drivers/crypto/cavium/nitrox/nitrox_skcipher.c | 498 ++++++++++++++++++++++ 7 files changed, 1103 insertions(+), 605 deletions(-) create mode 100644 drivers/crypto/cavium/nitrox/nitrox_aead.c create mode 100644 drivers/crypto/cavium/nitrox/nitrox_skcipher.c diff --git a/drivers/crypto/cavium/nitrox/Makefile b/drivers/crypto/cavium/nitrox/Makefile index ad0546630ad8..f83991aaf820 100644 --- a/drivers/crypto/cavium/nitrox/Makefile +++ b/drivers/crypto/cavium/nitrox/Makefile @@ -7,7 +7,9 @@ n5pf-objs := nitrox_main.o \ nitrox_hal.o \ nitrox_reqmgr.o \ nitrox_algs.o \ - nitrox_mbx.o + nitrox_mbx.o \ + nitrox_skcipher.o \ + nitrox_aead.o n5pf-$(CONFIG_PCI_IOV) += nitrox_sriov.o n5pf-$(CONFIG_DEBUG_FS) += nitrox_debugfs.o diff --git a/drivers/crypto/cavium/nitrox/nitrox_aead.c b/drivers/crypto/cavium/nitrox/nitrox_aead.c new file mode 100644 index 000000000000..4f43eacd2557 --- /dev/null +++ b/drivers/crypto/cavium/nitrox/nitrox_aead.c @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "nitrox_dev.h" +#include "nitrox_common.h" +#include "nitrox_req.h" + +#define GCM_AES_SALT_SIZE 4 + +/** + * struct nitrox_crypt_params - Params to set nitrox crypto request. + * @cryptlen: Encryption/Decryption data length + * @authlen: Assoc data length + Cryptlen + * @srclen: Input buffer length + * @dstlen: Output buffer length + * @iv: IV data + * @ivsize: IV data length + * @ctrl_arg: Identifies the request type (ENCRYPT/DECRYPT) + */ +struct nitrox_crypt_params { + unsigned int cryptlen; + unsigned int authlen; + unsigned int srclen; + unsigned int dstlen; + u8 *iv; + int ivsize; + u8 ctrl_arg; +}; + +union gph_p3 { + struct { +#ifdef __BIG_ENDIAN_BITFIELD + u16 iv_offset : 8; + u16 auth_offset : 8; +#else + u16 auth_offset : 8; + u16 iv_offset : 8; +#endif + }; + u16 param; +}; + +static int nitrox_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key, + unsigned int keylen) +{ + int aes_keylen; + struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead); + struct flexi_crypto_context *fctx; + union fc_ctx_flags flags; + + aes_keylen = flexi_aes_keylen(keylen); + if (aes_keylen < 0) { + crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + /* fill crypto context */ + fctx = nctx->u.fctx; + flags.f = be64_to_cpu(fctx->flags.f); + flags.w0.aes_keylen = aes_keylen; + fctx->flags.f = cpu_to_be64(flags.f); + + /* copy enc key to context */ + memset(&fctx->crypto, 0, sizeof(fctx->crypto)); + memcpy(fctx->crypto.u.key, key, keylen); + + return 0; +} + +static int nitrox_aead_setauthsize(struct crypto_aead *aead, + unsigned int authsize) +{ + struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead); + struct flexi_crypto_context *fctx = nctx->u.fctx; + union fc_ctx_flags flags; + + flags.f = be64_to_cpu(fctx->flags.f); + flags.w0.mac_len = authsize; + fctx->flags.f = cpu_to_be64(flags.f); + + aead->authsize = authsize; + + return 0; +} + +static int alloc_src_sglist(struct aead_request *areq, char *iv, int ivsize, + int buflen) +{ + struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq); + int nents = sg_nents_for_len(areq->src, buflen) + 1; + int ret; + + if (nents < 0) + return nents; + + /* Allocate buffer to hold IV and input scatterlist array */ + ret = alloc_src_req_buf(nkreq, nents, ivsize); + if (ret) + return ret; + + nitrox_creq_copy_iv(nkreq->src, iv, ivsize); + nitrox_creq_set_src_sg(nkreq, nents, ivsize, areq->src, buflen); + + return 0; +} + +static int alloc_dst_sglist(struct aead_request *areq, int ivsize, int buflen) +{ + struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq); + int nents = sg_nents_for_len(areq->dst, buflen) + 3; + int ret; + + if (nents < 0) + return nents; + + /* Allocate buffer to hold ORH, COMPLETION and output scatterlist + * array + */ + ret = alloc_dst_req_buf(nkreq, nents); + if (ret) + return ret; + + nitrox_creq_set_orh(nkreq); + nitrox_creq_set_comp(nkreq); + nitrox_creq_set_dst_sg(nkreq, nents, ivsize, areq->dst, buflen); + + return 0; +} + +static void free_src_sglist(struct aead_request *areq) +{ + struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq); + + kfree(nkreq->src); +} + +static void free_dst_sglist(struct aead_request *areq) +{ + struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq); + + kfree(nkreq->dst); +} + +static int nitrox_set_creq(struct aead_request *areq, + struct nitrox_crypt_params *params) +{ + struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq); + struct se_crypto_request *creq = &nkreq->creq; + struct crypto_aead *aead = crypto_aead_reqtfm(areq); + union gph_p3 param3; + struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead); + int ret; + + creq->flags = areq->base.flags; + creq->gfp = (areq->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? + GFP_KERNEL : GFP_ATOMIC; + + creq->ctrl.value = 0; + creq->opcode = FLEXI_CRYPTO_ENCRYPT_HMAC; + creq->ctrl.s.arg = params->ctrl_arg; + + creq->gph.param0 = cpu_to_be16(params->cryptlen); + creq->gph.param1 = cpu_to_be16(params->authlen); + creq->gph.param2 = cpu_to_be16(params->ivsize + areq->assoclen); + param3.iv_offset = 0; + param3.auth_offset = params->ivsize; + creq->gph.param3 = cpu_to_be16(param3.param); + + creq->ctx_handle = nctx->u.ctx_handle; + creq->ctrl.s.ctxl = sizeof(struct flexi_crypto_context); + + ret = alloc_src_sglist(areq, params->iv, params->ivsize, + params->srclen); + if (ret) + return ret; + + ret = alloc_dst_sglist(areq, params->ivsize, params->dstlen); + if (ret) { + free_src_sglist(areq); + return ret; + } + + return 0; +} + +static void nitrox_aead_callback(void *arg, int err) +{ + struct aead_request *areq = arg; + + free_src_sglist(areq); + free_dst_sglist(areq); + if (err) { + pr_err_ratelimited("request failed status 0x%0x\n", err); + err = -EINVAL; + } + + areq->base.complete(&areq->base, err); +} + +static int nitrox_aes_gcm_enc(struct aead_request *areq) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(areq); + struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead); + struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq); + struct se_crypto_request *creq = &nkreq->creq; + struct flexi_crypto_context *fctx = nctx->u.fctx; + struct nitrox_crypt_params params; + int ret; + + memcpy(fctx->crypto.iv, areq->iv, GCM_AES_SALT_SIZE); + + memset(¶ms, 0, sizeof(params)); + params.cryptlen = areq->cryptlen; + params.authlen = areq->assoclen + params.cryptlen; + params.srclen = params.authlen; + params.dstlen = params.srclen + aead->authsize; + params.iv = &areq->iv[GCM_AES_SALT_SIZE]; + params.ivsize = GCM_AES_IV_SIZE - GCM_AES_SALT_SIZE; + params.ctrl_arg = ENCRYPT; + ret = nitrox_set_creq(areq, ¶ms); + if (ret) + return ret; + + /* send the crypto request */ + return nitrox_process_se_request(nctx->ndev, creq, nitrox_aead_callback, + areq); +} + +static int nitrox_aes_gcm_dec(struct aead_request *areq) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(areq); + struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead); + struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq); + struct se_crypto_request *creq = &nkreq->creq; + struct flexi_crypto_context *fctx = nctx->u.fctx; + struct nitrox_crypt_params params; + int ret; + + memcpy(fctx->crypto.iv, areq->iv, GCM_AES_SALT_SIZE); + + memset(¶ms, 0, sizeof(params)); + params.cryptlen = areq->cryptlen - aead->authsize; + params.authlen = areq->assoclen + params.cryptlen; + params.srclen = areq->cryptlen + areq->assoclen; + params.dstlen = params.srclen - aead->authsize; + params.iv = &areq->iv[GCM_AES_SALT_SIZE]; + params.ivsize = GCM_AES_IV_SIZE - GCM_AES_SALT_SIZE; + params.ctrl_arg = DECRYPT; + ret = nitrox_set_creq(areq, ¶ms); + if (ret) + return ret; + + /* send the crypto request */ + return nitrox_process_se_request(nctx->ndev, creq, nitrox_aead_callback, + areq); +} + +static int nitrox_aead_init(struct crypto_aead *aead) +{ + struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead); + struct crypto_ctx_hdr *chdr; + + /* get the first device */ + nctx->ndev = nitrox_get_first_device(); + if (!nctx->ndev) + return -ENODEV; + + /* allocate nitrox crypto context */ + chdr = crypto_alloc_context(nctx->ndev); + if (!chdr) { + nitrox_put_device(nctx->ndev); + return -ENOMEM; + } + nctx->chdr = chdr; + nctx->u.ctx_handle = (uintptr_t)((u8 *)chdr->vaddr + + sizeof(struct ctx_hdr)); + nctx->u.fctx->flags.f = 0; + + return 0; +} + +static int nitrox_aes_gcm_init(struct crypto_aead *aead) +{ + int ret; + struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead); + union fc_ctx_flags *flags; + + ret = nitrox_aead_init(aead); + if (ret) + return ret; + + flags = &nctx->u.fctx->flags; + flags->w0.cipher_type = CIPHER_AES_GCM; + flags->w0.hash_type = AUTH_NULL; + flags->w0.iv_source = IV_FROM_DPTR; + /* ask microcode to calculate ipad/opad */ + flags->w0.auth_input_type = 1; + flags->f = be64_to_cpu(flags->f); + + crypto_aead_set_reqsize(aead, sizeof(struct aead_request) + + sizeof(struct nitrox_kcrypt_request)); + + return 0; +} + +static void nitrox_aead_exit(struct crypto_aead *aead) +{ + struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead); + + /* free the nitrox crypto context */ + if (nctx->u.ctx_handle) { + struct flexi_crypto_context *fctx = nctx->u.fctx; + + memzero_explicit(&fctx->crypto, sizeof(struct crypto_keys)); + memzero_explicit(&fctx->auth, sizeof(struct auth_keys)); + crypto_free_context((void *)nctx->chdr); + } + nitrox_put_device(nctx->ndev); + + nctx->u.ctx_handle = 0; + nctx->ndev = NULL; +} + +static struct aead_alg nitrox_aeads[] = { { + .base = { + .cra_name = "gcm(aes)", + .cra_driver_name = "n5_aes_gcm", + .cra_priority = PRIO, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct nitrox_crypto_ctx), + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + }, + .setkey = nitrox_aes_gcm_setkey, + .setauthsize = nitrox_aead_setauthsize, + .encrypt = nitrox_aes_gcm_enc, + .decrypt = nitrox_aes_gcm_dec, + .init = nitrox_aes_gcm_init, + .exit = nitrox_aead_exit, + .ivsize = GCM_AES_IV_SIZE, + .maxauthsize = AES_BLOCK_SIZE, +} }; + +int nitrox_register_aeads(void) +{ + return crypto_register_aeads(nitrox_aeads, ARRAY_SIZE(nitrox_aeads)); +} + +void nitrox_unregister_aeads(void) +{ + crypto_unregister_aeads(nitrox_aeads, ARRAY_SIZE(nitrox_aeads)); +} diff --git a/drivers/crypto/cavium/nitrox/nitrox_algs.c b/drivers/crypto/cavium/nitrox/nitrox_algs.c index 10075a97ff0d..d646ae5f29b0 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_algs.c +++ b/drivers/crypto/cavium/nitrox/nitrox_algs.c @@ -1,561 +1,24 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "nitrox_dev.h" #include "nitrox_common.h" -#include "nitrox_req.h" - -#define PRIO 4001 - -struct nitrox_cipher { - const char *name; - enum flexi_cipher value; -}; - -/** - * supported cipher list - */ -static const struct nitrox_cipher flexi_cipher_table[] = { - { "null", CIPHER_NULL }, - { "cbc(des3_ede)", CIPHER_3DES_CBC }, - { "ecb(des3_ede)", CIPHER_3DES_ECB }, - { "cbc(aes)", CIPHER_AES_CBC }, - { "ecb(aes)", CIPHER_AES_ECB }, - { "cfb(aes)", CIPHER_AES_CFB }, - { "rfc3686(ctr(aes))", CIPHER_AES_CTR }, - { "xts(aes)", CIPHER_AES_XTS }, - { "cts(cbc(aes))", CIPHER_AES_CBC_CTS }, - { NULL, CIPHER_INVALID } -}; - -static enum flexi_cipher flexi_cipher_type(const char *name) -{ - const struct nitrox_cipher *cipher = flexi_cipher_table; - - while (cipher->name) { - if (!strcmp(cipher->name, name)) - break; - cipher++; - } - return cipher->value; -} - -static int flexi_aes_keylen(int keylen) -{ - int aes_keylen; - - switch (keylen) { - case AES_KEYSIZE_128: - aes_keylen = 1; - break; - case AES_KEYSIZE_192: - aes_keylen = 2; - break; - case AES_KEYSIZE_256: - aes_keylen = 3; - break; - default: - aes_keylen = -EINVAL; - break; - } - return aes_keylen; -} - -static int nitrox_skcipher_init(struct crypto_skcipher *tfm) -{ - struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(tfm); - struct crypto_ctx_hdr *chdr; - - /* get the first device */ - nctx->ndev = nitrox_get_first_device(); - if (!nctx->ndev) - return -ENODEV; - - /* allocate nitrox crypto context */ - chdr = crypto_alloc_context(nctx->ndev); - if (!chdr) { - nitrox_put_device(nctx->ndev); - return -ENOMEM; - } - nctx->chdr = chdr; - nctx->u.ctx_handle = (uintptr_t)((u8 *)chdr->vaddr + - sizeof(struct ctx_hdr)); - crypto_skcipher_set_reqsize(tfm, crypto_skcipher_reqsize(tfm) + - sizeof(struct nitrox_kcrypt_request)); - return 0; -} - -static void nitrox_skcipher_exit(struct crypto_skcipher *tfm) -{ - struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(tfm); - - /* free the nitrox crypto context */ - if (nctx->u.ctx_handle) { - struct flexi_crypto_context *fctx = nctx->u.fctx; - - memset(&fctx->crypto, 0, sizeof(struct crypto_keys)); - memset(&fctx->auth, 0, sizeof(struct auth_keys)); - crypto_free_context((void *)nctx->chdr); - } - nitrox_put_device(nctx->ndev); - - nctx->u.ctx_handle = 0; - nctx->ndev = NULL; -} - -static inline int nitrox_skcipher_setkey(struct crypto_skcipher *cipher, - int aes_keylen, const u8 *key, - unsigned int keylen) -{ - struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher); - struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm); - struct flexi_crypto_context *fctx; - enum flexi_cipher cipher_type; - const char *name; - - name = crypto_tfm_alg_name(tfm); - cipher_type = flexi_cipher_type(name); - if (unlikely(cipher_type == CIPHER_INVALID)) { - pr_err("unsupported cipher: %s\n", name); - return -EINVAL; - } - - /* fill crypto context */ - fctx = nctx->u.fctx; - fctx->flags = 0; - fctx->w0.cipher_type = cipher_type; - fctx->w0.aes_keylen = aes_keylen; - fctx->w0.iv_source = IV_FROM_DPTR; - fctx->flags = cpu_to_be64(*(u64 *)&fctx->w0); - /* copy the key to context */ - memcpy(fctx->crypto.u.key, key, keylen); - - return 0; -} - -static int nitrox_aes_setkey(struct crypto_skcipher *cipher, const u8 *key, - unsigned int keylen) -{ - int aes_keylen; - - aes_keylen = flexi_aes_keylen(keylen); - if (aes_keylen < 0) { - crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen); -} - -static int alloc_src_sglist(struct skcipher_request *skreq, int ivsize) -{ - struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq); - int nents = sg_nents(skreq->src) + 1; - struct se_crypto_request *creq = &nkreq->creq; - char *iv; - struct scatterlist *sg; - - /* Allocate buffer to hold IV and input scatterlist array */ - nkreq->src = alloc_req_buf(nents, ivsize, creq->gfp); - if (!nkreq->src) - return -ENOMEM; - - /* copy iv */ - iv = nkreq->src; - memcpy(iv, skreq->iv, ivsize); - - sg = (struct scatterlist *)(iv + ivsize); - creq->src = sg; - sg_init_table(sg, nents); - - /* Input format: - * +----+----------------+ - * | IV | SRC sg entries | - * +----+----------------+ - */ - - /* IV */ - sg = create_single_sg(sg, iv, ivsize); - /* SRC entries */ - create_multi_sg(sg, skreq->src); - - return 0; -} - -static int alloc_dst_sglist(struct skcipher_request *skreq, int ivsize) -{ - struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq); - int nents = sg_nents(skreq->dst) + 3; - int extralen = ORH_HLEN + COMP_HLEN; - struct se_crypto_request *creq = &nkreq->creq; - struct scatterlist *sg; - char *iv = nkreq->src; - - /* Allocate buffer to hold ORH, COMPLETION and output scatterlist - * array - */ - nkreq->dst = alloc_req_buf(nents, extralen, creq->gfp); - if (!nkreq->dst) - return -ENOMEM; - - creq->orh = (u64 *)(nkreq->dst); - set_orh_value(creq->orh); - - creq->comp = (u64 *)(nkreq->dst + ORH_HLEN); - set_comp_value(creq->comp); - - sg = (struct scatterlist *)(nkreq->dst + ORH_HLEN + COMP_HLEN); - creq->dst = sg; - sg_init_table(sg, nents); - - /* Output format: - * +-----+----+----------------+-----------------+ - * | ORH | IV | DST sg entries | COMPLETION Bytes| - * +-----+----+----------------+-----------------+ - */ - - /* ORH */ - sg = create_single_sg(sg, creq->orh, ORH_HLEN); - /* IV */ - sg = create_single_sg(sg, iv, ivsize); - /* DST entries */ - sg = create_multi_sg(sg, skreq->dst); - /* COMPLETION Bytes */ - create_single_sg(sg, creq->comp, COMP_HLEN); - - return 0; -} - -static void free_src_sglist(struct skcipher_request *skreq) -{ - struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq); - - kfree(nkreq->src); -} -static void free_dst_sglist(struct skcipher_request *skreq) +int nitrox_crypto_register(void) { - struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq); + int err; - kfree(nkreq->dst); -} + err = nitrox_register_skciphers(); + if (err) + return err; -static void nitrox_skcipher_callback(struct skcipher_request *skreq, - int err) -{ - free_src_sglist(skreq); - free_dst_sglist(skreq); + err = nitrox_register_aeads(); if (err) { - pr_err_ratelimited("request failed status 0x%0x\n", err); - err = -EINVAL; - } - - skcipher_request_complete(skreq, err); -} - -static int nitrox_skcipher_crypt(struct skcipher_request *skreq, bool enc) -{ - struct crypto_skcipher *cipher = crypto_skcipher_reqtfm(skreq); - struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(cipher); - struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq); - int ivsize = crypto_skcipher_ivsize(cipher); - struct se_crypto_request *creq; - int ret; - - creq = &nkreq->creq; - creq->flags = skreq->base.flags; - creq->gfp = (skreq->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? - GFP_KERNEL : GFP_ATOMIC; - - /* fill the request */ - creq->ctrl.value = 0; - creq->opcode = FLEXI_CRYPTO_ENCRYPT_HMAC; - creq->ctrl.s.arg = (enc ? ENCRYPT : DECRYPT); - /* param0: length of the data to be encrypted */ - creq->gph.param0 = cpu_to_be16(skreq->cryptlen); - creq->gph.param1 = 0; - /* param2: encryption data offset */ - creq->gph.param2 = cpu_to_be16(ivsize); - creq->gph.param3 = 0; - - creq->ctx_handle = nctx->u.ctx_handle; - creq->ctrl.s.ctxl = sizeof(struct flexi_crypto_context); - - ret = alloc_src_sglist(skreq, ivsize); - if (ret) - return ret; - - ret = alloc_dst_sglist(skreq, ivsize); - if (ret) { - free_src_sglist(skreq); - return ret; - } - - nkreq->nctx = nctx; - nkreq->skreq = skreq; - - /* send the crypto request */ - return nitrox_process_se_request(nctx->ndev, creq, - nitrox_skcipher_callback, skreq); -} - -static int nitrox_aes_encrypt(struct skcipher_request *skreq) -{ - return nitrox_skcipher_crypt(skreq, true); -} - -static int nitrox_aes_decrypt(struct skcipher_request *skreq) -{ - return nitrox_skcipher_crypt(skreq, false); -} - -static int nitrox_3des_setkey(struct crypto_skcipher *cipher, - const u8 *key, unsigned int keylen) -{ - if (keylen != DES3_EDE_KEY_SIZE) { - crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - - return nitrox_skcipher_setkey(cipher, 0, key, keylen); -} - -static int nitrox_3des_encrypt(struct skcipher_request *skreq) -{ - return nitrox_skcipher_crypt(skreq, true); -} - -static int nitrox_3des_decrypt(struct skcipher_request *skreq) -{ - return nitrox_skcipher_crypt(skreq, false); -} - -static int nitrox_aes_xts_setkey(struct crypto_skcipher *cipher, - const u8 *key, unsigned int keylen) -{ - struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher); - struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm); - struct flexi_crypto_context *fctx; - int aes_keylen, ret; - - ret = xts_check_key(tfm, key, keylen); - if (ret) - return ret; - - keylen /= 2; - - aes_keylen = flexi_aes_keylen(keylen); - if (aes_keylen < 0) { - crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; + nitrox_unregister_skciphers(); + return err; } - fctx = nctx->u.fctx; - /* copy KEY2 */ - memcpy(fctx->auth.u.key2, (key + keylen), keylen); - - return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen); -} - -static int nitrox_aes_ctr_rfc3686_setkey(struct crypto_skcipher *cipher, - const u8 *key, unsigned int keylen) -{ - struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher); - struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm); - struct flexi_crypto_context *fctx; - int aes_keylen; - - if (keylen < CTR_RFC3686_NONCE_SIZE) - return -EINVAL; - - fctx = nctx->u.fctx; - - memcpy(fctx->crypto.iv, key + (keylen - CTR_RFC3686_NONCE_SIZE), - CTR_RFC3686_NONCE_SIZE); - - keylen -= CTR_RFC3686_NONCE_SIZE; - - aes_keylen = flexi_aes_keylen(keylen); - if (aes_keylen < 0) { - crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen); -} - -static struct skcipher_alg nitrox_skciphers[] = { { - .base = { - .cra_name = "cbc(aes)", - .cra_driver_name = "n5_cbc(aes)", - .cra_priority = PRIO, - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct nitrox_crypto_ctx), - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - }, - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = nitrox_aes_setkey, - .encrypt = nitrox_aes_encrypt, - .decrypt = nitrox_aes_decrypt, - .init = nitrox_skcipher_init, - .exit = nitrox_skcipher_exit, -}, { - .base = { - .cra_name = "ecb(aes)", - .cra_driver_name = "n5_ecb(aes)", - .cra_priority = PRIO, - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct nitrox_crypto_ctx), - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - }, - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = nitrox_aes_setkey, - .encrypt = nitrox_aes_encrypt, - .decrypt = nitrox_aes_decrypt, - .init = nitrox_skcipher_init, - .exit = nitrox_skcipher_exit, -}, { - .base = { - .cra_name = "cfb(aes)", - .cra_driver_name = "n5_cfb(aes)", - .cra_priority = PRIO, - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct nitrox_crypto_ctx), - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - }, - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = nitrox_aes_setkey, - .encrypt = nitrox_aes_encrypt, - .decrypt = nitrox_aes_decrypt, - .init = nitrox_skcipher_init, - .exit = nitrox_skcipher_exit, -}, { - .base = { - .cra_name = "xts(aes)", - .cra_driver_name = "n5_xts(aes)", - .cra_priority = PRIO, - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct nitrox_crypto_ctx), - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - }, - .min_keysize = 2 * AES_MIN_KEY_SIZE, - .max_keysize = 2 * AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = nitrox_aes_xts_setkey, - .encrypt = nitrox_aes_encrypt, - .decrypt = nitrox_aes_decrypt, - .init = nitrox_skcipher_init, - .exit = nitrox_skcipher_exit, -}, { - .base = { - .cra_name = "rfc3686(ctr(aes))", - .cra_driver_name = "n5_rfc3686(ctr(aes))", - .cra_priority = PRIO, - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct nitrox_crypto_ctx), - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - }, - .min_keysize = AES_MIN_KEY_SIZE + CTR_RFC3686_NONCE_SIZE, - .max_keysize = AES_MAX_KEY_SIZE + CTR_RFC3686_NONCE_SIZE, - .ivsize = CTR_RFC3686_IV_SIZE, - .init = nitrox_skcipher_init, - .exit = nitrox_skcipher_exit, - .setkey = nitrox_aes_ctr_rfc3686_setkey, - .encrypt = nitrox_aes_encrypt, - .decrypt = nitrox_aes_decrypt, -}, { - .base = { - .cra_name = "cts(cbc(aes))", - .cra_driver_name = "n5_cts(cbc(aes))", - .cra_priority = PRIO, - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct nitrox_crypto_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - }, - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = nitrox_aes_setkey, - .encrypt = nitrox_aes_encrypt, - .decrypt = nitrox_aes_decrypt, - .init = nitrox_skcipher_init, - .exit = nitrox_skcipher_exit, -}, { - .base = { - .cra_name = "cbc(des3_ede)", - .cra_driver_name = "n5_cbc(des3_ede)", - .cra_priority = PRIO, - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = DES3_EDE_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct nitrox_crypto_ctx), - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - }, - .min_keysize = DES3_EDE_KEY_SIZE, - .max_keysize = DES3_EDE_KEY_SIZE, - .ivsize = DES3_EDE_BLOCK_SIZE, - .setkey = nitrox_3des_setkey, - .encrypt = nitrox_3des_encrypt, - .decrypt = nitrox_3des_decrypt, - .init = nitrox_skcipher_init, - .exit = nitrox_skcipher_exit, -}, { - .base = { - .cra_name = "ecb(des3_ede)", - .cra_driver_name = "n5_ecb(des3_ede)", - .cra_priority = PRIO, - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = DES3_EDE_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct nitrox_crypto_ctx), - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - }, - .min_keysize = DES3_EDE_KEY_SIZE, - .max_keysize = DES3_EDE_KEY_SIZE, - .ivsize = DES3_EDE_BLOCK_SIZE, - .setkey = nitrox_3des_setkey, - .encrypt = nitrox_3des_encrypt, - .decrypt = nitrox_3des_decrypt, - .init = nitrox_skcipher_init, - .exit = nitrox_skcipher_exit, -} - -}; - -int nitrox_crypto_register(void) -{ - return crypto_register_skciphers(nitrox_skciphers, - ARRAY_SIZE(nitrox_skciphers)); + return 0; } void nitrox_crypto_unregister(void) { - crypto_unregister_skciphers(nitrox_skciphers, - ARRAY_SIZE(nitrox_skciphers)); + nitrox_unregister_aeads(); + nitrox_unregister_skciphers(); } diff --git a/drivers/crypto/cavium/nitrox/nitrox_common.h b/drivers/crypto/cavium/nitrox/nitrox_common.h index 863143a8336b..e4be69d7e6e5 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_common.h +++ b/drivers/crypto/cavium/nitrox/nitrox_common.h @@ -7,6 +7,10 @@ int nitrox_crypto_register(void); void nitrox_crypto_unregister(void); +int nitrox_register_aeads(void); +void nitrox_unregister_aeads(void); +int nitrox_register_skciphers(void); +void nitrox_unregister_skciphers(void); void *crypto_alloc_context(struct nitrox_device *ndev); void crypto_free_context(void *ctx); struct nitrox_device *nitrox_get_first_device(void); @@ -19,7 +23,7 @@ void pkt_slc_resp_tasklet(unsigned long data); int nitrox_process_se_request(struct nitrox_device *ndev, struct se_crypto_request *req, completion_t cb, - struct skcipher_request *skreq); + void *cb_arg); void backlog_qflush_work(struct work_struct *work); diff --git a/drivers/crypto/cavium/nitrox/nitrox_req.h b/drivers/crypto/cavium/nitrox/nitrox_req.h index d45ff91c19a9..76c0f0be7233 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_req.h +++ b/drivers/crypto/cavium/nitrox/nitrox_req.h @@ -8,6 +8,7 @@ #include "nitrox_dev.h" #define PENDING_SIG 0xFFFFFFFFFFFFFFFFUL +#define PRIO 4001 /** * struct gphdr - General purpose Header @@ -106,6 +107,18 @@ enum flexi_cipher { CIPHER_INVALID }; +enum flexi_auth { + AUTH_NULL = 0, + AUTH_MD5, + AUTH_SHA1, + AUTH_SHA2_SHA224, + AUTH_SHA2_SHA256, + AUTH_SHA2_SHA384, + AUTH_SHA2_SHA512, + AUTH_GMAC, + AUTH_INVALID +}; + /** * struct crypto_keys - Crypto keys * @key: Encryption key or KEY1 for AES-XTS @@ -132,6 +145,32 @@ struct auth_keys { u8 opad[64]; }; +union fc_ctx_flags { + __be64 f; + struct { +#if defined(__BIG_ENDIAN_BITFIELD) + u64 cipher_type : 4; + u64 reserved_59 : 1; + u64 aes_keylen : 2; + u64 iv_source : 1; + u64 hash_type : 4; + u64 reserved_49_51 : 3; + u64 auth_input_type: 1; + u64 mac_len : 8; + u64 reserved_0_39 : 40; +#else + u64 reserved_0_39 : 40; + u64 mac_len : 8; + u64 auth_input_type: 1; + u64 reserved_49_51 : 3; + u64 hash_type : 4; + u64 iv_source : 1; + u64 aes_keylen : 2; + u64 reserved_59 : 1; + u64 cipher_type : 4; +#endif + } w0; +}; /** * struct flexi_crypto_context - Crypto context * @cipher_type: Encryption cipher type @@ -146,33 +185,7 @@ struct auth_keys { * @auth: Authentication keys */ struct flexi_crypto_context { - union { - __be64 flags; - struct { -#if defined(__BIG_ENDIAN_BITFIELD) - u64 cipher_type : 4; - u64 reserved_59 : 1; - u64 aes_keylen : 2; - u64 iv_source : 1; - u64 hash_type : 4; - u64 reserved_49_51 : 3; - u64 auth_input_type: 1; - u64 mac_len : 8; - u64 reserved_0_39 : 40; -#else - u64 reserved_0_39 : 40; - u64 mac_len : 8; - u64 auth_input_type: 1; - u64 reserved_49_51 : 3; - u64 hash_type : 4; - u64 iv_source : 1; - u64 aes_keylen : 2; - u64 reserved_59 : 1; - u64 cipher_type : 4; -#endif - } w0; - }; - + union fc_ctx_flags flags; struct crypto_keys crypto; struct auth_keys auth; }; @@ -194,8 +207,6 @@ struct nitrox_crypto_ctx { struct nitrox_kcrypt_request { struct se_crypto_request creq; - struct nitrox_crypto_ctx *nctx; - struct skcipher_request *skreq; u8 *src; u8 *dst; }; @@ -400,7 +411,7 @@ struct resp_hdr { u64 *completion; }; -typedef void (*completion_t)(struct skcipher_request *skreq, int err); +typedef void (*completion_t)(void *arg, int err); /** * struct nitrox_softreq - Represents the NIROX Request. @@ -435,9 +446,30 @@ struct nitrox_softreq { unsigned long tstamp; completion_t callback; - struct skcipher_request *skreq; + void *cb_arg; }; +static inline int flexi_aes_keylen(int keylen) +{ + int aes_keylen; + + switch (keylen) { + case AES_KEYSIZE_128: + aes_keylen = 1; + break; + case AES_KEYSIZE_192: + aes_keylen = 2; + break; + case AES_KEYSIZE_256: + aes_keylen = 3; + break; + default: + aes_keylen = -EINVAL; + break; + } + return aes_keylen; +} + static inline void *alloc_req_buf(int nents, int extralen, gfp_t gfp) { size_t size; @@ -448,6 +480,14 @@ static inline void *alloc_req_buf(int nents, int extralen, gfp_t gfp) return kzalloc(size, gfp); } +/** + * create_single_sg - Point SG entry to the data + * @sg: Destination SG list + * @buf: Data + * @buflen: Data length + * + * Returns next free entry in the destination SG list + **/ static inline struct scatterlist *create_single_sg(struct scatterlist *sg, void *buf, int buflen) { @@ -456,18 +496,33 @@ static inline struct scatterlist *create_single_sg(struct scatterlist *sg, return sg; } +/** + * create_multi_sg - Create multiple sg entries with buflen data length from + * source sglist + * @to_sg: Destination SG list + * @from_sg: Source SG list + * @buflen: Data length + * + * Returns next free entry in the destination SG list + **/ static inline struct scatterlist *create_multi_sg(struct scatterlist *to_sg, - struct scatterlist *from_sg) + struct scatterlist *from_sg, + int buflen) { - struct scatterlist *sg; - int i; + struct scatterlist *sg = to_sg; + unsigned int sglen; + + for (; buflen; buflen -= sglen) { + sglen = from_sg->length; + if (sglen > buflen) + sglen = buflen; - for_each_sg(from_sg, sg, sg_nents(from_sg), i) { - sg_set_buf(to_sg, sg_virt(sg), sg->length); - to_sg++; + sg_set_buf(sg, sg_virt(from_sg), sglen); + from_sg = sg_next(from_sg); + sg++; } - return to_sg; + return sg; } static inline void set_orh_value(u64 *orh) @@ -480,4 +535,112 @@ static inline void set_comp_value(u64 *comp) WRITE_ONCE(*comp, PENDING_SIG); } +static inline int alloc_src_req_buf(struct nitrox_kcrypt_request *nkreq, + int nents, int ivsize) +{ + struct se_crypto_request *creq = &nkreq->creq; + + nkreq->src = alloc_req_buf(nents, ivsize, creq->gfp); + if (!nkreq->src) + return -ENOMEM; + + return 0; +} + +static inline void nitrox_creq_copy_iv(char *dst, char *src, int size) +{ + memcpy(dst, src, size); +} + +static inline struct scatterlist *nitrox_creq_src_sg(char *iv, int ivsize) +{ + return (struct scatterlist *)(iv + ivsize); +} + +static inline void nitrox_creq_set_src_sg(struct nitrox_kcrypt_request *nkreq, + int nents, int ivsize, + struct scatterlist *src, int buflen) +{ + char *iv = nkreq->src; + struct scatterlist *sg; + struct se_crypto_request *creq = &nkreq->creq; + + creq->src = nitrox_creq_src_sg(iv, ivsize); + sg = creq->src; + sg_init_table(sg, nents); + + /* Input format: + * +----+----------------+ + * | IV | SRC sg entries | + * +----+----------------+ + */ + + /* IV */ + sg = create_single_sg(sg, iv, ivsize); + /* SRC entries */ + create_multi_sg(sg, src, buflen); +} + +static inline int alloc_dst_req_buf(struct nitrox_kcrypt_request *nkreq, + int nents) +{ + int extralen = ORH_HLEN + COMP_HLEN; + struct se_crypto_request *creq = &nkreq->creq; + + nkreq->dst = alloc_req_buf(nents, extralen, creq->gfp); + if (!nkreq->dst) + return -ENOMEM; + + return 0; +} + +static inline void nitrox_creq_set_orh(struct nitrox_kcrypt_request *nkreq) +{ + struct se_crypto_request *creq = &nkreq->creq; + + creq->orh = (u64 *)(nkreq->dst); + set_orh_value(creq->orh); +} + +static inline void nitrox_creq_set_comp(struct nitrox_kcrypt_request *nkreq) +{ + struct se_crypto_request *creq = &nkreq->creq; + + creq->comp = (u64 *)(nkreq->dst + ORH_HLEN); + set_comp_value(creq->comp); +} + +static inline struct scatterlist *nitrox_creq_dst_sg(char *dst) +{ + return (struct scatterlist *)(dst + ORH_HLEN + COMP_HLEN); +} + +static inline void nitrox_creq_set_dst_sg(struct nitrox_kcrypt_request *nkreq, + int nents, int ivsize, + struct scatterlist *dst, int buflen) +{ + struct se_crypto_request *creq = &nkreq->creq; + struct scatterlist *sg; + char *iv = nkreq->src; + + creq->dst = nitrox_creq_dst_sg(nkreq->dst); + sg = creq->dst; + sg_init_table(sg, nents); + + /* Output format: + * +-----+----+----------------+-----------------+ + * | ORH | IV | DST sg entries | COMPLETION Bytes| + * +-----+----+----------------+-----------------+ + */ + + /* ORH */ + sg = create_single_sg(sg, creq->orh, ORH_HLEN); + /* IV */ + sg = create_single_sg(sg, iv, ivsize); + /* DST entries */ + sg = create_multi_sg(sg, dst, buflen); + /* COMPLETION Bytes */ + create_single_sg(sg, creq->comp, COMP_HLEN); +} + #endif /* __NITROX_REQ_H */ diff --git a/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c b/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c index d566bb904ec2..e34e4df8fd24 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c +++ b/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c @@ -269,6 +269,8 @@ static inline bool cmdq_full(struct nitrox_cmdq *cmdq, int qlen) smp_mb__after_atomic(); return true; } + /* sync with other cpus */ + smp_mb__after_atomic(); return false; } @@ -324,8 +326,6 @@ static int post_backlog_cmds(struct nitrox_cmdq *cmdq) spin_lock_bh(&cmdq->backlog_qlock); list_for_each_entry_safe(sr, tmp, &cmdq->backlog_head, backlog) { - struct skcipher_request *skreq; - /* submit until space available */ if (unlikely(cmdq_full(cmdq, ndev->qlen))) { ret = -ENOSPC; @@ -337,12 +337,8 @@ static int post_backlog_cmds(struct nitrox_cmdq *cmdq) /* sync with other cpus */ smp_mb__after_atomic(); - skreq = sr->skreq; /* post the command */ post_se_instr(sr, cmdq); - - /* backlog requests are posted, wakeup with -EINPROGRESS */ - skcipher_request_complete(skreq, -EINPROGRESS); } spin_unlock_bh(&cmdq->backlog_qlock); @@ -365,7 +361,7 @@ static int nitrox_enqueue_request(struct nitrox_softreq *sr) } /* add to backlog list */ backlog_list_add(sr, cmdq); - return -EBUSY; + return -EINPROGRESS; } post_se_instr(sr, cmdq); @@ -382,7 +378,7 @@ static int nitrox_enqueue_request(struct nitrox_softreq *sr) int nitrox_process_se_request(struct nitrox_device *ndev, struct se_crypto_request *req, completion_t callback, - struct skcipher_request *skreq) + void *cb_arg) { struct nitrox_softreq *sr; dma_addr_t ctx_handle = 0; @@ -399,7 +395,7 @@ int nitrox_process_se_request(struct nitrox_device *ndev, sr->flags = req->flags; sr->gfp = req->gfp; sr->callback = callback; - sr->skreq = skreq; + sr->cb_arg = cb_arg; atomic_set(&sr->status, REQ_NOT_POSTED); @@ -513,7 +509,20 @@ void backlog_qflush_work(struct work_struct *work) static bool sr_completed(struct nitrox_softreq *sr) { - return (READ_ONCE(*sr->resp.orh) != READ_ONCE(*sr->resp.completion)); + u64 orh = READ_ONCE(*sr->resp.orh); + unsigned long timeout = jiffies + msecs_to_jiffies(1); + + if ((orh != PENDING_SIG) && (orh & 0xff)) + return true; + + while (READ_ONCE(*sr->resp.completion) == PENDING_SIG) { + if (time_after(jiffies, timeout)) { + pr_err("comp not done\n"); + return false; + } + } + + return true; } /** @@ -527,8 +536,6 @@ static void process_response_list(struct nitrox_cmdq *cmdq) { struct nitrox_device *ndev = cmdq->ndev; struct nitrox_softreq *sr; - struct skcipher_request *skreq; - completion_t callback; int req_completed = 0, err = 0, budget; /* check all pending requests */ @@ -558,15 +565,12 @@ static void process_response_list(struct nitrox_cmdq *cmdq) /* remove from response list */ response_list_del(sr, cmdq); - callback = sr->callback; - skreq = sr->skreq; - /* ORH error code */ err = READ_ONCE(*sr->resp.orh) & 0xff; softreq_destroy(sr); - if (callback) - callback(skreq, err); + if (sr->callback) + sr->callback(sr->cb_arg, err); req_completed++; } diff --git a/drivers/crypto/cavium/nitrox/nitrox_skcipher.c b/drivers/crypto/cavium/nitrox/nitrox_skcipher.c new file mode 100644 index 000000000000..d4935d6cefdd --- /dev/null +++ b/drivers/crypto/cavium/nitrox/nitrox_skcipher.c @@ -0,0 +1,498 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "nitrox_dev.h" +#include "nitrox_common.h" +#include "nitrox_req.h" + +struct nitrox_cipher { + const char *name; + enum flexi_cipher value; +}; + +/** + * supported cipher list + */ +static const struct nitrox_cipher flexi_cipher_table[] = { + { "null", CIPHER_NULL }, + { "cbc(des3_ede)", CIPHER_3DES_CBC }, + { "ecb(des3_ede)", CIPHER_3DES_ECB }, + { "cbc(aes)", CIPHER_AES_CBC }, + { "ecb(aes)", CIPHER_AES_ECB }, + { "cfb(aes)", CIPHER_AES_CFB }, + { "rfc3686(ctr(aes))", CIPHER_AES_CTR }, + { "xts(aes)", CIPHER_AES_XTS }, + { "cts(cbc(aes))", CIPHER_AES_CBC_CTS }, + { NULL, CIPHER_INVALID } +}; + +static enum flexi_cipher flexi_cipher_type(const char *name) +{ + const struct nitrox_cipher *cipher = flexi_cipher_table; + + while (cipher->name) { + if (!strcmp(cipher->name, name)) + break; + cipher++; + } + return cipher->value; +} + +static int nitrox_skcipher_init(struct crypto_skcipher *tfm) +{ + struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(tfm); + struct crypto_ctx_hdr *chdr; + + /* get the first device */ + nctx->ndev = nitrox_get_first_device(); + if (!nctx->ndev) + return -ENODEV; + + /* allocate nitrox crypto context */ + chdr = crypto_alloc_context(nctx->ndev); + if (!chdr) { + nitrox_put_device(nctx->ndev); + return -ENOMEM; + } + nctx->chdr = chdr; + nctx->u.ctx_handle = (uintptr_t)((u8 *)chdr->vaddr + + sizeof(struct ctx_hdr)); + crypto_skcipher_set_reqsize(tfm, crypto_skcipher_reqsize(tfm) + + sizeof(struct nitrox_kcrypt_request)); + return 0; +} + +static void nitrox_skcipher_exit(struct crypto_skcipher *tfm) +{ + struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(tfm); + + /* free the nitrox crypto context */ + if (nctx->u.ctx_handle) { + struct flexi_crypto_context *fctx = nctx->u.fctx; + + memzero_explicit(&fctx->crypto, sizeof(struct crypto_keys)); + memzero_explicit(&fctx->auth, sizeof(struct auth_keys)); + crypto_free_context((void *)nctx->chdr); + } + nitrox_put_device(nctx->ndev); + + nctx->u.ctx_handle = 0; + nctx->ndev = NULL; +} + +static inline int nitrox_skcipher_setkey(struct crypto_skcipher *cipher, + int aes_keylen, const u8 *key, + unsigned int keylen) +{ + struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher); + struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm); + struct flexi_crypto_context *fctx; + union fc_ctx_flags *flags; + enum flexi_cipher cipher_type; + const char *name; + + name = crypto_tfm_alg_name(tfm); + cipher_type = flexi_cipher_type(name); + if (unlikely(cipher_type == CIPHER_INVALID)) { + pr_err("unsupported cipher: %s\n", name); + return -EINVAL; + } + + /* fill crypto context */ + fctx = nctx->u.fctx; + flags = &fctx->flags; + flags->f = 0; + flags->w0.cipher_type = cipher_type; + flags->w0.aes_keylen = aes_keylen; + flags->w0.iv_source = IV_FROM_DPTR; + flags->f = cpu_to_be64(*(u64 *)&flags->w0); + /* copy the key to context */ + memcpy(fctx->crypto.u.key, key, keylen); + + return 0; +} + +static int nitrox_aes_setkey(struct crypto_skcipher *cipher, const u8 *key, + unsigned int keylen) +{ + int aes_keylen; + + aes_keylen = flexi_aes_keylen(keylen); + if (aes_keylen < 0) { + crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen); +} + +static int alloc_src_sglist(struct skcipher_request *skreq, int ivsize) +{ + struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq); + int nents = sg_nents(skreq->src) + 1; + int ret; + + /* Allocate buffer to hold IV and input scatterlist array */ + ret = alloc_src_req_buf(nkreq, nents, ivsize); + if (ret) + return ret; + + nitrox_creq_copy_iv(nkreq->src, skreq->iv, ivsize); + nitrox_creq_set_src_sg(nkreq, nents, ivsize, skreq->src, + skreq->cryptlen); + + return 0; +} + +static int alloc_dst_sglist(struct skcipher_request *skreq, int ivsize) +{ + struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq); + int nents = sg_nents(skreq->dst) + 3; + int ret; + + /* Allocate buffer to hold ORH, COMPLETION and output scatterlist + * array + */ + ret = alloc_dst_req_buf(nkreq, nents); + if (ret) + return ret; + + nitrox_creq_set_orh(nkreq); + nitrox_creq_set_comp(nkreq); + nitrox_creq_set_dst_sg(nkreq, nents, ivsize, skreq->dst, + skreq->cryptlen); + + return 0; +} + +static void free_src_sglist(struct skcipher_request *skreq) +{ + struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq); + + kfree(nkreq->src); +} + +static void free_dst_sglist(struct skcipher_request *skreq) +{ + struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq); + + kfree(nkreq->dst); +} + +static void nitrox_skcipher_callback(void *arg, int err) +{ + struct skcipher_request *skreq = arg; + + free_src_sglist(skreq); + free_dst_sglist(skreq); + if (err) { + pr_err_ratelimited("request failed status 0x%0x\n", err); + err = -EINVAL; + } + + skcipher_request_complete(skreq, err); +} + +static int nitrox_skcipher_crypt(struct skcipher_request *skreq, bool enc) +{ + struct crypto_skcipher *cipher = crypto_skcipher_reqtfm(skreq); + struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(cipher); + struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq); + int ivsize = crypto_skcipher_ivsize(cipher); + struct se_crypto_request *creq; + int ret; + + creq = &nkreq->creq; + creq->flags = skreq->base.flags; + creq->gfp = (skreq->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? + GFP_KERNEL : GFP_ATOMIC; + + /* fill the request */ + creq->ctrl.value = 0; + creq->opcode = FLEXI_CRYPTO_ENCRYPT_HMAC; + creq->ctrl.s.arg = (enc ? ENCRYPT : DECRYPT); + /* param0: length of the data to be encrypted */ + creq->gph.param0 = cpu_to_be16(skreq->cryptlen); + creq->gph.param1 = 0; + /* param2: encryption data offset */ + creq->gph.param2 = cpu_to_be16(ivsize); + creq->gph.param3 = 0; + + creq->ctx_handle = nctx->u.ctx_handle; + creq->ctrl.s.ctxl = sizeof(struct flexi_crypto_context); + + ret = alloc_src_sglist(skreq, ivsize); + if (ret) + return ret; + + ret = alloc_dst_sglist(skreq, ivsize); + if (ret) { + free_src_sglist(skreq); + return ret; + } + + /* send the crypto request */ + return nitrox_process_se_request(nctx->ndev, creq, + nitrox_skcipher_callback, skreq); +} + +static int nitrox_aes_encrypt(struct skcipher_request *skreq) +{ + return nitrox_skcipher_crypt(skreq, true); +} + +static int nitrox_aes_decrypt(struct skcipher_request *skreq) +{ + return nitrox_skcipher_crypt(skreq, false); +} + +static int nitrox_3des_setkey(struct crypto_skcipher *cipher, + const u8 *key, unsigned int keylen) +{ + if (keylen != DES3_EDE_KEY_SIZE) { + crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + return nitrox_skcipher_setkey(cipher, 0, key, keylen); +} + +static int nitrox_3des_encrypt(struct skcipher_request *skreq) +{ + return nitrox_skcipher_crypt(skreq, true); +} + +static int nitrox_3des_decrypt(struct skcipher_request *skreq) +{ + return nitrox_skcipher_crypt(skreq, false); +} + +static int nitrox_aes_xts_setkey(struct crypto_skcipher *cipher, + const u8 *key, unsigned int keylen) +{ + struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher); + struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm); + struct flexi_crypto_context *fctx; + int aes_keylen, ret; + + ret = xts_check_key(tfm, key, keylen); + if (ret) + return ret; + + keylen /= 2; + + aes_keylen = flexi_aes_keylen(keylen); + if (aes_keylen < 0) { + crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + fctx = nctx->u.fctx; + /* copy KEY2 */ + memcpy(fctx->auth.u.key2, (key + keylen), keylen); + + return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen); +} + +static int nitrox_aes_ctr_rfc3686_setkey(struct crypto_skcipher *cipher, + const u8 *key, unsigned int keylen) +{ + struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher); + struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm); + struct flexi_crypto_context *fctx; + int aes_keylen; + + if (keylen < CTR_RFC3686_NONCE_SIZE) + return -EINVAL; + + fctx = nctx->u.fctx; + + memcpy(fctx->crypto.iv, key + (keylen - CTR_RFC3686_NONCE_SIZE), + CTR_RFC3686_NONCE_SIZE); + + keylen -= CTR_RFC3686_NONCE_SIZE; + + aes_keylen = flexi_aes_keylen(keylen); + if (aes_keylen < 0) { + crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen); +} + +static struct skcipher_alg nitrox_skciphers[] = { { + .base = { + .cra_name = "cbc(aes)", + .cra_driver_name = "n5_cbc(aes)", + .cra_priority = PRIO, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct nitrox_crypto_ctx), + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = nitrox_aes_setkey, + .encrypt = nitrox_aes_encrypt, + .decrypt = nitrox_aes_decrypt, + .init = nitrox_skcipher_init, + .exit = nitrox_skcipher_exit, +}, { + .base = { + .cra_name = "ecb(aes)", + .cra_driver_name = "n5_ecb(aes)", + .cra_priority = PRIO, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct nitrox_crypto_ctx), + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = nitrox_aes_setkey, + .encrypt = nitrox_aes_encrypt, + .decrypt = nitrox_aes_decrypt, + .init = nitrox_skcipher_init, + .exit = nitrox_skcipher_exit, +}, { + .base = { + .cra_name = "cfb(aes)", + .cra_driver_name = "n5_cfb(aes)", + .cra_priority = PRIO, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct nitrox_crypto_ctx), + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = nitrox_aes_setkey, + .encrypt = nitrox_aes_encrypt, + .decrypt = nitrox_aes_decrypt, + .init = nitrox_skcipher_init, + .exit = nitrox_skcipher_exit, +}, { + .base = { + .cra_name = "xts(aes)", + .cra_driver_name = "n5_xts(aes)", + .cra_priority = PRIO, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct nitrox_crypto_ctx), + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + }, + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = nitrox_aes_xts_setkey, + .encrypt = nitrox_aes_encrypt, + .decrypt = nitrox_aes_decrypt, + .init = nitrox_skcipher_init, + .exit = nitrox_skcipher_exit, +}, { + .base = { + .cra_name = "rfc3686(ctr(aes))", + .cra_driver_name = "n5_rfc3686(ctr(aes))", + .cra_priority = PRIO, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct nitrox_crypto_ctx), + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE + CTR_RFC3686_NONCE_SIZE, + .max_keysize = AES_MAX_KEY_SIZE + CTR_RFC3686_NONCE_SIZE, + .ivsize = CTR_RFC3686_IV_SIZE, + .init = nitrox_skcipher_init, + .exit = nitrox_skcipher_exit, + .setkey = nitrox_aes_ctr_rfc3686_setkey, + .encrypt = nitrox_aes_encrypt, + .decrypt = nitrox_aes_decrypt, +}, { + .base = { + .cra_name = "cts(cbc(aes))", + .cra_driver_name = "n5_cts(cbc(aes))", + .cra_priority = PRIO, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct nitrox_crypto_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = nitrox_aes_setkey, + .encrypt = nitrox_aes_encrypt, + .decrypt = nitrox_aes_decrypt, + .init = nitrox_skcipher_init, + .exit = nitrox_skcipher_exit, +}, { + .base = { + .cra_name = "cbc(des3_ede)", + .cra_driver_name = "n5_cbc(des3_ede)", + .cra_priority = PRIO, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = DES3_EDE_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct nitrox_crypto_ctx), + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + }, + .min_keysize = DES3_EDE_KEY_SIZE, + .max_keysize = DES3_EDE_KEY_SIZE, + .ivsize = DES3_EDE_BLOCK_SIZE, + .setkey = nitrox_3des_setkey, + .encrypt = nitrox_3des_encrypt, + .decrypt = nitrox_3des_decrypt, + .init = nitrox_skcipher_init, + .exit = nitrox_skcipher_exit, +}, { + .base = { + .cra_name = "ecb(des3_ede)", + .cra_driver_name = "n5_ecb(des3_ede)", + .cra_priority = PRIO, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = DES3_EDE_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct nitrox_crypto_ctx), + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + }, + .min_keysize = DES3_EDE_KEY_SIZE, + .max_keysize = DES3_EDE_KEY_SIZE, + .ivsize = DES3_EDE_BLOCK_SIZE, + .setkey = nitrox_3des_setkey, + .encrypt = nitrox_3des_encrypt, + .decrypt = nitrox_3des_decrypt, + .init = nitrox_skcipher_init, + .exit = nitrox_skcipher_exit, +} + +}; + +int nitrox_register_skciphers(void) +{ + return crypto_register_skciphers(nitrox_skciphers, + ARRAY_SIZE(nitrox_skciphers)); +} + +void nitrox_unregister_skciphers(void) +{ + crypto_unregister_skciphers(nitrox_skciphers, + ARRAY_SIZE(nitrox_skciphers)); +} -- cgit v1.2.3-59-g8ed1b From f9c9bdb5131eee60dc3b92e5126d4c0e291703e2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 15 Dec 2018 12:40:17 -0800 Subject: crypto: x86/chacha - avoid sleeping under kernel_fpu_begin() Passing atomic=true to skcipher_walk_virt() only makes the later skcipher_walk_done() calls use atomic memory allocations, not skcipher_walk_virt() itself. Thus, we have to move it outside of the preemption-disabled region (kernel_fpu_begin()/kernel_fpu_end()). (skcipher_walk_virt() only allocates memory for certain layouts of the input scatterlist, hence why I didn't notice this earlier...) Reported-by: syzbot+9bf843c33f782d73ae7d@syzkaller.appspotmail.com Fixes: 4af78261870a ("crypto: x86/chacha20 - add XChaCha20 support") Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha_glue.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 9b1d3fac4943..45c1c4143176 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -127,30 +127,27 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, } } -static int chacha_simd_stream_xor(struct skcipher_request *req, +static int chacha_simd_stream_xor(struct skcipher_walk *walk, struct chacha_ctx *ctx, u8 *iv) { u32 *state, state_buf[16 + 2] __aligned(8); - struct skcipher_walk walk; int next_yield = 4096; /* bytes until next FPU yield */ - int err; + int err = 0; BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); - err = skcipher_walk_virt(&walk, req, true); - crypto_chacha_init(state, ctx, iv); - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; + while (walk->nbytes > 0) { + unsigned int nbytes = walk->nbytes; - if (nbytes < walk.total) { - nbytes = round_down(nbytes, walk.stride); + if (nbytes < walk->total) { + nbytes = round_down(nbytes, walk->stride); next_yield -= nbytes; } - chacha_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, + chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr, nbytes, ctx->nrounds); if (next_yield <= 0) { @@ -160,7 +157,7 @@ static int chacha_simd_stream_xor(struct skcipher_request *req, next_yield = 4096; } - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + err = skcipher_walk_done(walk, walk->nbytes - nbytes); } return err; @@ -170,13 +167,18 @@ static int chacha_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; int err; if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) return crypto_chacha_crypt(req); + err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; + kernel_fpu_begin(); - err = chacha_simd_stream_xor(req, ctx, req->iv); + err = chacha_simd_stream_xor(&walk, ctx, req->iv); kernel_fpu_end(); return err; } @@ -185,6 +187,7 @@ static int xchacha_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; struct chacha_ctx subctx; u32 *state, state_buf[16 + 2] __aligned(8); u8 real_iv[16]; @@ -193,6 +196,10 @@ static int xchacha_simd(struct skcipher_request *req) if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) return crypto_xchacha_crypt(req); + err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; + BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); crypto_chacha_init(state, ctx, req->iv); @@ -204,7 +211,7 @@ static int xchacha_simd(struct skcipher_request *req) memcpy(&real_iv[0], req->iv + 24, 8); memcpy(&real_iv[8], req->iv + 16, 8); - err = chacha_simd_stream_xor(req, &subctx, real_iv); + err = chacha_simd_stream_xor(&walk, &subctx, real_iv); kernel_fpu_end(); -- cgit v1.2.3-59-g8ed1b From bb648291fc04c49197561939b8bfea0ada42bce3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 15 Dec 2018 12:41:53 -0800 Subject: crypto: skcipher - add might_sleep() to skcipher_walk_virt() skcipher_walk_virt() can still sleep even with atomic=true, since that only affects the later calls to skcipher_walk_done(). But, skcipher_walk_virt() only has to allocate memory for some input data layouts, so incorrectly calling it with preemption disabled can go undetected. Use might_sleep() so that it's detected reliably. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/skcipher.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crypto/skcipher.c b/crypto/skcipher.c index 17be8d9c714e..41b4f7f27f45 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -474,6 +474,8 @@ int skcipher_walk_virt(struct skcipher_walk *walk, { int err; + might_sleep_if(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP); + walk->flags &= ~SKCIPHER_WALK_PHYS; err = skcipher_walk_skcipher(walk, req); -- cgit v1.2.3-59-g8ed1b From 101b53d91d57ebcc13cb5fbd437b1230457ba9e2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 15 Dec 2018 12:42:52 -0800 Subject: crypto: salsa20-generic - don't unnecessarily use atomic walk salsa20-generic doesn't use SIMD instructions or otherwise disable preemption, so passing atomic=true to skcipher_walk_virt() is unnecessary. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/salsa20_generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/salsa20_generic.c b/crypto/salsa20_generic.c index 8c77bc78a09f..00fce32ae17a 100644 --- a/crypto/salsa20_generic.c +++ b/crypto/salsa20_generic.c @@ -159,7 +159,7 @@ static int salsa20_crypt(struct skcipher_request *req) u32 state[16]; int err; - err = skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); salsa20_init(state, ctx, walk.iv); -- cgit v1.2.3-59-g8ed1b From dec5d0db0de7271c7616d713f6c434c6366c9bfb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 16 Dec 2018 15:00:30 -0800 Subject: crypto: cavium/nitrox - Fix build with !CONFIG_DEBUG_FS Fixes: cf718eaa8f9b ("crypto: cavium/nitrox - Enabled Mailbox support") Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- drivers/crypto/cavium/nitrox/nitrox_debugfs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/crypto/cavium/nitrox/nitrox_debugfs.h b/drivers/crypto/cavium/nitrox/nitrox_debugfs.h index 7b701ea6227a..a8d85ffa619c 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_debugfs.h +++ b/drivers/crypto/cavium/nitrox/nitrox_debugfs.h @@ -13,9 +13,8 @@ static inline int nitrox_debugfs_init(struct nitrox_device *ndev) return 0; } -static inline int nitrox_sriov_debugfs_init(struct nitrox_device *ndev) +static inline void nitrox_debugfs_exit(struct nitrox_device *ndev) { - return 0; } #endif /* !CONFIG_DEBUG_FS */ -- cgit v1.2.3-59-g8ed1b From c79b411eaa7257204f89c30651c45cea22278769 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 16 Dec 2018 15:55:06 -0800 Subject: crypto: skcipher - remove remnants of internal IV generators Remove dead code related to internal IV generators, which are no longer used since they've been replaced with the "seqiv" and "echainiv" templates. The removed code includes: - The "givcipher" (GIVCIPHER) algorithm type. No algorithms are registered with this type anymore, so it's unneeded. - The "const char *geniv" member of aead_alg, ablkcipher_alg, and blkcipher_alg. A few algorithms still set this, but it isn't used anymore except to show via /proc/crypto and CRYPTO_MSG_GETALG. Just hardcode "" or "" in those cases. - The 'skcipher_givcrypt_request' structure, which is never used. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- Documentation/crypto/api.rst | 9 ----- Documentation/crypto/architecture.rst | 31 +++----------- crypto/ablkcipher.c | 76 +---------------------------------- crypto/blkcipher.c | 6 +-- crypto/cryptd.c | 4 +- crypto/ctr.c | 2 - crypto/skcipher.c | 6 +-- drivers/crypto/bcm/cipher.c | 1 - drivers/crypto/chelsio/chcr_algo.c | 1 - drivers/crypto/ixp4xx_crypto.c | 5 --- drivers/crypto/nx/nx-aes-ctr.c | 1 - drivers/crypto/omap-aes.c | 1 - drivers/crypto/picoxcell_crypto.c | 3 +- drivers/crypto/talitos.c | 1 - include/crypto/aead.h | 3 -- include/crypto/internal/skcipher.h | 2 - include/crypto/skcipher.h | 13 ------ include/linux/crypto.h | 34 ++-------------- 18 files changed, 17 insertions(+), 182 deletions(-) diff --git a/Documentation/crypto/api.rst b/Documentation/crypto/api.rst index 2e519193ab4a..b91b31736df8 100644 --- a/Documentation/crypto/api.rst +++ b/Documentation/crypto/api.rst @@ -1,15 +1,6 @@ Programming Interface ===================== -Please note that the kernel crypto API contains the AEAD givcrypt API -(crypto_aead_giv\* and aead_givcrypt\* function calls in -include/crypto/aead.h). This API is obsolete and will be removed in the -future. To obtain the functionality of an AEAD cipher with internal IV -generation, use the IV generator as a regular cipher. For example, -rfc4106(gcm(aes)) is the AEAD cipher with external IV generation and -seqniv(rfc4106(gcm(aes))) implies that the kernel crypto API generates -the IV. Different IV generators are available. - .. class:: toc-title Table of contents diff --git a/Documentation/crypto/architecture.rst b/Documentation/crypto/architecture.rst index ca2d09b991f5..ee8ff0762d7f 100644 --- a/Documentation/crypto/architecture.rst +++ b/Documentation/crypto/architecture.rst @@ -157,10 +157,6 @@ applicable to a cipher, it is not displayed: - rng for random number generator - - givcipher for cipher with associated IV generator (see the geniv - entry below for the specification of the IV generator type used by - the cipher implementation) - - kpp for a Key-agreement Protocol Primitive (KPP) cipher such as an ECDH or DH implementation @@ -174,16 +170,7 @@ applicable to a cipher, it is not displayed: - digestsize: output size of the message digest -- geniv: IV generation type: - - - eseqiv for encrypted sequence number based IV generation - - - seqiv for sequence number based IV generation - - - chainiv for chain iv generation - - - is a marker that the cipher implements IV generation and - handling as it is specific to the given cipher +- geniv: IV generator (obsolete) Key Sizes --------- @@ -218,10 +205,6 @@ the aforementioned cipher types: - CRYPTO_ALG_TYPE_ABLKCIPHER Asynchronous multi-block cipher -- CRYPTO_ALG_TYPE_GIVCIPHER Asynchronous multi-block cipher packed - together with an IV generator (see geniv field in the /proc/crypto - listing for the known IV generators) - - CRYPTO_ALG_TYPE_KPP Key-agreement Protocol Primitive (KPP) such as an ECDH or DH implementation @@ -338,18 +321,14 @@ uses the API applicable to the cipher type specified for the block. The following call sequence is applicable when the IPSEC layer triggers an encryption operation with the esp_output function. During -configuration, the administrator set up the use of rfc4106(gcm(aes)) as -the cipher for ESP. The following call sequence is now depicted in the -ASCII art above: +configuration, the administrator set up the use of seqiv(rfc4106(gcm(aes))) +as the cipher for ESP. The following call sequence is now depicted in +the ASCII art above: 1. esp_output() invokes crypto_aead_encrypt() to trigger an encryption operation of the AEAD cipher with IV generator. - In case of GCM, the SEQIV implementation is registered as GIVCIPHER - in crypto_rfc4106_alloc(). - - The SEQIV performs its operation to generate an IV where the core - function is seqiv_geniv(). + The SEQIV generates the IV. 2. Now, SEQIV uses the AEAD API function calls to invoke the associated AEAD cipher. In our case, during the instantiation of SEQIV, the diff --git a/crypto/ablkcipher.c b/crypto/ablkcipher.c index b5e9ce19d324..b339587073c3 100644 --- a/crypto/ablkcipher.c +++ b/crypto/ablkcipher.c @@ -368,8 +368,7 @@ static int crypto_ablkcipher_report(struct sk_buff *skb, struct crypto_alg *alg) memset(&rblkcipher, 0, sizeof(rblkcipher)); strscpy(rblkcipher.type, "ablkcipher", sizeof(rblkcipher.type)); - strscpy(rblkcipher.geniv, alg->cra_ablkcipher.geniv ?: "", - sizeof(rblkcipher.geniv)); + strscpy(rblkcipher.geniv, "", sizeof(rblkcipher.geniv)); rblkcipher.blocksize = alg->cra_blocksize; rblkcipher.min_keysize = alg->cra_ablkcipher.min_keysize; @@ -399,7 +398,7 @@ static void crypto_ablkcipher_show(struct seq_file *m, struct crypto_alg *alg) seq_printf(m, "min keysize : %u\n", ablkcipher->min_keysize); seq_printf(m, "max keysize : %u\n", ablkcipher->max_keysize); seq_printf(m, "ivsize : %u\n", ablkcipher->ivsize); - seq_printf(m, "geniv : %s\n", ablkcipher->geniv ?: ""); + seq_printf(m, "geniv : \n"); } const struct crypto_type crypto_ablkcipher_type = { @@ -411,74 +410,3 @@ const struct crypto_type crypto_ablkcipher_type = { .report = crypto_ablkcipher_report, }; EXPORT_SYMBOL_GPL(crypto_ablkcipher_type); - -static int crypto_init_givcipher_ops(struct crypto_tfm *tfm, u32 type, - u32 mask) -{ - struct ablkcipher_alg *alg = &tfm->__crt_alg->cra_ablkcipher; - struct ablkcipher_tfm *crt = &tfm->crt_ablkcipher; - - if (alg->ivsize > PAGE_SIZE / 8) - return -EINVAL; - - crt->setkey = tfm->__crt_alg->cra_flags & CRYPTO_ALG_GENIV ? - alg->setkey : setkey; - crt->encrypt = alg->encrypt; - crt->decrypt = alg->decrypt; - crt->base = __crypto_ablkcipher_cast(tfm); - crt->ivsize = alg->ivsize; - - return 0; -} - -#ifdef CONFIG_NET -static int crypto_givcipher_report(struct sk_buff *skb, struct crypto_alg *alg) -{ - struct crypto_report_blkcipher rblkcipher; - - memset(&rblkcipher, 0, sizeof(rblkcipher)); - - strscpy(rblkcipher.type, "givcipher", sizeof(rblkcipher.type)); - strscpy(rblkcipher.geniv, alg->cra_ablkcipher.geniv ?: "", - sizeof(rblkcipher.geniv)); - - rblkcipher.blocksize = alg->cra_blocksize; - rblkcipher.min_keysize = alg->cra_ablkcipher.min_keysize; - rblkcipher.max_keysize = alg->cra_ablkcipher.max_keysize; - rblkcipher.ivsize = alg->cra_ablkcipher.ivsize; - - return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, - sizeof(rblkcipher), &rblkcipher); -} -#else -static int crypto_givcipher_report(struct sk_buff *skb, struct crypto_alg *alg) -{ - return -ENOSYS; -} -#endif - -static void crypto_givcipher_show(struct seq_file *m, struct crypto_alg *alg) - __maybe_unused; -static void crypto_givcipher_show(struct seq_file *m, struct crypto_alg *alg) -{ - struct ablkcipher_alg *ablkcipher = &alg->cra_ablkcipher; - - seq_printf(m, "type : givcipher\n"); - seq_printf(m, "async : %s\n", alg->cra_flags & CRYPTO_ALG_ASYNC ? - "yes" : "no"); - seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); - seq_printf(m, "min keysize : %u\n", ablkcipher->min_keysize); - seq_printf(m, "max keysize : %u\n", ablkcipher->max_keysize); - seq_printf(m, "ivsize : %u\n", ablkcipher->ivsize); - seq_printf(m, "geniv : %s\n", ablkcipher->geniv ?: ""); -} - -const struct crypto_type crypto_givcipher_type = { - .ctxsize = crypto_ablkcipher_ctxsize, - .init = crypto_init_givcipher_ops, -#ifdef CONFIG_PROC_FS - .show = crypto_givcipher_show, -#endif - .report = crypto_givcipher_report, -}; -EXPORT_SYMBOL_GPL(crypto_givcipher_type); diff --git a/crypto/blkcipher.c b/crypto/blkcipher.c index 193237514e90..c5398bd54942 100644 --- a/crypto/blkcipher.c +++ b/crypto/blkcipher.c @@ -510,8 +510,7 @@ static int crypto_blkcipher_report(struct sk_buff *skb, struct crypto_alg *alg) memset(&rblkcipher, 0, sizeof(rblkcipher)); strscpy(rblkcipher.type, "blkcipher", sizeof(rblkcipher.type)); - strscpy(rblkcipher.geniv, alg->cra_blkcipher.geniv ?: "", - sizeof(rblkcipher.geniv)); + strscpy(rblkcipher.geniv, "", sizeof(rblkcipher.geniv)); rblkcipher.blocksize = alg->cra_blocksize; rblkcipher.min_keysize = alg->cra_blkcipher.min_keysize; @@ -537,8 +536,7 @@ static void crypto_blkcipher_show(struct seq_file *m, struct crypto_alg *alg) seq_printf(m, "min keysize : %u\n", alg->cra_blkcipher.min_keysize); seq_printf(m, "max keysize : %u\n", alg->cra_blkcipher.max_keysize); seq_printf(m, "ivsize : %u\n", alg->cra_blkcipher.ivsize); - seq_printf(m, "geniv : %s\n", alg->cra_blkcipher.geniv ?: - ""); + seq_printf(m, "geniv : \n"); } const struct crypto_type crypto_blkcipher_type = { diff --git a/crypto/cryptd.c b/crypto/cryptd.c index 7118fb5efbaa..5640e5db7bdb 100644 --- a/crypto/cryptd.c +++ b/crypto/cryptd.c @@ -422,8 +422,6 @@ static int cryptd_create_blkcipher(struct crypto_template *tmpl, inst->alg.cra_ablkcipher.min_keysize = alg->cra_blkcipher.min_keysize; inst->alg.cra_ablkcipher.max_keysize = alg->cra_blkcipher.max_keysize; - inst->alg.cra_ablkcipher.geniv = alg->cra_blkcipher.geniv; - inst->alg.cra_ctxsize = sizeof(struct cryptd_blkcipher_ctx); inst->alg.cra_init = cryptd_blkcipher_init_tfm; @@ -1174,7 +1172,7 @@ struct cryptd_ablkcipher *cryptd_alloc_ablkcipher(const char *alg_name, return ERR_PTR(-EINVAL); type = crypto_skcipher_type(type); mask &= ~CRYPTO_ALG_TYPE_MASK; - mask |= (CRYPTO_ALG_GENIV | CRYPTO_ALG_TYPE_BLKCIPHER_MASK); + mask |= CRYPTO_ALG_TYPE_BLKCIPHER_MASK; tfm = crypto_alloc_base(cryptd_alg_name, type, mask); if (IS_ERR(tfm)) return ERR_CAST(tfm); diff --git a/crypto/ctr.c b/crypto/ctr.c index 435b75bd619e..30f3946efc6d 100644 --- a/crypto/ctr.c +++ b/crypto/ctr.c @@ -233,8 +233,6 @@ static struct crypto_instance *crypto_ctr_alloc(struct rtattr **tb) inst->alg.cra_blkcipher.encrypt = crypto_ctr_crypt; inst->alg.cra_blkcipher.decrypt = crypto_ctr_crypt; - inst->alg.cra_blkcipher.geniv = "chainiv"; - out: crypto_mod_put(alg); return inst; diff --git a/crypto/skcipher.c b/crypto/skcipher.c index 41b4f7f27f45..2a969296bc24 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -579,8 +579,7 @@ static unsigned int crypto_skcipher_extsize(struct crypto_alg *alg) if (alg->cra_type == &crypto_blkcipher_type) return sizeof(struct crypto_blkcipher *); - if (alg->cra_type == &crypto_ablkcipher_type || - alg->cra_type == &crypto_givcipher_type) + if (alg->cra_type == &crypto_ablkcipher_type) return sizeof(struct crypto_ablkcipher *); return crypto_alg_extsize(alg); @@ -844,8 +843,7 @@ static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm) if (tfm->__crt_alg->cra_type == &crypto_blkcipher_type) return crypto_init_skcipher_ops_blkcipher(tfm); - if (tfm->__crt_alg->cra_type == &crypto_ablkcipher_type || - tfm->__crt_alg->cra_type == &crypto_givcipher_type) + if (tfm->__crt_alg->cra_type == &crypto_ablkcipher_type) return crypto_init_skcipher_ops_ablkcipher(tfm); skcipher->setkey = skcipher_setkey; diff --git a/drivers/crypto/bcm/cipher.c b/drivers/crypto/bcm/cipher.c index 2ce3a16d3d10..c9393ffb70ed 100644 --- a/drivers/crypto/bcm/cipher.c +++ b/drivers/crypto/bcm/cipher.c @@ -3868,7 +3868,6 @@ static struct iproc_alg_s driver_algs[] = { .cra_driver_name = "ctr-aes-iproc", .cra_blocksize = AES_BLOCK_SIZE, .cra_ablkcipher = { - /* .geniv = "chainiv", */ .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index eedc33128da4..bcef76508dfa 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -3816,7 +3816,6 @@ static struct chcr_alg_template driver_algs[] = { .setkey = chcr_aes_rfc3686_setkey, .encrypt = chcr_aes_encrypt, .decrypt = chcr_aes_decrypt, - .geniv = "seqiv", } } }, diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c index 27f7dad2d45d..19fba998b86b 100644 --- a/drivers/crypto/ixp4xx_crypto.c +++ b/drivers/crypto/ixp4xx_crypto.c @@ -1194,7 +1194,6 @@ static struct ixp_alg ixp4xx_algos[] = { .min_keysize = DES_KEY_SIZE, .max_keysize = DES_KEY_SIZE, .ivsize = DES_BLOCK_SIZE, - .geniv = "eseqiv", } } }, @@ -1221,7 +1220,6 @@ static struct ixp_alg ixp4xx_algos[] = { .min_keysize = DES3_EDE_KEY_SIZE, .max_keysize = DES3_EDE_KEY_SIZE, .ivsize = DES3_EDE_BLOCK_SIZE, - .geniv = "eseqiv", } } }, @@ -1247,7 +1245,6 @@ static struct ixp_alg ixp4xx_algos[] = { .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, - .geniv = "eseqiv", } } }, @@ -1273,7 +1270,6 @@ static struct ixp_alg ixp4xx_algos[] = { .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, - .geniv = "eseqiv", } } }, @@ -1287,7 +1283,6 @@ static struct ixp_alg ixp4xx_algos[] = { .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, - .geniv = "eseqiv", .setkey = ablk_rfc3686_setkey, .encrypt = ablk_rfc3686_crypt, .decrypt = ablk_rfc3686_crypt } diff --git a/drivers/crypto/nx/nx-aes-ctr.c b/drivers/crypto/nx/nx-aes-ctr.c index 898c0a280511..5a26fcd75d2d 100644 --- a/drivers/crypto/nx/nx-aes-ctr.c +++ b/drivers/crypto/nx/nx-aes-ctr.c @@ -159,7 +159,6 @@ struct crypto_alg nx_ctr3686_aes_alg = { .min_keysize = AES_MIN_KEY_SIZE + CTR_RFC3686_NONCE_SIZE, .max_keysize = AES_MAX_KEY_SIZE + CTR_RFC3686_NONCE_SIZE, .ivsize = CTR_RFC3686_IV_SIZE, - .geniv = "seqiv", .setkey = ctr3686_aes_nx_set_key, .encrypt = ctr3686_aes_nx_crypt, .decrypt = ctr3686_aes_nx_crypt, diff --git a/drivers/crypto/omap-aes.c b/drivers/crypto/omap-aes.c index 4c0ea8142923..0120feb2d746 100644 --- a/drivers/crypto/omap-aes.c +++ b/drivers/crypto/omap-aes.c @@ -749,7 +749,6 @@ static struct crypto_alg algs_ctr[] = { .cra_u.ablkcipher = { .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, - .geniv = "eseqiv", .ivsize = AES_BLOCK_SIZE, .setkey = omap_aes_setkey, .encrypt = omap_aes_ctr_encrypt, diff --git a/drivers/crypto/picoxcell_crypto.c b/drivers/crypto/picoxcell_crypto.c index a28f1d18fe01..17068b55fea5 100644 --- a/drivers/crypto/picoxcell_crypto.c +++ b/drivers/crypto/picoxcell_crypto.c @@ -1585,8 +1585,7 @@ static struct spacc_alg l2_engine_algs[] = { .cra_name = "f8(kasumi)", .cra_driver_name = "f8-kasumi-picoxcell", .cra_priority = SPACC_CRYPTO_ALG_PRIORITY, - .cra_flags = CRYPTO_ALG_TYPE_GIVCIPHER | - CRYPTO_ALG_ASYNC | + .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .cra_blocksize = 8, .cra_ctxsize = sizeof(struct spacc_ablk_ctx), diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index 6988012deca4..45e20707cef8 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -3155,7 +3155,6 @@ static struct talitos_crypto_alg *talitos_alg_alloc(struct device *dev, alg->cra_ablkcipher.setkey = ablkcipher_setkey; alg->cra_ablkcipher.encrypt = ablkcipher_encrypt; alg->cra_ablkcipher.decrypt = ablkcipher_decrypt; - alg->cra_ablkcipher.geniv = "eseqiv"; break; case CRYPTO_ALG_TYPE_AEAD: alg = &t_alg->algt.alg.aead.base; diff --git a/include/crypto/aead.h b/include/crypto/aead.h index b7b8d24cf765..9ad595f97c65 100644 --- a/include/crypto/aead.h +++ b/include/crypto/aead.h @@ -115,7 +115,6 @@ struct aead_request { * @setkey: see struct skcipher_alg * @encrypt: see struct skcipher_alg * @decrypt: see struct skcipher_alg - * @geniv: see struct skcipher_alg * @ivsize: see struct skcipher_alg * @chunksize: see struct skcipher_alg * @init: Initialize the cryptographic transformation object. This function @@ -142,8 +141,6 @@ struct aead_alg { int (*init)(struct crypto_aead *tfm); void (*exit)(struct crypto_aead *tfm); - const char *geniv; - unsigned int ivsize; unsigned int maxauthsize; unsigned int chunksize; diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h index e42f7063f245..453e867b4bd9 100644 --- a/include/crypto/internal/skcipher.h +++ b/include/crypto/internal/skcipher.h @@ -70,8 +70,6 @@ struct skcipher_walk { unsigned int alignmask; }; -extern const struct crypto_type crypto_givcipher_type; - static inline struct crypto_instance *skcipher_crypto_instance( struct skcipher_instance *inst) { diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h index 480f8301a47d..e555294ed77f 100644 --- a/include/crypto/skcipher.h +++ b/include/crypto/skcipher.h @@ -39,19 +39,6 @@ struct skcipher_request { void *__ctx[] CRYPTO_MINALIGN_ATTR; }; -/** - * struct skcipher_givcrypt_request - Crypto request with IV generation - * @seq: Sequence number for IV generation - * @giv: Space for generated IV - * @creq: The crypto request itself - */ -struct skcipher_givcrypt_request { - u64 seq; - u8 *giv; - - struct ablkcipher_request creq; -}; - struct crypto_skcipher { int (*setkey)(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen); diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 81e178fb9ed8..902ec171fc6d 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -49,7 +49,6 @@ #define CRYPTO_ALG_TYPE_BLKCIPHER 0x00000004 #define CRYPTO_ALG_TYPE_ABLKCIPHER 0x00000005 #define CRYPTO_ALG_TYPE_SKCIPHER 0x00000005 -#define CRYPTO_ALG_TYPE_GIVCIPHER 0x00000006 #define CRYPTO_ALG_TYPE_KPP 0x00000008 #define CRYPTO_ALG_TYPE_ACOMPRESS 0x0000000a #define CRYPTO_ALG_TYPE_SCOMPRESS 0x0000000b @@ -76,12 +75,6 @@ */ #define CRYPTO_ALG_NEED_FALLBACK 0x00000100 -/* - * This bit is set for symmetric key ciphers that have already been wrapped - * with a generic IV generator to prevent them from being wrapped again. - */ -#define CRYPTO_ALG_GENIV 0x00000200 - /* * Set if the algorithm has passed automated run-time testing. Note that * if there is no run-time testing for a given algorithm it is considered @@ -157,7 +150,6 @@ struct crypto_async_request; struct crypto_blkcipher; struct crypto_tfm; struct crypto_type; -struct skcipher_givcrypt_request; typedef void (*crypto_completion_t)(struct crypto_async_request *req, int err); @@ -246,31 +238,16 @@ struct cipher_desc { * be called in parallel with the same transformation object. * @decrypt: Decrypt a single block. This is a reverse counterpart to @encrypt * and the conditions are exactly the same. - * @givencrypt: Update the IV for encryption. With this function, a cipher - * implementation may provide the function on how to update the IV - * for encryption. - * @givdecrypt: Update the IV for decryption. This is the reverse of - * @givencrypt . - * @geniv: The transformation implementation may use an "IV generator" provided - * by the kernel crypto API. Several use cases have a predefined - * approach how IVs are to be updated. For such use cases, the kernel - * crypto API provides ready-to-use implementations that can be - * referenced with this variable. * @ivsize: IV size applicable for transformation. The consumer must provide an * IV of exactly that size to perform the encrypt or decrypt operation. * - * All fields except @givencrypt , @givdecrypt , @geniv and @ivsize are - * mandatory and must be filled. + * All fields except @ivsize are mandatory and must be filled. */ struct ablkcipher_alg { int (*setkey)(struct crypto_ablkcipher *tfm, const u8 *key, unsigned int keylen); int (*encrypt)(struct ablkcipher_request *req); int (*decrypt)(struct ablkcipher_request *req); - int (*givencrypt)(struct skcipher_givcrypt_request *req); - int (*givdecrypt)(struct skcipher_givcrypt_request *req); - - const char *geniv; unsigned int min_keysize; unsigned int max_keysize; @@ -284,10 +261,9 @@ struct ablkcipher_alg { * @setkey: see struct ablkcipher_alg * @encrypt: see struct ablkcipher_alg * @decrypt: see struct ablkcipher_alg - * @geniv: see struct ablkcipher_alg * @ivsize: see struct ablkcipher_alg * - * All fields except @geniv and @ivsize are mandatory and must be filled. + * All fields except @ivsize are mandatory and must be filled. */ struct blkcipher_alg { int (*setkey)(struct crypto_tfm *tfm, const u8 *key, @@ -299,8 +275,6 @@ struct blkcipher_alg { struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes); - const char *geniv; - unsigned int min_keysize; unsigned int max_keysize; unsigned int ivsize; @@ -931,14 +905,14 @@ static inline struct crypto_ablkcipher *__crypto_ablkcipher_cast( static inline u32 crypto_skcipher_type(u32 type) { - type &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV); + type &= ~CRYPTO_ALG_TYPE_MASK; type |= CRYPTO_ALG_TYPE_BLKCIPHER; return type; } static inline u32 crypto_skcipher_mask(u32 mask) { - mask &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV); + mask &= ~CRYPTO_ALG_TYPE_MASK; mask |= CRYPTO_ALG_TYPE_BLKCIPHER_MASK; return mask; } -- cgit v1.2.3-59-g8ed1b