From 986dfbcf8b493928188f0e634068993bf2067ad7 Mon Sep 17 00:00:00 2001 From: Ruchika Gupta Date: Fri, 26 Apr 2013 15:44:54 +0530 Subject: crypto: caam - FIX RNG init for RNG greater than equal to 4 For SEC including a RNG block version >= 4, special initialization must occur before any descriptor that uses RNG block can be submitted. This initialization is required not only for SEC with version greater than 5.0, but for SEC with RNG version >=4. There may be a case where RNG has already been instantiated by u-boot or boot ROM code.In such SoCs, if RNG is initialized again SEC would returns "Instantiation error". Hence, the initialization status of RNG4 should be also checked before doing RNG init. Signed-off-by: Ruchika Gupta Signed-off-by: Alex Porosanu Signed-off-by: Andy Fleming Reviewed-by: Vakul Garg Signed-off-by: Herbert Xu --- drivers/crypto/caam/ctrl.c | 10 +++++++--- drivers/crypto/caam/regs.h | 42 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c index 6e94bcd94678..f5d6deced1cb 100644 --- a/drivers/crypto/caam/ctrl.c +++ b/drivers/crypto/caam/ctrl.c @@ -202,6 +202,7 @@ static int caam_probe(struct platform_device *pdev) #ifdef CONFIG_DEBUG_FS struct caam_perfmon *perfmon; #endif + u64 cha_vid; ctrlpriv = kzalloc(sizeof(struct caam_drv_private), GFP_KERNEL); if (!ctrlpriv) @@ -293,11 +294,14 @@ static int caam_probe(struct platform_device *pdev) return -ENOMEM; } + cha_vid = rd_reg64(&topregs->ctrl.perfmon.cha_id); + /* - * RNG4 based SECs (v5+) need special initialization prior - * to executing any descriptors + * If SEC has RNG version >= 4 and RNG state handle has not been + * already instantiated ,do RNG instantiation */ - if (of_device_is_compatible(nprop, "fsl,sec-v5.0")) { + if ((cha_vid & CHA_ID_RNG_MASK) >> CHA_ID_RNG_SHIFT >= 4 && + !(rd_reg32(&topregs->ctrl.r4tst[0].rdsta) & RDSTA_IF0)) { kick_trng(pdev); ret = instantiate_rng(ctrlpriv->jrdev[0]); if (ret) { diff --git a/drivers/crypto/caam/regs.h b/drivers/crypto/caam/regs.h index cd6fedad9935..c09142fc13e3 100644 --- a/drivers/crypto/caam/regs.h +++ b/drivers/crypto/caam/regs.h @@ -117,6 +117,43 @@ struct jr_outentry { #define CHA_NUM_DECONUM_SHIFT 56 #define CHA_NUM_DECONUM_MASK (0xfull << CHA_NUM_DECONUM_SHIFT) +/* CHA Version IDs */ +#define CHA_ID_AES_SHIFT 0 +#define CHA_ID_AES_MASK (0xfull << CHA_ID_AES_SHIFT) + +#define CHA_ID_DES_SHIFT 4 +#define CHA_ID_DES_MASK (0xfull << CHA_ID_DES_SHIFT) + +#define CHA_ID_ARC4_SHIFT 8 +#define CHA_ID_ARC4_MASK (0xfull << CHA_ID_ARC4_SHIFT) + +#define CHA_ID_MD_SHIFT 12 +#define CHA_ID_MD_MASK (0xfull << CHA_ID_MD_SHIFT) + +#define CHA_ID_RNG_SHIFT 16 +#define CHA_ID_RNG_MASK (0xfull << CHA_ID_RNG_SHIFT) + +#define CHA_ID_SNW8_SHIFT 20 +#define CHA_ID_SNW8_MASK (0xfull << CHA_ID_SNW8_SHIFT) + +#define CHA_ID_KAS_SHIFT 24 +#define CHA_ID_KAS_MASK (0xfull << CHA_ID_KAS_SHIFT) + +#define CHA_ID_PK_SHIFT 28 +#define CHA_ID_PK_MASK (0xfull << CHA_ID_PK_SHIFT) + +#define CHA_ID_CRC_SHIFT 32 +#define CHA_ID_CRC_MASK (0xfull << CHA_ID_CRC_SHIFT) + +#define CHA_ID_SNW9_SHIFT 36 +#define CHA_ID_SNW9_MASK (0xfull << CHA_ID_SNW9_SHIFT) + +#define CHA_ID_DECO_SHIFT 56 +#define CHA_ID_DECO_MASK (0xfull << CHA_ID_DECO_SHIFT) + +#define CHA_ID_JR_SHIFT 60 +#define CHA_ID_JR_MASK (0xfull << CHA_ID_JR_SHIFT) + struct sec_vid { u16 ip_id; u8 maj_rev; @@ -228,7 +265,10 @@ struct rng4tst { u32 rtfrqmax; /* PRGM=1: freq. count max. limit register */ u32 rtfrqcnt; /* PRGM=0: freq. count register */ }; - u32 rsvd1[56]; + u32 rsvd1[40]; +#define RDSTA_IF0 0x00000001 + u32 rdsta; + u32 rsvd2[15]; }; /* -- cgit v1.2.3-59-g8ed1b From 2d31e518a42828df7877bca23a958627d60408bc Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 May 2013 12:52:48 -0700 Subject: crypto: crct10dif - Wrap crc_t10dif function all to use crypto transform framework When CRC T10 DIF is calculated using the crypto transform framework, we wrap the crc_t10dif function call to utilize it. This allows us to take advantage of any accelerated CRC T10 DIF transform that is plugged into the crypto framework. Signed-off-by: Tim Chen Signed-off-by: Herbert Xu --- crypto/Kconfig | 8 ++ crypto/Makefile | 1 + crypto/crct10dif.c | 178 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/crc-t10dif.h | 4 + lib/Kconfig | 2 + lib/crc-t10dif.c | 75 ++++++++----------- 6 files changed, 225 insertions(+), 43 deletions(-) create mode 100644 crypto/crct10dif.c diff --git a/crypto/Kconfig b/crypto/Kconfig index 622d8a48cbe9..ceb3611efe44 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -376,6 +376,14 @@ config CRYPTO_CRC32_PCLMUL which will enable any routine to use the CRC-32-IEEE 802.3 checksum and gain better performance as compared with the table implementation. +config CRYPTO_CRCT10DIF + tristate "CRCT10DIF algorithm" + select CRYPTO_HASH + help + CRC T10 Data Integrity Field computation is being cast as + a crypto transform. This allows for faster crc t10 diff + transforms to be used if they are available. + config CRYPTO_GHASH tristate "GHASH digest algorithm" select CRYPTO_GF128MUL diff --git a/crypto/Makefile b/crypto/Makefile index a8e9b0fefbe9..62af87df8729 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_CRYPTO_ZLIB) += zlib.o obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o obj-$(CONFIG_CRYPTO_CRC32) += crc32.o +obj-$(CONFIG_CRYPTO_CRCT10DIF) += crct10dif.o obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o obj-$(CONFIG_CRYPTO_LZO) += lzo.o obj-$(CONFIG_CRYPTO_842) += 842.o diff --git a/crypto/crct10dif.c b/crypto/crct10dif.c new file mode 100644 index 000000000000..92aca96d6b98 --- /dev/null +++ b/crypto/crct10dif.c @@ -0,0 +1,178 @@ +/* + * Cryptographic API. + * + * T10 Data Integrity Field CRC16 Crypto Transform + * + * Copyright (c) 2007 Oracle Corporation. All rights reserved. + * Written by Martin K. Petersen + * Copyright (C) 2013 Intel Corporation + * Author: Tim Chen + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +struct chksum_desc_ctx { + __u16 crc; +}; + +/* Table generated using the following polynomium: + * x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1 + * gt: 0x8bb7 + */ +static const __u16 t10_dif_crc_table[256] = { + 0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B, + 0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6, + 0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6, + 0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B, + 0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1, + 0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C, + 0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C, + 0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781, + 0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8, + 0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255, + 0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925, + 0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698, + 0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472, + 0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF, + 0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF, + 0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02, + 0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA, + 0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067, + 0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17, + 0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA, + 0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640, + 0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD, + 0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D, + 0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30, + 0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759, + 0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4, + 0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394, + 0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29, + 0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3, + 0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E, + 0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E, + 0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3 +}; + +__u16 crc_t10dif_generic(__u16 crc, const unsigned char *buffer, size_t len) +{ + unsigned int i; + + for (i = 0 ; i < len ; i++) + crc = (crc << 8) ^ t10_dif_crc_table[((crc >> 8) ^ buffer[i]) & 0xff]; + + return crc; +} +EXPORT_SYMBOL(crc_t10dif_generic); + +/* + * Steps through buffer one byte at at time, calculates reflected + * crc using table. + */ + +static int chksum_init(struct shash_desc *desc) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = 0; + + return 0; +} + +static int chksum_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = crc_t10dif_generic(ctx->crc, data, length); + return 0; +} + +static int chksum_final(struct shash_desc *desc, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + *(__u16 *)out = ctx->crc; + return 0; +} + +static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + *(__u16 *)out = crc_t10dif_generic(*crcp, data, len); + return 0; +} + +static int chksum_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, len, out); +} + +static int chksum_digest(struct shash_desc *desc, const u8 *data, + unsigned int length, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, length, out); +} + +static struct shash_alg alg = { + .digestsize = CRC_T10DIF_DIGEST_SIZE, + .init = chksum_init, + .update = chksum_update, + .final = chksum_final, + .finup = chksum_finup, + .digest = chksum_digest, + .descsize = sizeof(struct chksum_desc_ctx), + .base = { + .cra_name = "crct10dif", + .cra_driver_name = "crct10dif-generic", + .cra_priority = 100, + .cra_blocksize = CRC_T10DIF_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int __init crct10dif_mod_init(void) +{ + int ret; + + ret = crypto_register_shash(&alg); + return ret; +} + +static void __exit crct10dif_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(crct10dif_mod_init); +module_exit(crct10dif_mod_fini); + +MODULE_AUTHOR("Tim Chen "); +MODULE_DESCRIPTION("T10 DIF CRC calculation."); +MODULE_LICENSE("GPL"); diff --git a/include/linux/crc-t10dif.h b/include/linux/crc-t10dif.h index a9c96d865ee7..b3cb71f0d3b0 100644 --- a/include/linux/crc-t10dif.h +++ b/include/linux/crc-t10dif.h @@ -3,6 +3,10 @@ #include +#define CRC_T10DIF_DIGEST_SIZE 2 +#define CRC_T10DIF_BLOCK_SIZE 1 + +__u16 crc_t10dif_generic(__u16 crc, const unsigned char *buffer, size_t len); __u16 crc_t10dif(unsigned char const *, size_t); #endif diff --git a/lib/Kconfig b/lib/Kconfig index fe01d418b09a..339d5a4292f8 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -63,6 +63,8 @@ config CRC16 config CRC_T10DIF tristate "CRC calculation for the T10 Data Integrity Field" + select CRYPTO + select CRYPTO_CRCT10DIF help This option is only needed if a module that's not in the kernel tree needs to calculate CRC checks for use with the diff --git a/lib/crc-t10dif.c b/lib/crc-t10dif.c index fbbd66ed86cd..0cb64634f513 100644 --- a/lib/crc-t10dif.c +++ b/lib/crc-t10dif.c @@ -11,57 +11,46 @@ #include #include #include +#include +#include +#include -/* Table generated using the following polynomium: - * x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1 - * gt: 0x8bb7 - */ -static const __u16 t10_dif_crc_table[256] = { - 0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B, - 0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6, - 0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6, - 0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B, - 0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1, - 0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C, - 0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C, - 0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781, - 0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8, - 0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255, - 0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925, - 0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698, - 0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472, - 0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF, - 0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF, - 0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02, - 0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA, - 0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067, - 0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17, - 0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA, - 0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640, - 0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD, - 0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D, - 0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30, - 0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759, - 0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4, - 0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394, - 0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29, - 0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3, - 0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E, - 0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E, - 0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3 -}; +static struct crypto_shash *crct10dif_tfm; __u16 crc_t10dif(const unsigned char *buffer, size_t len) { - __u16 crc = 0; - unsigned int i; + struct { + struct shash_desc shash; + char ctx[2]; + } desc; + int err; + + desc.shash.tfm = crct10dif_tfm; + desc.shash.flags = 0; + *(__u16 *)desc.ctx = 0; - for (i = 0 ; i < len ; i++) - crc = (crc << 8) ^ t10_dif_crc_table[((crc >> 8) ^ buffer[i]) & 0xff]; + err = crypto_shash_update(&desc.shash, buffer, len); + BUG_ON(err); - return crc; + return *(__u16 *)desc.ctx; } EXPORT_SYMBOL(crc_t10dif); +static int __init crc_t10dif_mod_init(void) +{ + crct10dif_tfm = crypto_alloc_shash("crct10dif", 0, 0); + if (IS_ERR(crct10dif_tfm)) + return PTR_ERR(crct10dif_tfm); + return 0; +} + +static void __exit crc_t10dif_mod_fini(void) +{ + crypto_free_shash(crct10dif_tfm); +} + +module_init(crc_t10dif_mod_init); +module_exit(crc_t10dif_mod_fini); + MODULE_DESCRIPTION("T10 DIF CRC calculation"); MODULE_LICENSE("GPL"); -- cgit v1.2.3-59-g8ed1b From 31d939625a9a20b1badd2d4e6bf6fd39fa523405 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 May 2013 12:52:49 -0700 Subject: crypto: crct10dif - Accelerated CRC T10 DIF computation with PCLMULQDQ instruction This is the x86_64 CRC T10 DIF transform accelerated with the PCLMULQDQ instructions. Details discussing the implementation can be found in the paper: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf Signed-off-by: Tim Chen Signed-off-by: Herbert Xu --- arch/x86/crypto/crct10dif-pcl-asm_64.S | 643 +++++++++++++++++++++++++++++++++ 1 file changed, 643 insertions(+) create mode 100644 arch/x86/crypto/crct10dif-pcl-asm_64.S diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S new file mode 100644 index 000000000000..35e97569d05f --- /dev/null +++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S @@ -0,0 +1,643 @@ +######################################################################## +# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions +# +# Copyright (c) 2013, Intel Corporation +# +# Authors: +# Erdinc Ozturk +# Vinodh Gopal +# James Guilford +# Tim Chen +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## +# Function API: +# UINT16 crc_t10dif_pcl( +# UINT16 init_crc, //initial CRC value, 16 bits +# const unsigned char *buf, //buffer pointer to calculate CRC on +# UINT64 len //buffer length in bytes (64-bit data) +# ); +# +# Reference paper titled "Fast CRC Computation for Generic +# Polynomials Using PCLMULQDQ Instruction" +# URL: http://www.intel.com/content/dam/www/public/us/en/documents +# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +# +# + +#include + +.text + +#define arg1 %rdi +#define arg2 %rsi +#define arg3 %rdx + +#define arg1_low32 %edi + +ENTRY(crc_t10dif_pcl) +.align 16 + + # adjust the 16-bit initial_crc value, scale it to 32 bits + shl $16, arg1_low32 + + # Allocate Stack Space + mov %rsp, %rcx + sub $16*2, %rsp + # align stack to 16 byte boundary + and $~(0x10 - 1), %rsp + + # check if smaller than 256 + cmp $256, arg3 + + # for sizes less than 128, we can't fold 64B at a time... + jl _less_than_128 + + + # load the initial crc value + movd arg1_low32, %xmm10 # initial crc + + # crc value does not need to be byte-reflected, but it needs + # to be moved to the high part of the register. + # because data will be byte-reflected and will align with + # initial crc at correct place. + pslldq $12, %xmm10 + + movdqa SHUF_MASK(%rip), %xmm11 + # receive the initial 64B data, xor the initial crc value + movdqu 16*0(arg2), %xmm0 + movdqu 16*1(arg2), %xmm1 + movdqu 16*2(arg2), %xmm2 + movdqu 16*3(arg2), %xmm3 + movdqu 16*4(arg2), %xmm4 + movdqu 16*5(arg2), %xmm5 + movdqu 16*6(arg2), %xmm6 + movdqu 16*7(arg2), %xmm7 + + pshufb %xmm11, %xmm0 + # XOR the initial_crc value + pxor %xmm10, %xmm0 + pshufb %xmm11, %xmm1 + pshufb %xmm11, %xmm2 + pshufb %xmm11, %xmm3 + pshufb %xmm11, %xmm4 + pshufb %xmm11, %xmm5 + pshufb %xmm11, %xmm6 + pshufb %xmm11, %xmm7 + + movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4 + #imm value of pclmulqdq instruction + #will determine which constant to use + + ################################################################# + # we subtract 256 instead of 128 to save one instruction from the loop + sub $256, arg3 + + # at this section of the code, there is 64*x+y (0<=y<64) bytes of + # buffer. The _fold_64_B_loop will fold 64B at a time + # until we have 64+y Bytes of buffer + + + # fold 64B at a time. This section of the code folds 4 xmm + # registers in parallel +_fold_64_B_loop: + + # update the buffer pointer + add $128, arg2 # buf += 64# + + movdqu 16*0(arg2), %xmm9 + movdqu 16*1(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm0, %xmm8 + movdqa %xmm1, %xmm13 + pclmulqdq $0x0 , %xmm10, %xmm0 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0 , %xmm10, %xmm1 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm0 + xorps %xmm8 , %xmm0 + pxor %xmm12, %xmm1 + xorps %xmm13, %xmm1 + + movdqu 16*2(arg2), %xmm9 + movdqu 16*3(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm2, %xmm8 + movdqa %xmm3, %xmm13 + pclmulqdq $0x0, %xmm10, %xmm2 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0, %xmm10, %xmm3 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm2 + xorps %xmm8 , %xmm2 + pxor %xmm12, %xmm3 + xorps %xmm13, %xmm3 + + movdqu 16*4(arg2), %xmm9 + movdqu 16*5(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm4, %xmm8 + movdqa %xmm5, %xmm13 + pclmulqdq $0x0, %xmm10, %xmm4 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0, %xmm10, %xmm5 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm4 + xorps %xmm8 , %xmm4 + pxor %xmm12, %xmm5 + xorps %xmm13, %xmm5 + + movdqu 16*6(arg2), %xmm9 + movdqu 16*7(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm6 , %xmm8 + movdqa %xmm7 , %xmm13 + pclmulqdq $0x0 , %xmm10, %xmm6 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0 , %xmm10, %xmm7 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm6 + xorps %xmm8 , %xmm6 + pxor %xmm12, %xmm7 + xorps %xmm13, %xmm7 + + sub $128, arg3 + + # check if there is another 64B in the buffer to be able to fold + jge _fold_64_B_loop + ################################################################## + + + add $128, arg2 + # at this point, the buffer pointer is pointing at the last y Bytes + # of the buffer the 64B of folded data is in 4 of the xmm + # registers: xmm0, xmm1, xmm2, xmm3 + + + # fold the 8 xmm registers to 1 xmm register with different constants + + movdqa rk9(%rip), %xmm10 + movdqa %xmm0, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm0 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm0, %xmm7 + + movdqa rk11(%rip), %xmm10 + movdqa %xmm1, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm1 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm1, %xmm7 + + movdqa rk13(%rip), %xmm10 + movdqa %xmm2, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm2 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm2, %xmm7 + + movdqa rk15(%rip), %xmm10 + movdqa %xmm3, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm3 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm3, %xmm7 + + movdqa rk17(%rip), %xmm10 + movdqa %xmm4, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm4 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm4, %xmm7 + + movdqa rk19(%rip), %xmm10 + movdqa %xmm5, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm5 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm5, %xmm7 + + movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2 + #imm value of pclmulqdq instruction + #will determine which constant to use + movdqa %xmm6, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm6 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm6, %xmm7 + + + # instead of 64, we add 48 to the loop counter to save 1 instruction + # from the loop instead of a cmp instruction, we use the negative + # flag with the jl instruction + add $128-16, arg3 + jl _final_reduction_for_128 + + # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 + # and the rest is in memory. We can fold 16 bytes at a time if y>=16 + # continue folding 16B at a time + +_16B_reduction_loop: + movdqa %xmm7, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm7 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + movdqu (arg2), %xmm0 + pshufb %xmm11, %xmm0 + pxor %xmm0 , %xmm7 + add $16, arg2 + sub $16, arg3 + # instead of a cmp instruction, we utilize the flags with the + # jge instruction equivalent of: cmp arg3, 16-16 + # check if there is any more 16B in the buffer to be able to fold + jge _16B_reduction_loop + + #now we have 16+z bytes left to reduce, where 0<= z < 16. + #first, we reduce the data in the xmm7 register + + +_final_reduction_for_128: + # check if any more data to fold. If not, compute the CRC of + # the final 128 bits + add $16, arg3 + je _128_done + + # here we are getting data that is less than 16 bytes. + # since we know that there was data before the pointer, we can + # offset the input pointer before the actual point, to receive + # exactly 16 bytes. after that the registers need to be adjusted. +_get_last_two_xmms: + movdqa %xmm7, %xmm2 + + movdqu -16(arg2, arg3), %xmm1 + pshufb %xmm11, %xmm1 + + # get rid of the extra data that was loaded before + # load the shift constant + lea pshufb_shf_table+16(%rip), %rax + sub arg3, %rax + movdqu (%rax), %xmm0 + + # shift xmm2 to the left by arg3 bytes + pshufb %xmm0, %xmm2 + + # shift xmm7 to the right by 16-arg3 bytes + pxor mask1(%rip), %xmm0 + pshufb %xmm0, %xmm7 + pblendvb %xmm2, %xmm1 #xmm0 is implicit + + # fold 16 Bytes + movdqa %xmm1, %xmm2 + movdqa %xmm7, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm7 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm2, %xmm7 + +_128_done: + # compute crc of a 128-bit value + movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10 + movdqa %xmm7, %xmm0 + + #64b fold + pclmulqdq $0x1, %xmm10, %xmm7 + pslldq $8 , %xmm0 + pxor %xmm0, %xmm7 + + #32b fold + movdqa %xmm7, %xmm0 + + pand mask2(%rip), %xmm0 + + psrldq $12, %xmm7 + pclmulqdq $0x10, %xmm10, %xmm7 + pxor %xmm0, %xmm7 + + #barrett reduction +_barrett: + movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10 + movdqa %xmm7, %xmm0 + pclmulqdq $0x01, %xmm10, %xmm7 + pslldq $4, %xmm7 + pclmulqdq $0x11, %xmm10, %xmm7 + + pslldq $4, %xmm7 + pxor %xmm0, %xmm7 + pextrd $1, %xmm7, %eax + +_cleanup: + # scale the result back to 16 bits + shr $16, %eax + mov %rcx, %rsp + ret + +######################################################################## + +.align 16 +_less_than_128: + + # check if there is enough buffer to be able to fold 16B at a time + cmp $32, arg3 + jl _less_than_32 + movdqa SHUF_MASK(%rip), %xmm11 + + # now if there is, load the constants + movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 + + movd arg1_low32, %xmm0 # get the initial crc value + pslldq $12, %xmm0 # align it to its correct place + movdqu (arg2), %xmm7 # load the plaintext + pshufb %xmm11, %xmm7 # byte-reflect the plaintext + pxor %xmm0, %xmm7 + + + # update the buffer pointer + add $16, arg2 + + # update the counter. subtract 32 instead of 16 to save one + # instruction from the loop + sub $32, arg3 + + jmp _16B_reduction_loop + + +.align 16 +_less_than_32: + # mov initial crc to the return value. this is necessary for + # zero-length buffers. + mov arg1_low32, %eax + test arg3, arg3 + je _cleanup + + movdqa SHUF_MASK(%rip), %xmm11 + + movd arg1_low32, %xmm0 # get the initial crc value + pslldq $12, %xmm0 # align it to its correct place + + cmp $16, arg3 + je _exact_16_left + jl _less_than_16_left + + movdqu (arg2), %xmm7 # load the plaintext + pshufb %xmm11, %xmm7 # byte-reflect the plaintext + pxor %xmm0 , %xmm7 # xor the initial crc value + add $16, arg2 + sub $16, arg3 + movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 + jmp _get_last_two_xmms + + +.align 16 +_less_than_16_left: + # use stack space to load data less than 16 bytes, zero-out + # the 16B in memory first. + + pxor %xmm1, %xmm1 + mov %rsp, %r11 + movdqa %xmm1, (%r11) + + cmp $4, arg3 + jl _only_less_than_4 + + # backup the counter value + mov arg3, %r9 + cmp $8, arg3 + jl _less_than_8_left + + # load 8 Bytes + mov (arg2), %rax + mov %rax, (%r11) + add $8, %r11 + sub $8, arg3 + add $8, arg2 +_less_than_8_left: + + cmp $4, arg3 + jl _less_than_4_left + + # load 4 Bytes + mov (arg2), %eax + mov %eax, (%r11) + add $4, %r11 + sub $4, arg3 + add $4, arg2 +_less_than_4_left: + + cmp $2, arg3 + jl _less_than_2_left + + # load 2 Bytes + mov (arg2), %ax + mov %ax, (%r11) + add $2, %r11 + sub $2, arg3 + add $2, arg2 +_less_than_2_left: + cmp $1, arg3 + jl _zero_left + + # load 1 Byte + mov (arg2), %al + mov %al, (%r11) +_zero_left: + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + # shl r9, 4 + lea pshufb_shf_table+16(%rip), %rax + sub %r9, %rax + movdqu (%rax), %xmm0 + pxor mask1(%rip), %xmm0 + + pshufb %xmm0, %xmm7 + jmp _128_done + +.align 16 +_exact_16_left: + movdqu (arg2), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + jmp _128_done + +_only_less_than_4: + cmp $3, arg3 + jl _only_less_than_3 + + # load 3 Bytes + mov (arg2), %al + mov %al, (%r11) + + mov 1(arg2), %al + mov %al, 1(%r11) + + mov 2(arg2), %al + mov %al, 2(%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $5, %xmm7 + + jmp _barrett +_only_less_than_3: + cmp $2, arg3 + jl _only_less_than_2 + + # load 2 Bytes + mov (arg2), %al + mov %al, (%r11) + + mov 1(arg2), %al + mov %al, 1(%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $6, %xmm7 + + jmp _barrett +_only_less_than_2: + + # load 1 Byte + mov (arg2), %al + mov %al, (%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $7, %xmm7 + + jmp _barrett + +ENDPROC(crc_t10dif_pcl) + +.data + +# precomputed constants +# these constants are precomputed from the poly: +# 0x8bb70000 (0x8bb7 scaled to 32 bits) +.align 16 +# Q = 0x18BB70000 +# rk1 = 2^(32*3) mod Q << 32 +# rk2 = 2^(32*5) mod Q << 32 +# rk3 = 2^(32*15) mod Q << 32 +# rk4 = 2^(32*17) mod Q << 32 +# rk5 = 2^(32*3) mod Q << 32 +# rk6 = 2^(32*2) mod Q << 32 +# rk7 = floor(2^64/Q) +# rk8 = Q +rk1: +.quad 0x2d56000000000000 +rk2: +.quad 0x06df000000000000 +rk3: +.quad 0x9d9d000000000000 +rk4: +.quad 0x7cf5000000000000 +rk5: +.quad 0x2d56000000000000 +rk6: +.quad 0x1368000000000000 +rk7: +.quad 0x00000001f65a57f8 +rk8: +.quad 0x000000018bb70000 + +rk9: +.quad 0xceae000000000000 +rk10: +.quad 0xbfd6000000000000 +rk11: +.quad 0x1e16000000000000 +rk12: +.quad 0x713c000000000000 +rk13: +.quad 0xf7f9000000000000 +rk14: +.quad 0x80a6000000000000 +rk15: +.quad 0x044c000000000000 +rk16: +.quad 0xe658000000000000 +rk17: +.quad 0xad18000000000000 +rk18: +.quad 0xa497000000000000 +rk19: +.quad 0x6ee3000000000000 +rk20: +.quad 0xe7b5000000000000 + + + +mask1: +.octa 0x80808080808080808080808080808080 +mask2: +.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF + +SHUF_MASK: +.octa 0x000102030405060708090A0B0C0D0E0F + +pshufb_shf_table: +# use these values for shift constants for the pshufb instruction +# different alignments result in values as shown: +# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1 +# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2 +# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3 +# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4 +# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5 +# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6 +# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7 +# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8 +# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9 +# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10 +# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11 +# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12 +# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13 +# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14 +# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15 +.octa 0x8f8e8d8c8b8a89888786858483828100 +.octa 0x000e0d0c0b0a09080706050403020100 -- cgit v1.2.3-59-g8ed1b From 0b95a7f85718adcbba36407ef88bba0a7379ed03 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 May 2013 12:52:50 -0700 Subject: crypto: crct10dif - Glue code to cast accelerated CRCT10DIF assembly as a crypto transform Glue code that plugs the PCLMULQDQ accelerated CRC T10 DIF hash into the crypto framework. The config CRYPTO_CRCT10DIF_PCLMUL should be turned on to enable the feature. The crc_t10dif crypto library function will use this faster algorithm when crct10dif_pclmul module is loaded. Signed-off-by: Tim Chen Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 + arch/x86/crypto/crct10dif-pclmul_glue.c | 151 ++++++++++++++++++++++++++++++++ crypto/Kconfig | 11 +++ 3 files changed, 164 insertions(+) create mode 100644 arch/x86/crypto/crct10dif-pclmul_glue.c diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index a3a0ed80f17c..94cb151adc1d 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o +obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) @@ -87,3 +88,4 @@ crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o +crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c new file mode 100644 index 000000000000..7845d7fd54c0 --- /dev/null +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -0,0 +1,151 @@ +/* + * Cryptographic API. + * + * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions + * + * Copyright (C) 2013 Intel Corporation + * Author: Tim Chen + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf, + size_t len); + +struct chksum_desc_ctx { + __u16 crc; +}; + +/* + * Steps through buffer one byte at at time, calculates reflected + * crc using table. + */ + +static int chksum_init(struct shash_desc *desc) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = 0; + + return 0; +} + +static int chksum_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + if (irq_fpu_usable()) { + kernel_fpu_begin(); + ctx->crc = crc_t10dif_pcl(ctx->crc, data, length); + kernel_fpu_end(); + } else + ctx->crc = crc_t10dif_generic(ctx->crc, data, length); + return 0; +} + +static int chksum_final(struct shash_desc *desc, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + *(__u16 *)out = ctx->crc; + return 0; +} + +static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + if (irq_fpu_usable()) { + kernel_fpu_begin(); + *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len); + kernel_fpu_end(); + } else + *(__u16 *)out = crc_t10dif_generic(*crcp, data, len); + return 0; +} + +static int chksum_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, len, out); +} + +static int chksum_digest(struct shash_desc *desc, const u8 *data, + unsigned int length, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, length, out); +} + +static struct shash_alg alg = { + .digestsize = CRC_T10DIF_DIGEST_SIZE, + .init = chksum_init, + .update = chksum_update, + .final = chksum_final, + .finup = chksum_finup, + .digest = chksum_digest, + .descsize = sizeof(struct chksum_desc_ctx), + .base = { + .cra_name = "crct10dif", + .cra_driver_name = "crct10dif-pclmul", + .cra_priority = 200, + .cra_blocksize = CRC_T10DIF_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static const struct x86_cpu_id crct10dif_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id); + +static int __init crct10dif_intel_mod_init(void) +{ + if (!x86_match_cpu(crct10dif_cpu_id)) + return -ENODEV; + + return crypto_register_shash(&alg); +} + +static void __exit crct10dif_intel_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(crct10dif_intel_mod_init); +module_exit(crct10dif_intel_mod_fini); + +MODULE_AUTHOR("Tim Chen "); +MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ."); +MODULE_LICENSE("GPL"); + +MODULE_ALIAS("crct10dif"); +MODULE_ALIAS("crct10dif-pclmul"); diff --git a/crypto/Kconfig b/crypto/Kconfig index ceb3611efe44..d1ca6312d798 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -384,6 +384,17 @@ config CRYPTO_CRCT10DIF a crypto transform. This allows for faster crc t10 diff transforms to be used if they are available. +config CRYPTO_CRCT10DIF_PCLMUL + tristate "CRCT10DIF PCLMULQDQ hardware acceleration" + depends on X86 && 64BIT && CRC_T10DIF + select CRYPTO_HASH + help + For x86_64 processors with SSE4.2 and PCLMULQDQ supported, + CRC T10 DIF PCLMULQDQ computation can be hardware + accelerated PCLMULQDQ instruction. This option will create + 'crct10dif-plcmul' module, which is faster when computing the + crct10dif checksum as compared with the generic table implementation. + config CRYPTO_GHASH tristate "GHASH digest algorithm" select CRYPTO_GF128MUL -- cgit v1.2.3-59-g8ed1b From 39761214eefc6b070f29402aa1165f24d789b3f7 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 May 2013 12:52:51 -0700 Subject: crypto: crct10dif - Simple correctness and speed test for CRCT10DIF hash These are simple tests to do sanity check of CRC T10 DIF hash. The correctness of the transform can be checked with the command modprobe tcrypt mode=47 The speed of the transform can be evaluated with the command modprobe tcrypt mode=320 Set the cpu frequency to constant and turn turbo off when running the speed test so the frequency governor will not tweak the frequency and affects the measurements. Signed-off-by: Tim Chen Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 8 ++++++++ crypto/testmgr.c | 10 ++++++++++ crypto/testmgr.h | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 66d254ce0d11..25a5934f0e50 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -1174,6 +1174,10 @@ static int do_test(int m) ret += tcrypt_test("ghash"); break; + case 47: + ret += tcrypt_test("crct10dif"); + break; + case 100: ret += tcrypt_test("hmac(md5)"); break; @@ -1498,6 +1502,10 @@ static int do_test(int m) test_hash_speed("crc32c", sec, generic_hash_speed_template); if (mode > 300 && mode < 400) break; + case 320: + test_hash_speed("crct10dif", sec, generic_hash_speed_template); + if (mode > 300 && mode < 400) break; + case 399: break; diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 5823735cf381..f19a392ade78 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1973,6 +1973,16 @@ static const struct alg_test_desc alg_test_descs[] = { .count = CRC32C_TEST_VECTORS } } + }, { + .alg = "crct10dif", + .test = alg_test_hash, + .fips_allowed = 1, + .suite = { + .hash = { + .vecs = crct10dif_tv_template, + .count = CRCT10DIF_TEST_VECTORS + } + } }, { .alg = "cryptd(__driver-cbc-aes-aesni)", .test = alg_test_null, diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 1e701bc075b9..7d44aa3d6b44 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -450,6 +450,39 @@ static struct hash_testvec rmd320_tv_template[] = { } }; +#define CRCT10DIF_TEST_VECTORS 3 +static struct hash_testvec crct10dif_tv_template[] = { + { + .plaintext = "abc", + .psize = 3, +#ifdef __LITTLE_ENDIAN + .digest = "\x3b\x44", +#else + .digest = "\x44\x3b", +#endif + }, { + .plaintext = "1234567890123456789012345678901234567890" + "123456789012345678901234567890123456789", + .psize = 79, +#ifdef __LITTLE_ENDIAN + .digest = "\x70\x4b", +#else + .digest = "\x4b\x70", +#endif + }, { + .plaintext = + "abcddddddddddddddddddddddddddddddddddddddddddddddddddddd", + .psize = 56, +#ifdef __LITTLE_ENDIAN + .digest = "\xe3\x9c", +#else + .digest = "\x9c\xe3", +#endif + .np = 2, + .tap = { 28, 28 } + } +}; + /* * SHA1 test vectors from from FIPS PUB 180-1 * Long vector from CAVS 5.0 -- cgit v1.2.3-59-g8ed1b From 30862281f717835ec06eca9aeb2b3ddc3da33bc1 Mon Sep 17 00:00:00 2001 From: Laurent Navet Date: Thu, 2 May 2013 14:00:38 +0200 Subject: drivers: crypto: use devm_ioremap_resource() Replace calls to deprecated devm_request_and_ioremap by devm_ioremap_resource. Found with coccicheck and this semantic patch: scripts/coccinelle/api/devm_request_and_ioremap.cocci. Signed-off-by: Laurent Navet Signed-off-by: Herbert Xu --- drivers/crypto/omap-aes.c | 7 +++---- drivers/crypto/omap-sham.c | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/crypto/omap-aes.c b/drivers/crypto/omap-aes.c index ee15b0f7849a..305a2aacf9b3 100644 --- a/drivers/crypto/omap-aes.c +++ b/drivers/crypto/omap-aes.c @@ -1125,10 +1125,9 @@ static int omap_aes_probe(struct platform_device *pdev) if (err) goto err_res; - dd->io_base = devm_request_and_ioremap(dev, &res); - if (!dd->io_base) { - dev_err(dev, "can't ioremap\n"); - err = -ENOMEM; + dd->io_base = devm_ioremap_resource(dev, &res); + if (IS_ERR(dd->io_base)) { + err = PTR_ERR(dd->io_base); goto err_res; } dd->phys_base = res.start; diff --git a/drivers/crypto/omap-sham.c b/drivers/crypto/omap-sham.c index a1e1b4756ee5..4bb67652c200 100644 --- a/drivers/crypto/omap-sham.c +++ b/drivers/crypto/omap-sham.c @@ -1686,10 +1686,9 @@ static int omap_sham_probe(struct platform_device *pdev) if (err) goto res_err; - dd->io_base = devm_request_and_ioremap(dev, &res); - if (!dd->io_base) { - dev_err(dev, "can't ioremap\n"); - err = -ENOMEM; + dd->io_base = devm_ioremap_resource(dev, &res); + if (IS_ERR(dd->io_base)) { + err = PTR_ERR(dd->io_base); goto res_err; } dd->phys_base = res.start; -- cgit v1.2.3-59-g8ed1b From 928e47ec84c1f889de996b64c4a4687df8d2709a Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Fri, 3 May 2013 14:48:14 +0530 Subject: crypto: mv_cesa: Remove redundant platform_set_drvdata() Commit 0998d06310 (device-core: Ensure drvdata = NULL when no driver is bound) removes the need to set driver data field to NULL. Signed-off-by: Sachin Kamat Cc: Sebastian Andrzej Siewior Signed-off-by: Herbert Xu --- drivers/crypto/mv_cesa.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/crypto/mv_cesa.c b/drivers/crypto/mv_cesa.c index ce6290e5471a..3374a3ebe4c7 100644 --- a/drivers/crypto/mv_cesa.c +++ b/drivers/crypto/mv_cesa.c @@ -1146,7 +1146,6 @@ err_unmap_reg: err: kfree(cp); cpg = NULL; - platform_set_drvdata(pdev, NULL); return ret; } -- cgit v1.2.3-59-g8ed1b From 612dde34f3f2d0341fefb914a40b512521ddc3d4 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Fri, 3 May 2013 14:48:15 +0530 Subject: crypto: s5p-sss: Remove redundant platform_set_drvdata() Commit 0998d06310 (device-core: Ensure drvdata = NULL when no driver is bound) removes the need to set driver data field to NULL. Signed-off-by: Sachin Kamat Cc: Vladimir Zapolskiy Signed-off-by: Herbert Xu --- drivers/crypto/s5p-sss.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/crypto/s5p-sss.c b/drivers/crypto/s5p-sss.c index 4b314326f48a..cf149b19ff47 100644 --- a/drivers/crypto/s5p-sss.c +++ b/drivers/crypto/s5p-sss.c @@ -647,7 +647,6 @@ static int s5p_aes_probe(struct platform_device *pdev) clk_disable(pdata->clk); s5p_dev = NULL; - platform_set_drvdata(pdev, NULL); return err; } @@ -668,7 +667,6 @@ static int s5p_aes_remove(struct platform_device *pdev) clk_disable(pdata->clk); s5p_dev = NULL; - platform_set_drvdata(pdev, NULL); return 0; } -- cgit v1.2.3-59-g8ed1b From 19e21b1d6d51a14c79ee3bc80352d665c7772b1c Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Mon, 6 May 2013 13:37:13 +0900 Subject: hwrng: atmel - remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d0631001288a5974afc0b2a5f568bcdecb4d (device-core: Ensure drvdata = NULL when no driver is bound). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Herbert Xu --- drivers/char/hw_random/atmel-rng.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/char/hw_random/atmel-rng.c b/drivers/char/hw_random/atmel-rng.c index 7c73d4aca36b..bf9fc6b79328 100644 --- a/drivers/char/hw_random/atmel-rng.c +++ b/drivers/char/hw_random/atmel-rng.c @@ -108,8 +108,6 @@ static int atmel_trng_remove(struct platform_device *pdev) clk_disable(trng->clk); clk_put(trng->clk); - platform_set_drvdata(pdev, NULL); - return 0; } -- cgit v1.2.3-59-g8ed1b From 938ae2f83b7f94cf44b66757d63e56e9865ffdf9 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Mon, 6 May 2013 13:38:27 +0900 Subject: hwrng: bcm63xx - remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d0631001288a5974afc0b2a5f568bcdecb4d (device-core: Ensure drvdata = NULL when no driver is bound). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Herbert Xu --- drivers/char/hw_random/bcm63xx-rng.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/char/hw_random/bcm63xx-rng.c b/drivers/char/hw_random/bcm63xx-rng.c index f343b7d0dfa1..36581ea562cb 100644 --- a/drivers/char/hw_random/bcm63xx-rng.c +++ b/drivers/char/hw_random/bcm63xx-rng.c @@ -137,7 +137,6 @@ static int bcm63xx_rng_probe(struct platform_device *pdev) out_clk_disable: clk_disable(clk); out_free_rng: - platform_set_drvdata(pdev, NULL); kfree(rng); out_free_priv: kfree(priv); @@ -154,7 +153,6 @@ static int bcm63xx_rng_remove(struct platform_device *pdev) clk_disable(priv->clk); kfree(priv); kfree(rng); - platform_set_drvdata(pdev, NULL); return 0; } -- cgit v1.2.3-59-g8ed1b From 08d63a22ced6e410dfceaa3792b79f1e01d594c7 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Mon, 6 May 2013 13:39:38 +0900 Subject: hwrng: timeriomem - remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d0631001288a5974afc0b2a5f568bcdecb4d (device-core: Ensure drvdata = NULL when no driver is bound). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Herbert Xu --- drivers/char/hw_random/timeriomem-rng.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/char/hw_random/timeriomem-rng.c b/drivers/char/hw_random/timeriomem-rng.c index 3e75737f5fe1..d2120ba8f3f9 100644 --- a/drivers/char/hw_random/timeriomem-rng.c +++ b/drivers/char/hw_random/timeriomem-rng.c @@ -192,7 +192,6 @@ out_release_io: out_timer: del_timer_sync(&priv->timer); out_free: - platform_set_drvdata(pdev, NULL); kfree(priv); return err; } @@ -209,7 +208,6 @@ static int timeriomem_rng_remove(struct platform_device *pdev) del_timer_sync(&priv->timer); iounmap(priv->io_base); release_mem_region(res->start, resource_size(res)); - platform_set_drvdata(pdev, NULL); kfree(priv); return 0; -- cgit v1.2.3-59-g8ed1b From 1ef997dca4ebeaca56783430de848a8d207ec879 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Mon, 6 May 2013 13:40:21 +0900 Subject: hwrng: tx4939 - remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure, since commit 0998d0631001288a5974afc0b2a5f568bcdecb4d (device-core: Ensure drvdata = NULL when no driver is bound). Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Signed-off-by: Herbert Xu --- drivers/char/hw_random/tx4939-rng.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/char/hw_random/tx4939-rng.c b/drivers/char/hw_random/tx4939-rng.c index d34a24a0d484..00593c847cf0 100644 --- a/drivers/char/hw_random/tx4939-rng.c +++ b/drivers/char/hw_random/tx4939-rng.c @@ -154,7 +154,6 @@ static int __exit tx4939_rng_remove(struct platform_device *dev) struct tx4939_rng *rngdev = platform_get_drvdata(dev); hwrng_unregister(&rngdev->rng); - platform_set_drvdata(dev, NULL); return 0; } -- cgit v1.2.3-59-g8ed1b From 74d24d838be0ee64c1085558238e5c51e1659817 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Sun, 12 May 2013 13:57:19 +0200 Subject: crypto: sahara - remove dependency on EXPERIMENTAL The Kconfig symbol EXPERIMENTAL was removed in v3.9. So this dependency makes it impossible to set CRYPTO_DEV_SAHARA. It's unlikely that this is what is intended, so let's remove this dependency. Signed-off-by: Paul Bolle Signed-off-by: Herbert Xu --- drivers/crypto/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index dffb85525368..47c6f4dcc675 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -278,7 +278,7 @@ config CRYPTO_DEV_PICOXCELL config CRYPTO_DEV_SAHARA tristate "Support for SAHARA crypto accelerator" - depends on ARCH_MXC && EXPERIMENTAL && OF + depends on ARCH_MXC && OF select CRYPTO_BLKCIPHER select CRYPTO_AES select CRYPTO_ECB -- cgit v1.2.3-59-g8ed1b From e2d4ea9444024e38217e4b7e380c6b9f39f9a5f2 Mon Sep 17 00:00:00 2001 From: Andrei Varvara Date: Tue, 28 May 2013 15:37:05 +0800 Subject: crypto: caam - fix SEQ IN PTR command when RTO or PRE bit is set SEQ IN PTR command does not require pointer if RTO or PRE bit is set Updated desc_constr.h accordingly. Signed-off-by: Andrei Varvara Reviewed-by: Phillips Kim-R1AAHA Reviewed-by: Fleming Andrew-AFLEMING Signed-off-by: Herbert Xu --- drivers/crypto/caam/desc.h | 4 ++-- drivers/crypto/caam/desc_constr.h | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/caam/desc.h b/drivers/crypto/caam/desc.h index f7f833be8c67..89fa3d045612 100644 --- a/drivers/crypto/caam/desc.h +++ b/drivers/crypto/caam/desc.h @@ -1294,10 +1294,10 @@ struct sec4_sg_entry { #define SQOUT_SGF 0x01000000 /* Appends to a previous pointer */ -#define SQOUT_PRE 0x00800000 +#define SQOUT_PRE SQIN_PRE /* Restore sequence with pointer/length */ -#define SQOUT_RTO 0x00200000 +#define SQOUT_RTO SQIN_RTO /* Use extended length following pointer */ #define SQOUT_EXT 0x00400000 diff --git a/drivers/crypto/caam/desc_constr.h b/drivers/crypto/caam/desc_constr.h index c85c1f058401..19501b548db4 100644 --- a/drivers/crypto/caam/desc_constr.h +++ b/drivers/crypto/caam/desc_constr.h @@ -122,7 +122,8 @@ static inline void append_cmd_ptr_extlen(u32 *desc, dma_addr_t ptr, unsigned int len, u32 command) { append_cmd(desc, command); - append_ptr(desc, ptr); + if (!(command & (SQIN_RTO | SQIN_PRE))) + append_ptr(desc, ptr); append_cmd(desc, len); } @@ -186,7 +187,10 @@ static inline void append_seq_##cmd##_ptr_intlen(u32 *desc, dma_addr_t ptr, \ u32 options) \ { \ PRINT_POS; \ - append_cmd_ptr(desc, ptr, len, CMD_SEQ_##op##_PTR | options); \ + if (options & (SQIN_RTO | SQIN_PRE)) \ + append_cmd(desc, CMD_SEQ_##op##_PTR | len | options); \ + else \ + append_cmd_ptr(desc, ptr, len, CMD_SEQ_##op##_PTR | options); \ } APPEND_SEQ_PTR_INTLEN(in, IN) APPEND_SEQ_PTR_INTLEN(out, OUT) -- cgit v1.2.3-59-g8ed1b From 524f1bd9a3d401cca16bca16582cd71c0abc8bd9 Mon Sep 17 00:00:00 2001 From: Andrei Varvara Date: Tue, 28 May 2013 15:37:06 +0800 Subject: crypto: caam - Fix STORE command to support overwriting Shared Descriptor's memory In case Store command is used with overwrite Shared Descriptor feature there is no need for pointer, it is using the address from which the Shared Descriptor was fetched. Signed-off-by: Andrei Varvara Reviewed-by: Phillips Kim-R1AAHA Reviewed-by: Fleming Andrew-AFLEMING Signed-off-by: Herbert Xu --- drivers/crypto/caam/desc_constr.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/caam/desc_constr.h b/drivers/crypto/caam/desc_constr.h index 19501b548db4..fc4470a2c337 100644 --- a/drivers/crypto/caam/desc_constr.h +++ b/drivers/crypto/caam/desc_constr.h @@ -177,10 +177,26 @@ static inline void append_##cmd(u32 *desc, dma_addr_t ptr, unsigned int len, \ } APPEND_CMD_PTR(key, KEY) APPEND_CMD_PTR(load, LOAD) -APPEND_CMD_PTR(store, STORE) APPEND_CMD_PTR(fifo_load, FIFO_LOAD) APPEND_CMD_PTR(fifo_store, FIFO_STORE) +static inline void append_store(u32 *desc, dma_addr_t ptr, unsigned int len, + u32 options) +{ + u32 cmd_src; + + cmd_src = options & LDST_SRCDST_MASK; + + append_cmd(desc, CMD_STORE | options | len); + + /* The following options do not require pointer */ + if (!(cmd_src == LDST_SRCDST_WORD_DESCBUF_SHARED || + cmd_src == LDST_SRCDST_WORD_DESCBUF_JOB || + cmd_src == LDST_SRCDST_WORD_DESCBUF_JOB_WE || + cmd_src == LDST_SRCDST_WORD_DESCBUF_SHARED_WE)) + append_ptr(desc, ptr); +} + #define APPEND_SEQ_PTR_INTLEN(cmd, op) \ static inline void append_seq_##cmd##_ptr_intlen(u32 *desc, dma_addr_t ptr, \ unsigned int len, \ -- cgit v1.2.3-59-g8ed1b From d179333b0c6298291c384a0b0ba51ee67d983758 Mon Sep 17 00:00:00 2001 From: Andrei Varvara Date: Tue, 28 May 2013 15:37:06 +0800 Subject: crypto: caam - Add MATH command to support shld function Perform 32-bit left shift of DEST and concatenate with left 32 bits of SRC1. {DEST[31:0],SRC1[63:32]} Signed-off-by: Andrei Varvara Acked-by: Mihai Serb Reviewed-by: Phillips Kim-R1AAHA Reviewed-by: Fleming Andrew-AFLEMING Signed-off-by: Herbert Xu --- drivers/crypto/caam/desc_constr.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/crypto/caam/desc_constr.h b/drivers/crypto/caam/desc_constr.h index fc4470a2c337..08c95a192234 100644 --- a/drivers/crypto/caam/desc_constr.h +++ b/drivers/crypto/caam/desc_constr.h @@ -299,6 +299,8 @@ append_cmd(desc, CMD_MATH | MATH_FUN_##op | MATH_DEST_##dest | \ APPEND_MATH(LSHIFT, desc, dest, src0, src1, len) #define append_math_rshift(desc, dest, src0, src1, len) \ APPEND_MATH(RSHIFT, desc, dest, src0, src1, len) +#define append_math_ldshift(desc, dest, src0, src1, len) \ + APPEND_MATH(SHLD, desc, dest, src0, src1, len) /* Exactly one source is IMM. Data is passed in as u32 value */ #define APPEND_MATH_IMM_u32(op, desc, dest, src_0, src_1, data) \ -- cgit v1.2.3-59-g8ed1b From 3ebfa92f49a61a00e967112632ad10e92998bb75 Mon Sep 17 00:00:00 2001 From: Andrei Varvara Date: Tue, 28 May 2013 15:37:07 +0800 Subject: crypto: caam - Add new macros for building extended SEC descriptors (> 64 words) added all supported math funtion on 8 byte boundary with immediate flag bit set automatically added MATH_SRC0_DPOVRD & MATH_SRC1_DPOVRD The function/defines above are needed for creating descriptors longer than 64 words Signed-off-by: Andrei Varvara Reviewed-by: Phillips Kim-R1AAHA Reviewed-by: Fleming Andrew-AFLEMING Signed-off-by: Herbert Xu --- drivers/crypto/caam/desc.h | 4 +++ drivers/crypto/caam/desc_constr.h | 53 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/caam/desc.h b/drivers/crypto/caam/desc.h index 89fa3d045612..7d8b14723465 100644 --- a/drivers/crypto/caam/desc.h +++ b/drivers/crypto/caam/desc.h @@ -366,6 +366,7 @@ struct sec4_sg_entry { #define FIFOLD_TYPE_LAST2FLUSH1 (0x05 << FIFOLD_TYPE_SHIFT) #define FIFOLD_TYPE_LASTBOTH (0x06 << FIFOLD_TYPE_SHIFT) #define FIFOLD_TYPE_LASTBOTHFL (0x07 << FIFOLD_TYPE_SHIFT) +#define FIFOLD_TYPE_NOINFOFIFO (0x0F << FIFOLD_TYPE_SHIFT) #define FIFOLDST_LEN_MASK 0xffff #define FIFOLDST_EXT_LEN_MASK 0xffffffff @@ -1359,6 +1360,7 @@ struct sec4_sg_entry { #define MOVE_DEST_MATH3 (0x07 << MOVE_DEST_SHIFT) #define MOVE_DEST_CLASS1INFIFO (0x08 << MOVE_DEST_SHIFT) #define MOVE_DEST_CLASS2INFIFO (0x09 << MOVE_DEST_SHIFT) +#define MOVE_DEST_INFIFO_NOINFO (0x0a << MOVE_DEST_SHIFT) #define MOVE_DEST_PK_A (0x0c << MOVE_DEST_SHIFT) #define MOVE_DEST_CLASS1KEY (0x0d << MOVE_DEST_SHIFT) #define MOVE_DEST_CLASS2KEY (0x0e << MOVE_DEST_SHIFT) @@ -1411,6 +1413,7 @@ struct sec4_sg_entry { #define MATH_SRC0_REG2 (0x02 << MATH_SRC0_SHIFT) #define MATH_SRC0_REG3 (0x03 << MATH_SRC0_SHIFT) #define MATH_SRC0_IMM (0x04 << MATH_SRC0_SHIFT) +#define MATH_SRC0_DPOVRD (0x07 << MATH_SRC0_SHIFT) #define MATH_SRC0_SEQINLEN (0x08 << MATH_SRC0_SHIFT) #define MATH_SRC0_SEQOUTLEN (0x09 << MATH_SRC0_SHIFT) #define MATH_SRC0_VARSEQINLEN (0x0a << MATH_SRC0_SHIFT) @@ -1425,6 +1428,7 @@ struct sec4_sg_entry { #define MATH_SRC1_REG2 (0x02 << MATH_SRC1_SHIFT) #define MATH_SRC1_REG3 (0x03 << MATH_SRC1_SHIFT) #define MATH_SRC1_IMM (0x04 << MATH_SRC1_SHIFT) +#define MATH_SRC1_DPOVRD (0x07 << MATH_SRC0_SHIFT) #define MATH_SRC1_INFIFO (0x0a << MATH_SRC1_SHIFT) #define MATH_SRC1_OUTFIFO (0x0b << MATH_SRC1_SHIFT) #define MATH_SRC1_ONE (0x0c << MATH_SRC1_SHIFT) diff --git a/drivers/crypto/caam/desc_constr.h b/drivers/crypto/caam/desc_constr.h index 08c95a192234..fe3bfd1b08ca 100644 --- a/drivers/crypto/caam/desc_constr.h +++ b/drivers/crypto/caam/desc_constr.h @@ -110,6 +110,26 @@ static inline void append_cmd(u32 *desc, u32 command) (*desc)++; } +#define append_u32 append_cmd + +static inline void append_u64(u32 *desc, u64 data) +{ + u32 *offset = desc_end(desc); + + *offset = upper_32_bits(data); + *(++offset) = lower_32_bits(data); + + (*desc) += 2; +} + +/* Write command without affecting header, and return pointer to next word */ +static inline u32 *write_cmd(u32 *desc, u32 command) +{ + *desc = command; + + return desc + 1; +} + static inline void append_cmd_ptr(u32 *desc, dma_addr_t ptr, int len, u32 command) { @@ -279,7 +299,7 @@ APPEND_CMD_RAW_IMM(load, LOAD, u32); */ #define APPEND_MATH(op, desc, dest, src_0, src_1, len) \ append_cmd(desc, CMD_MATH | MATH_FUN_##op | MATH_DEST_##dest | \ - MATH_SRC0_##src_0 | MATH_SRC1_##src_1 | (u32) (len & MATH_LEN_MASK)); + MATH_SRC0_##src_0 | MATH_SRC1_##src_1 | (u32)len); #define append_math_add(desc, dest, src0, src1, len) \ APPEND_MATH(ADD, desc, dest, src0, src1, len) @@ -327,3 +347,34 @@ do { \ APPEND_MATH_IMM_u32(LSHIFT, desc, dest, src0, src1, data) #define append_math_rshift_imm_u32(desc, dest, src0, src1, data) \ APPEND_MATH_IMM_u32(RSHIFT, desc, dest, src0, src1, data) + +/* Exactly one source is IMM. Data is passed in as u64 value */ +#define APPEND_MATH_IMM_u64(op, desc, dest, src_0, src_1, data) \ +do { \ + u32 upper = (data >> 16) >> 16; \ + APPEND_MATH(op, desc, dest, src_0, src_1, CAAM_CMD_SZ * 2 | \ + (upper ? 0 : MATH_IFB)); \ + if (upper) \ + append_u64(desc, data); \ + else \ + append_u32(desc, data); \ +} while (0) + +#define append_math_add_imm_u64(desc, dest, src0, src1, data) \ + APPEND_MATH_IMM_u64(ADD, desc, dest, src0, src1, data) +#define append_math_sub_imm_u64(desc, dest, src0, src1, data) \ + APPEND_MATH_IMM_u64(SUB, desc, dest, src0, src1, data) +#define append_math_add_c_imm_u64(desc, dest, src0, src1, data) \ + APPEND_MATH_IMM_u64(ADDC, desc, dest, src0, src1, data) +#define append_math_sub_b_imm_u64(desc, dest, src0, src1, data) \ + APPEND_MATH_IMM_u64(SUBB, desc, dest, src0, src1, data) +#define append_math_and_imm_u64(desc, dest, src0, src1, data) \ + APPEND_MATH_IMM_u64(AND, desc, dest, src0, src1, data) +#define append_math_or_imm_u64(desc, dest, src0, src1, data) \ + APPEND_MATH_IMM_u64(OR, desc, dest, src0, src1, data) +#define append_math_xor_imm_u64(desc, dest, src0, src1, data) \ + APPEND_MATH_IMM_u64(XOR, desc, dest, src0, src1, data) +#define append_math_lshift_imm_u64(desc, dest, src0, src1, data) \ + APPEND_MATH_IMM_u64(LSHIFT, desc, dest, src0, src1, data) +#define append_math_rshift_imm_u64(desc, dest, src0, src1, data) \ + APPEND_MATH_IMM_u64(RSHIFT, desc, dest, src0, src1, data) -- cgit v1.2.3-59-g8ed1b From 1f50be97f630012e096968d79f5b2fbfebadaf81 Mon Sep 17 00:00:00 2001 From: Andrei Varvara Date: Tue, 28 May 2013 15:37:07 +0800 Subject: crypto: caam - Add defines for overwriting Descriptor's memory Store command has options to overwrite the Job Desc, Shared Desc or the entire Descriptor in memory, using the address from which the Descriptor was fetched. Signed-off-by: Andrei Varvara Reviewed-by: Phillips Kim-R1AAHA Reviewed-by: Fleming Andrew-AFLEMING Signed-off-by: Herbert Xu --- drivers/crypto/caam/desc.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/crypto/caam/desc.h b/drivers/crypto/caam/desc.h index 7d8b14723465..972ad145d76c 100644 --- a/drivers/crypto/caam/desc.h +++ b/drivers/crypto/caam/desc.h @@ -232,6 +232,10 @@ struct sec4_sg_entry { #define LDST_SRCDST_WORD_PKHA_N_SZ (0x12 << LDST_SRCDST_SHIFT) #define LDST_SRCDST_WORD_PKHA_E_SZ (0x13 << LDST_SRCDST_SHIFT) #define LDST_SRCDST_WORD_DESCBUF (0x40 << LDST_SRCDST_SHIFT) +#define LDST_SRCDST_WORD_DESCBUF_JOB (0x41 << LDST_SRCDST_SHIFT) +#define LDST_SRCDST_WORD_DESCBUF_SHARED (0x42 << LDST_SRCDST_SHIFT) +#define LDST_SRCDST_WORD_DESCBUF_JOB_WE (0x45 << LDST_SRCDST_SHIFT) +#define LDST_SRCDST_WORD_DESCBUF_SHARED_WE (0x46 << LDST_SRCDST_SHIFT) #define LDST_SRCDST_WORD_INFO_FIFO (0x7a << LDST_SRCDST_SHIFT) /* Offset in source/destination */ -- cgit v1.2.3-59-g8ed1b From 590f9667a2307b24e9b56138d901dd2f228db6e0 Mon Sep 17 00:00:00 2001 From: Andrei Varvara Date: Tue, 28 May 2013 15:37:08 +0800 Subject: crypto: caam - Add defines for CAAM commands add defines for: append load immediate command setting SEQ LIODN equal to the Non-SEQ LIODN for the job replace job descriptor command Signed-off-by: Andrei Varvara Reviewed-by: Phillips Kim-R1AAHA Reviewed-by: Fleming Andrew-AFLEMING Signed-off-by: Herbert Xu --- drivers/crypto/caam/desc.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/crypto/caam/desc.h b/drivers/crypto/caam/desc.h index 972ad145d76c..8214dc1c49e8 100644 --- a/drivers/crypto/caam/desc.h +++ b/drivers/crypto/caam/desc.h @@ -1608,4 +1608,13 @@ struct sec4_sg_entry { #define NFIFOENTRY_PLEN_SHIFT 0 #define NFIFOENTRY_PLEN_MASK (0xFF << NFIFOENTRY_PLEN_SHIFT) +/* Append Load Immediate Command */ +#define FD_CMD_APPEND_LOAD_IMMEDIATE 0x80000000 + +/* Set SEQ LIODN equal to the Non-SEQ LIODN for the job */ +#define FD_CMD_SET_SEQ_LIODN_EQUAL_NONSEQ_LIODN 0x40000000 + +/* Frame Descriptor Command for Replacement Job Descriptor */ +#define FD_CMD_REPLACE_JOB_DESC 0x20000000 + #endif /* DESC_H */ -- cgit v1.2.3-59-g8ed1b From da64e35810e0717ec63ca25a46118f48e320caff Mon Sep 17 00:00:00 2001 From: Andrei Varvara Date: Tue, 28 May 2013 15:37:08 +0800 Subject: crypto: caam - Add define for Adjust Output Frame Length in PDB Add define for "Adjust Output Frame Length" in order to set the AOFL bit in the IPsec ESP Decapsulation PDB. Signed-off-by: Anca-Jeanina Floarea Signed-off-by: Andrei Varvara Reviewed-by: Phillips Kim-R1AAHA Reviewed-by: Fleming Andrew-AFLEMING Signed-off-by: Herbert Xu --- drivers/crypto/caam/pdb.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/caam/pdb.h b/drivers/crypto/caam/pdb.h index 62950d22ac13..3a87c0cf879a 100644 --- a/drivers/crypto/caam/pdb.h +++ b/drivers/crypto/caam/pdb.h @@ -44,6 +44,7 @@ #define PDBOPTS_ESP_IPHDRSRC 0x08 /* IP header comes from PDB (encap) */ #define PDBOPTS_ESP_INCIPHDR 0x04 /* Prepend IP header to output frame */ #define PDBOPTS_ESP_IPVSN 0x02 /* process IPv6 header */ +#define PDBOPTS_ESP_AOFL 0x04 /* adjust out frame len (decap, SEC>=5.3)*/ #define PDBOPTS_ESP_TUNNEL 0x01 /* tunnel mode next-header byte */ #define PDBOPTS_ESP_IPV6 0x02 /* ip header version is V6 */ #define PDBOPTS_ESP_DIFFSERV 0x40 /* copy TOS/TC from inner iphdr */ -- cgit v1.2.3-59-g8ed1b From 91dc363a86142afcbc4e9c3ab31b0f4563b0f269 Mon Sep 17 00:00:00 2001 From: Andrei Varvara Date: Tue, 28 May 2013 15:37:08 +0800 Subject: crypto: caam - add missing flag for the LOAD/STORE commands Add Class Context SRC / DEST flags for the LOAD & STORE commands Signed-off-by: Andrei Varvara Reviewed-by: Phillips Kim-R1AAHA Reviewed-by: Fleming Andrew-AFLEMING Signed-off-by: Herbert Xu --- drivers/crypto/caam/desc.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/caam/desc.h b/drivers/crypto/caam/desc.h index 8214dc1c49e8..53b296f78b0d 100644 --- a/drivers/crypto/caam/desc.h +++ b/drivers/crypto/caam/desc.h @@ -231,6 +231,7 @@ struct sec4_sg_entry { #define LDST_SRCDST_WORD_PKHA_B_SZ (0x11 << LDST_SRCDST_SHIFT) #define LDST_SRCDST_WORD_PKHA_N_SZ (0x12 << LDST_SRCDST_SHIFT) #define LDST_SRCDST_WORD_PKHA_E_SZ (0x13 << LDST_SRCDST_SHIFT) +#define LDST_SRCDST_WORD_CLASS_CTX (0x20 << LDST_SRCDST_SHIFT) #define LDST_SRCDST_WORD_DESCBUF (0x40 << LDST_SRCDST_SHIFT) #define LDST_SRCDST_WORD_DESCBUF_JOB (0x41 << LDST_SRCDST_SHIFT) #define LDST_SRCDST_WORD_DESCBUF_SHARED (0x42 << LDST_SRCDST_SHIFT) -- cgit v1.2.3-59-g8ed1b From 519d8b1a9d81be7e4ffad8aa6b0e3ea03984bb86 Mon Sep 17 00:00:00 2001 From: Tobias Rauter Date: Sun, 19 May 2013 21:59:38 +0200 Subject: crypto: dcp - Added support for Freescale's DCP co-processor This patch enables the DCP crypto functionality on imx28. Currently, only aes-128-cbc is supported. Moreover, the dcpboot misc-device, which is used by Freescale's SDK tools and uses a non-software-readable OTP-key, is added. Changes of v2: - ring buffer for hardware-descriptors - use of ablkcipher walk - OTP key encryption/decryption via misc-device (compatible to Freescale-SDK) - overall cleanup The DCP is also capable of sha1/sha256 but I won't be able to add that anytime soon. Tested with built-in runtime-self-test, tcrypt and openssl via cryptodev 1.6 on imx28-evk and a custom built imx28-board. Signed-off-by: Tobias Rauter Signed-off-by: Herbert Xu --- arch/arm/boot/dts/imx28.dtsi | 2 +- drivers/crypto/Kconfig | 10 + drivers/crypto/Makefile | 1 + drivers/crypto/dcp.c | 925 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 937 insertions(+), 1 deletion(-) create mode 100644 drivers/crypto/dcp.c diff --git a/arch/arm/boot/dts/imx28.dtsi b/arch/arm/boot/dts/imx28.dtsi index 600f7cb51f3e..077d0eb1a8b3 100644 --- a/arch/arm/boot/dts/imx28.dtsi +++ b/arch/arm/boot/dts/imx28.dtsi @@ -699,7 +699,7 @@ dcp@80028000 { reg = <0x80028000 0x2000>; interrupts = <52 53 54>; - status = "disabled"; + compatible = "fsl-dcp"; }; pxp@8002a000 { diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 47c6f4dcc675..8ff7c230d82e 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -286,6 +286,16 @@ config CRYPTO_DEV_SAHARA This option enables support for the SAHARA HW crypto accelerator found in some Freescale i.MX chips. +config CRYPTO_DEV_DCP + tristate "Support for the DCP engine" + depends on ARCH_MXS && OF + select CRYPTO_BLKCIPHER + select CRYPTO_AES + select CRYPTO_CBC + help + This options enables support for the hardware crypto-acceleration + capabilities of the DCP co-processor + config CRYPTO_DEV_S5P tristate "Support for Samsung S5PV210 crypto accelerator" depends on ARCH_S5PV210 diff --git a/drivers/crypto/Makefile b/drivers/crypto/Makefile index 38ce13d3b79b..b4946ddd2550 100644 --- a/drivers/crypto/Makefile +++ b/drivers/crypto/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_CRYPTO_DEV_OMAP_SHAM) += omap-sham.o obj-$(CONFIG_CRYPTO_DEV_OMAP_AES) += omap-aes.o obj-$(CONFIG_CRYPTO_DEV_PICOXCELL) += picoxcell_crypto.o obj-$(CONFIG_CRYPTO_DEV_SAHARA) += sahara.o +obj-$(CONFIG_CRYPTO_DEV_DCP) += dcp.o obj-$(CONFIG_CRYPTO_DEV_S5P) += s5p-sss.o obj-$(CONFIG_CRYPTO_DEV_TEGRA_AES) += tegra-aes.o obj-$(CONFIG_CRYPTO_DEV_UX500) += ux500/ diff --git a/drivers/crypto/dcp.c b/drivers/crypto/dcp.c new file mode 100644 index 000000000000..eea194c34527 --- /dev/null +++ b/drivers/crypto/dcp.c @@ -0,0 +1,925 @@ +/* + * Cryptographic API. + * + * Support for DCP cryptographic accelerator. + * + * Copyright (c) 2013 + * Author: Tobias Rauter + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Based on tegra-aes.c, dcp.c (from freescale SDK) and sahara.c + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +/* IOCTL for DCP OTP Key AES - taken from Freescale's SDK*/ +#define DBS_IOCTL_BASE 'd' +#define DBS_ENC _IOW(DBS_IOCTL_BASE, 0x00, uint8_t[16]) +#define DBS_DEC _IOW(DBS_IOCTL_BASE, 0x01, uint8_t[16]) + +/* DCP channel used for AES */ +#define USED_CHANNEL 1 +/* Ring Buffers' maximum size */ +#define DCP_MAX_PKG 20 + +/* Control Register */ +#define DCP_REG_CTRL 0x000 +#define DCP_CTRL_SFRST (1<<31) +#define DCP_CTRL_CLKGATE (1<<30) +#define DCP_CTRL_CRYPTO_PRESENT (1<<29) +#define DCP_CTRL_SHA_PRESENT (1<<28) +#define DCP_CTRL_GATHER_RES_WRITE (1<<23) +#define DCP_CTRL_ENABLE_CONTEXT_CACHE (1<<22) +#define DCP_CTRL_ENABLE_CONTEXT_SWITCH (1<<21) +#define DCP_CTRL_CH_IRQ_E_0 0x01 +#define DCP_CTRL_CH_IRQ_E_1 0x02 +#define DCP_CTRL_CH_IRQ_E_2 0x04 +#define DCP_CTRL_CH_IRQ_E_3 0x08 + +/* Status register */ +#define DCP_REG_STAT 0x010 +#define DCP_STAT_OTP_KEY_READY (1<<28) +#define DCP_STAT_CUR_CHANNEL(stat) ((stat>>24)&0x0F) +#define DCP_STAT_READY_CHANNEL(stat) ((stat>>16)&0x0F) +#define DCP_STAT_IRQ(stat) (stat&0x0F) +#define DCP_STAT_CHAN_0 (0x01) +#define DCP_STAT_CHAN_1 (0x02) +#define DCP_STAT_CHAN_2 (0x04) +#define DCP_STAT_CHAN_3 (0x08) + +/* Channel Control Register */ +#define DCP_REG_CHAN_CTRL 0x020 +#define DCP_CHAN_CTRL_CH0_IRQ_MERGED (1<<16) +#define DCP_CHAN_CTRL_HIGH_PRIO_0 (0x0100) +#define DCP_CHAN_CTRL_HIGH_PRIO_1 (0x0200) +#define DCP_CHAN_CTRL_HIGH_PRIO_2 (0x0400) +#define DCP_CHAN_CTRL_HIGH_PRIO_3 (0x0800) +#define DCP_CHAN_CTRL_ENABLE_0 (0x01) +#define DCP_CHAN_CTRL_ENABLE_1 (0x02) +#define DCP_CHAN_CTRL_ENABLE_2 (0x04) +#define DCP_CHAN_CTRL_ENABLE_3 (0x08) + +/* + * Channel Registers: + * The DCP has 4 channels. Each of this channels + * has 4 registers (command pointer, semaphore, status and options). + * The address of register REG of channel CHAN is obtained by + * dcp_chan_reg(REG, CHAN) + */ +#define DCP_REG_CHAN_PTR 0x00000100 +#define DCP_REG_CHAN_SEMA 0x00000110 +#define DCP_REG_CHAN_STAT 0x00000120 +#define DCP_REG_CHAN_OPT 0x00000130 + +#define DCP_CHAN_STAT_NEXT_CHAIN_IS_0 0x010000 +#define DCP_CHAN_STAT_NO_CHAIN 0x020000 +#define DCP_CHAN_STAT_CONTEXT_ERROR 0x030000 +#define DCP_CHAN_STAT_PAYLOAD_ERROR 0x040000 +#define DCP_CHAN_STAT_INVALID_MODE 0x050000 +#define DCP_CHAN_STAT_PAGEFAULT 0x40 +#define DCP_CHAN_STAT_DST 0x20 +#define DCP_CHAN_STAT_SRC 0x10 +#define DCP_CHAN_STAT_PACKET 0x08 +#define DCP_CHAN_STAT_SETUP 0x04 +#define DCP_CHAN_STAT_MISMATCH 0x02 + +/* hw packet control*/ + +#define DCP_PKT_PAYLOAD_KEY (1<<11) +#define DCP_PKT_OTP_KEY (1<<10) +#define DCP_PKT_CIPHER_INIT (1<<9) +#define DCP_PKG_CIPHER_ENCRYPT (1<<8) +#define DCP_PKT_CIPHER_ENABLE (1<<5) +#define DCP_PKT_DECR_SEM (1<<1) +#define DCP_PKT_CHAIN (1<<2) +#define DCP_PKT_IRQ 1 + +#define DCP_PKT_MODE_CBC (1<<4) +#define DCP_PKT_KEYSELECT_OTP (0xFF<<8) + +/* cipher flags */ +#define DCP_ENC 0x0001 +#define DCP_DEC 0x0002 +#define DCP_ECB 0x0004 +#define DCP_CBC 0x0008 +#define DCP_CBC_INIT 0x0010 +#define DCP_NEW_KEY 0x0040 +#define DCP_OTP_KEY 0x0080 +#define DCP_AES 0x1000 + +/* DCP Flags */ +#define DCP_FLAG_BUSY 0x01 +#define DCP_FLAG_PRODUCING 0x02 + +/* clock defines */ +#define CLOCK_ON 1 +#define CLOCK_OFF 0 + +struct dcp_dev_req_ctx { + int mode; +}; + +struct dcp_op { + unsigned int flags; + u8 key[AES_KEYSIZE_128]; + int keylen; + + struct ablkcipher_request *req; + struct crypto_ablkcipher *fallback; + + uint32_t stat; + uint32_t pkt1; + uint32_t pkt2; + struct ablkcipher_walk walk; +}; + +struct dcp_dev { + struct device *dev; + void __iomem *dcp_regs_base; + + int dcp_vmi_irq; + int dcp_irq; + + spinlock_t queue_lock; + struct crypto_queue queue; + + uint32_t pkt_produced; + uint32_t pkt_consumed; + + struct dcp_hw_packet *hw_pkg[DCP_MAX_PKG]; + dma_addr_t hw_phys_pkg; + + /* [KEY][IV] Both with 16 Bytes */ + u8 *payload_base; + dma_addr_t payload_base_dma; + + + struct tasklet_struct done_task; + struct tasklet_struct queue_task; + struct timer_list watchdog; + + unsigned long flags; + + struct dcp_op *ctx; + + struct miscdevice dcp_bootstream_misc; +}; + +struct dcp_hw_packet { + uint32_t next; + uint32_t pkt1; + uint32_t pkt2; + uint32_t src; + uint32_t dst; + uint32_t size; + uint32_t payload; + uint32_t stat; +}; + +struct dcp_dev *global_dev; + +static inline u32 dcp_chan_reg(u32 reg, int chan) +{ + return reg + (chan) * 0x40; +} + +static inline void dcp_write(struct dcp_dev *dev, u32 data, u32 reg) +{ + writel(data, dev->dcp_regs_base + reg); +} + +static inline void dcp_set(struct dcp_dev *dev, u32 data, u32 reg) +{ + writel(data, dev->dcp_regs_base + (reg | 0x04)); +} + +static inline void dcp_clear(struct dcp_dev *dev, u32 data, u32 reg) +{ + writel(data, dev->dcp_regs_base + (reg | 0x08)); +} + +static inline void dcp_toggle(struct dcp_dev *dev, u32 data, u32 reg) +{ + writel(data, dev->dcp_regs_base + (reg | 0x0C)); +} + +static inline unsigned int dcp_read(struct dcp_dev *dev, u32 reg) +{ + return readl(dev->dcp_regs_base + reg); +} + +void dcp_dma_unmap(struct dcp_dev *dev, struct dcp_hw_packet *pkt) +{ + dma_unmap_page(dev->dev, pkt->src, pkt->size, DMA_TO_DEVICE); + dma_unmap_page(dev->dev, pkt->dst, pkt->size, DMA_FROM_DEVICE); + dev_dbg(dev->dev, "unmap packet %x", (unsigned int) pkt); +} + +int dcp_dma_map(struct dcp_dev *dev, + struct ablkcipher_walk *walk, struct dcp_hw_packet *pkt) +{ + dev_dbg(dev->dev, "map packet %x", (unsigned int) pkt); + /* align to length = 16 */ + pkt->size = walk->nbytes - (walk->nbytes % 16); + + pkt->src = dma_map_page(dev->dev, walk->src.page, walk->src.offset, + pkt->size, DMA_TO_DEVICE); + + if (pkt->src == 0) { + dev_err(dev->dev, "Unable to map src"); + return -ENOMEM; + } + + pkt->dst = dma_map_page(dev->dev, walk->dst.page, walk->dst.offset, + pkt->size, DMA_FROM_DEVICE); + + if (pkt->dst == 0) { + dev_err(dev->dev, "Unable to map dst"); + dma_unmap_page(dev->dev, pkt->src, pkt->size, DMA_TO_DEVICE); + return -ENOMEM; + } + + return 0; +} + +static void dcp_op_one(struct dcp_dev *dev, struct dcp_hw_packet *pkt, + uint8_t last) +{ + struct dcp_op *ctx = dev->ctx; + pkt->pkt1 = ctx->pkt1; + pkt->pkt2 = ctx->pkt2; + + pkt->payload = (u32) dev->payload_base_dma; + pkt->stat = 0; + + if (ctx->flags & DCP_CBC_INIT) { + pkt->pkt1 |= DCP_PKT_CIPHER_INIT; + ctx->flags &= ~DCP_CBC_INIT; + } + + mod_timer(&dev->watchdog, jiffies + msecs_to_jiffies(500)); + pkt->pkt1 |= DCP_PKT_IRQ; + if (!last) + pkt->pkt1 |= DCP_PKT_CHAIN; + + dev->pkt_produced++; + + dcp_write(dev, 1, + dcp_chan_reg(DCP_REG_CHAN_SEMA, USED_CHANNEL)); +} + +static void dcp_op_proceed(struct dcp_dev *dev) +{ + struct dcp_op *ctx = dev->ctx; + struct dcp_hw_packet *pkt; + + while (ctx->walk.nbytes) { + int err = 0; + + pkt = dev->hw_pkg[dev->pkt_produced % DCP_MAX_PKG]; + err = dcp_dma_map(dev, &ctx->walk, pkt); + if (err) { + dev->ctx->stat |= err; + /* start timer to wait for already set up calls */ + mod_timer(&dev->watchdog, + jiffies + msecs_to_jiffies(500)); + break; + } + + + err = ctx->walk.nbytes - pkt->size; + ablkcipher_walk_done(dev->ctx->req, &dev->ctx->walk, err); + + dcp_op_one(dev, pkt, ctx->walk.nbytes == 0); + /* we have to wait if no space is left in buffer */ + if (dev->pkt_produced - dev->pkt_consumed == DCP_MAX_PKG) + break; + } + clear_bit(DCP_FLAG_PRODUCING, &dev->flags); +} + +static void dcp_op_start(struct dcp_dev *dev, uint8_t use_walk) +{ + struct dcp_op *ctx = dev->ctx; + + if (ctx->flags & DCP_NEW_KEY) { + memcpy(dev->payload_base, ctx->key, ctx->keylen); + ctx->flags &= ~DCP_NEW_KEY; + } + + ctx->pkt1 = 0; + ctx->pkt1 |= DCP_PKT_CIPHER_ENABLE; + ctx->pkt1 |= DCP_PKT_DECR_SEM; + + if (ctx->flags & DCP_OTP_KEY) + ctx->pkt1 |= DCP_PKT_OTP_KEY; + else + ctx->pkt1 |= DCP_PKT_PAYLOAD_KEY; + + if (ctx->flags & DCP_ENC) + ctx->pkt1 |= DCP_PKG_CIPHER_ENCRYPT; + + ctx->pkt2 = 0; + if (ctx->flags & DCP_CBC) + ctx->pkt2 |= DCP_PKT_MODE_CBC; + + dev->pkt_produced = 0; + dev->pkt_consumed = 0; + + ctx->stat = 0; + dcp_clear(dev, -1, dcp_chan_reg(DCP_REG_CHAN_STAT, USED_CHANNEL)); + dcp_write(dev, (u32) dev->hw_phys_pkg, + dcp_chan_reg(DCP_REG_CHAN_PTR, USED_CHANNEL)); + + set_bit(DCP_FLAG_PRODUCING, &dev->flags); + + if (use_walk) { + ablkcipher_walk_init(&ctx->walk, ctx->req->dst, + ctx->req->src, ctx->req->nbytes); + ablkcipher_walk_phys(ctx->req, &ctx->walk); + dcp_op_proceed(dev); + } else { + dcp_op_one(dev, dev->hw_pkg[0], 1); + clear_bit(DCP_FLAG_PRODUCING, &dev->flags); + } +} + +static void dcp_done_task(unsigned long data) +{ + struct dcp_dev *dev = (struct dcp_dev *)data; + struct dcp_hw_packet *last_packet; + int fin; + fin = 0; + + for (last_packet = dev->hw_pkg[(dev->pkt_consumed) % DCP_MAX_PKG]; + last_packet->stat == 1; + last_packet = + dev->hw_pkg[++(dev->pkt_consumed) % DCP_MAX_PKG]) { + + dcp_dma_unmap(dev, last_packet); + last_packet->stat = 0; + fin++; + } + /* the last call of this function already consumed this IRQ's packet */ + if (fin == 0) + return; + + dev_dbg(dev->dev, + "Packet(s) done with status %x; finished: %d, produced:%d, complete consumed: %d", + dev->ctx->stat, fin, dev->pkt_produced, dev->pkt_consumed); + + last_packet = dev->hw_pkg[(dev->pkt_consumed - 1) % DCP_MAX_PKG]; + if (!dev->ctx->stat && last_packet->pkt1 & DCP_PKT_CHAIN) { + if (!test_and_set_bit(DCP_FLAG_PRODUCING, &dev->flags)) + dcp_op_proceed(dev); + return; + } + + while (unlikely(dev->pkt_consumed < dev->pkt_produced)) { + dcp_dma_unmap(dev, + dev->hw_pkg[dev->pkt_consumed++ % DCP_MAX_PKG]); + } + + if (dev->ctx->flags & DCP_OTP_KEY) { + /* we used the miscdevice, no walk to finish */ + clear_bit(DCP_FLAG_BUSY, &dev->flags); + return; + } + + ablkcipher_walk_complete(&dev->ctx->walk); + dev->ctx->req->base.complete(&dev->ctx->req->base, + dev->ctx->stat); + dev->ctx->req = 0; + /* in case there are other requests in the queue */ + tasklet_schedule(&dev->queue_task); +} + +void dcp_watchdog(unsigned long data) +{ + struct dcp_dev *dev = (struct dcp_dev *)data; + dev->ctx->stat |= dcp_read(dev, + dcp_chan_reg(DCP_REG_CHAN_STAT, USED_CHANNEL)); + + dev_err(dev->dev, "Timeout, Channel status: %x", dev->ctx->stat); + + if (!dev->ctx->stat) + dev->ctx->stat = -ETIMEDOUT; + + dcp_done_task(data); +} + + +static irqreturn_t dcp_common_irq(int irq, void *context) +{ + u32 msk; + struct dcp_dev *dev = (struct dcp_dev *) context; + + del_timer(&dev->watchdog); + + msk = DCP_STAT_IRQ(dcp_read(dev, DCP_REG_STAT)); + dcp_clear(dev, msk, DCP_REG_STAT); + if (msk == 0) + return IRQ_NONE; + + dev->ctx->stat |= dcp_read(dev, + dcp_chan_reg(DCP_REG_CHAN_STAT, USED_CHANNEL)); + + if (msk & DCP_STAT_CHAN_1) + tasklet_schedule(&dev->done_task); + + return IRQ_HANDLED; +} + +static irqreturn_t dcp_vmi_irq(int irq, void *context) +{ + return dcp_common_irq(irq, context); +} + +static irqreturn_t dcp_irq(int irq, void *context) +{ + return dcp_common_irq(irq, context); +} + +static void dcp_crypt(struct dcp_dev *dev, struct dcp_op *ctx) +{ + dev->ctx = ctx; + + if ((ctx->flags & DCP_CBC) && ctx->req->info) { + ctx->flags |= DCP_CBC_INIT; + memcpy(dev->payload_base + AES_KEYSIZE_128, + ctx->req->info, AES_KEYSIZE_128); + } + + dcp_op_start(dev, 1); +} + +static void dcp_queue_task(unsigned long data) +{ + struct dcp_dev *dev = (struct dcp_dev *) data; + struct crypto_async_request *async_req, *backlog; + struct crypto_ablkcipher *tfm; + struct dcp_op *ctx; + struct dcp_dev_req_ctx *rctx; + struct ablkcipher_request *req; + unsigned long flags; + + spin_lock_irqsave(&dev->queue_lock, flags); + + backlog = crypto_get_backlog(&dev->queue); + async_req = crypto_dequeue_request(&dev->queue); + + spin_unlock_irqrestore(&dev->queue_lock, flags); + + if (!async_req) + goto ret_nothing_done; + + if (backlog) + backlog->complete(backlog, -EINPROGRESS); + + req = ablkcipher_request_cast(async_req); + tfm = crypto_ablkcipher_reqtfm(req); + rctx = ablkcipher_request_ctx(req); + ctx = crypto_ablkcipher_ctx(tfm); + + if (!req->src || !req->dst) + goto ret_nothing_done; + + ctx->flags |= rctx->mode; + ctx->req = req; + + dcp_crypt(dev, ctx); + + return; + +ret_nothing_done: + clear_bit(DCP_FLAG_BUSY, &dev->flags); +} + + +static int dcp_cra_init(struct crypto_tfm *tfm) +{ + const char *name = tfm->__crt_alg->cra_name; + struct dcp_op *ctx = crypto_tfm_ctx(tfm); + + tfm->crt_ablkcipher.reqsize = sizeof(struct dcp_dev_req_ctx); + + ctx->fallback = crypto_alloc_ablkcipher(name, 0, + CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK); + + if (IS_ERR(ctx->fallback)) { + dev_err(global_dev->dev, "Error allocating fallback algo %s\n", + name); + return PTR_ERR(ctx->fallback); + } + + return 0; +} + +static void dcp_cra_exit(struct crypto_tfm *tfm) +{ + struct dcp_op *ctx = crypto_tfm_ctx(tfm); + + if (ctx->fallback) + crypto_free_ablkcipher(ctx->fallback); + + ctx->fallback = NULL; +} + +/* async interface */ +static int dcp_aes_setkey(struct crypto_ablkcipher *tfm, const u8 *key, + unsigned int len) +{ + struct dcp_op *ctx = crypto_ablkcipher_ctx(tfm); + unsigned int ret = 0; + ctx->keylen = len; + ctx->flags = 0; + if (len == AES_KEYSIZE_128) { + if (memcmp(ctx->key, key, AES_KEYSIZE_128)) { + memcpy(ctx->key, key, len); + ctx->flags |= DCP_NEW_KEY; + } + return 0; + } + + ctx->fallback->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK; + ctx->fallback->base.crt_flags |= + (tfm->base.crt_flags & CRYPTO_TFM_REQ_MASK); + + ret = crypto_ablkcipher_setkey(ctx->fallback, key, len); + if (ret) { + struct crypto_tfm *tfm_aux = crypto_ablkcipher_tfm(tfm); + + tfm_aux->crt_flags &= ~CRYPTO_TFM_RES_MASK; + tfm_aux->crt_flags |= + (ctx->fallback->base.crt_flags & CRYPTO_TFM_RES_MASK); + } + return ret; +} + +static int dcp_aes_cbc_crypt(struct ablkcipher_request *req, int mode) +{ + struct dcp_dev_req_ctx *rctx = ablkcipher_request_ctx(req); + struct dcp_dev *dev = global_dev; + unsigned long flags; + int err = 0; + + if (!IS_ALIGNED(req->nbytes, AES_BLOCK_SIZE)) + return -EINVAL; + + rctx->mode = mode; + + spin_lock_irqsave(&dev->queue_lock, flags); + err = ablkcipher_enqueue_request(&dev->queue, req); + spin_unlock_irqrestore(&dev->queue_lock, flags); + + flags = test_and_set_bit(DCP_FLAG_BUSY, &dev->flags); + + if (!(flags & DCP_FLAG_BUSY)) + tasklet_schedule(&dev->queue_task); + + return err; +} + +static int dcp_aes_cbc_encrypt(struct ablkcipher_request *req) +{ + struct crypto_tfm *tfm = + crypto_ablkcipher_tfm(crypto_ablkcipher_reqtfm(req)); + struct dcp_op *ctx = crypto_ablkcipher_ctx( + crypto_ablkcipher_reqtfm(req)); + + if (unlikely(ctx->keylen != AES_KEYSIZE_128)) { + int err = 0; + ablkcipher_request_set_tfm(req, ctx->fallback); + err = crypto_ablkcipher_encrypt(req); + ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(tfm)); + return err; + } + + return dcp_aes_cbc_crypt(req, DCP_AES | DCP_ENC | DCP_CBC); +} + +static int dcp_aes_cbc_decrypt(struct ablkcipher_request *req) +{ + struct crypto_tfm *tfm = + crypto_ablkcipher_tfm(crypto_ablkcipher_reqtfm(req)); + struct dcp_op *ctx = crypto_ablkcipher_ctx( + crypto_ablkcipher_reqtfm(req)); + + if (unlikely(ctx->keylen != AES_KEYSIZE_128)) { + int err = 0; + ablkcipher_request_set_tfm(req, ctx->fallback); + err = crypto_ablkcipher_decrypt(req); + ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(tfm)); + return err; + } + return dcp_aes_cbc_crypt(req, DCP_AES | DCP_DEC | DCP_CBC); +} + +static struct crypto_alg algs[] = { + { + .cra_name = "cbc(aes)", + .cra_driver_name = "dcp-cbc-aes", + .cra_alignmask = 3, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC | + CRYPTO_ALG_NEED_FALLBACK, + .cra_blocksize = AES_KEYSIZE_128, + .cra_type = &crypto_ablkcipher_type, + .cra_priority = 300, + .cra_u.ablkcipher = { + .min_keysize = AES_KEYSIZE_128, + .max_keysize = AES_KEYSIZE_128, + .setkey = dcp_aes_setkey, + .encrypt = dcp_aes_cbc_encrypt, + .decrypt = dcp_aes_cbc_decrypt, + .ivsize = AES_KEYSIZE_128, + } + + }, +}; + +/* DCP bootstream verification interface: uses OTP key for crypto */ +static int dcp_bootstream_open(struct inode *inode, struct file *file) +{ + file->private_data = container_of((file->private_data), + struct dcp_dev, dcp_bootstream_misc); + return 0; +} + +static long dcp_bootstream_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct dcp_dev *dev = (struct dcp_dev *) file->private_data; + void __user *argp = (void __user *)arg; + int ret; + + if (dev == NULL) + return -EBADF; + + if (cmd != DBS_ENC && cmd != DBS_DEC) + return -EINVAL; + + if (copy_from_user(dev->payload_base, argp, 16)) + return -EFAULT; + + if (test_and_set_bit(DCP_FLAG_BUSY, &dev->flags)) + return -EAGAIN; + + dev->ctx = kzalloc(sizeof(struct dcp_op), GFP_KERNEL); + if (!dev->ctx) { + dev_err(dev->dev, + "cannot allocate context for OTP crypto"); + clear_bit(DCP_FLAG_BUSY, &dev->flags); + return -ENOMEM; + } + + dev->ctx->flags = DCP_AES | DCP_ECB | DCP_OTP_KEY | DCP_CBC_INIT; + dev->ctx->flags |= (cmd == DBS_ENC) ? DCP_ENC : DCP_DEC; + dev->hw_pkg[0]->src = dev->payload_base_dma; + dev->hw_pkg[0]->dst = dev->payload_base_dma; + dev->hw_pkg[0]->size = 16; + + dcp_op_start(dev, 0); + + while (test_bit(DCP_FLAG_BUSY, &dev->flags)) + cpu_relax(); + + ret = dev->ctx->stat; + if (!ret && copy_to_user(argp, dev->payload_base, 16)) + ret = -EFAULT; + + kfree(dev->ctx); + + return ret; +} + +static const struct file_operations dcp_bootstream_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = dcp_bootstream_ioctl, + .open = dcp_bootstream_open, +}; + +static int dcp_probe(struct platform_device *pdev) +{ + struct dcp_dev *dev = NULL; + struct resource *r; + int i, ret, j; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (dev == NULL) { + dev_err(&pdev->dev, "Failed to allocate structure\n"); + ret = -ENOMEM; + goto err; + } + global_dev = dev; + dev->dev = &pdev->dev; + + platform_set_drvdata(pdev, dev); + + r = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!r) { + dev_err(&pdev->dev, "failed to get IORESOURCE_MEM\n"); + ret = -ENXIO; + goto err_dev; + } + dev->dcp_regs_base = ioremap(r->start, resource_size(r)); + + + dcp_set(dev, DCP_CTRL_SFRST, DCP_REG_CTRL); + udelay(10); + dcp_clear(dev, DCP_CTRL_SFRST | DCP_CTRL_CLKGATE, DCP_REG_CTRL); + + dcp_write(dev, DCP_CTRL_GATHER_RES_WRITE | + DCP_CTRL_ENABLE_CONTEXT_CACHE | DCP_CTRL_CH_IRQ_E_1, + DCP_REG_CTRL); + + dcp_write(dev, DCP_CHAN_CTRL_ENABLE_1, DCP_REG_CHAN_CTRL); + + for (i = 0; i < 4; i++) + dcp_clear(dev, -1, dcp_chan_reg(DCP_REG_CHAN_STAT, i)); + + dcp_clear(dev, -1, DCP_REG_STAT); + + + r = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + if (!r) { + dev_err(&pdev->dev, "can't get IRQ resource (0)\n"); + ret = -EIO; + goto err_unmap_mem; + } + dev->dcp_vmi_irq = r->start; + ret = request_irq(dev->dcp_vmi_irq, dcp_vmi_irq, 0, "dcp", dev); + if (ret != 0) { + dev_err(&pdev->dev, "can't request_irq (0)\n"); + ret = -EIO; + goto err_unmap_mem; + } + + r = platform_get_resource(pdev, IORESOURCE_IRQ, 1); + if (!r) { + dev_err(&pdev->dev, "can't get IRQ resource (1)\n"); + ret = -EIO; + goto err_free_irq0; + } + dev->dcp_irq = r->start; + ret = request_irq(dev->dcp_irq, dcp_irq, 0, "dcp", dev); + if (ret != 0) { + dev_err(&pdev->dev, "can't request_irq (1)\n"); + ret = -EIO; + goto err_free_irq0; + } + + dev->hw_pkg[0] = dma_alloc_coherent(&pdev->dev, + DCP_MAX_PKG * sizeof(struct dcp_hw_packet), + &dev->hw_phys_pkg, + GFP_KERNEL); + if (!dev->hw_pkg[0]) { + dev_err(&pdev->dev, "Could not allocate hw descriptors\n"); + ret = -ENOMEM; + goto err_free_irq1; + } + + for (i = 1; i < DCP_MAX_PKG; i++) { + dev->hw_pkg[i - 1]->next = dev->hw_phys_pkg + + i * sizeof(struct dcp_hw_packet); + dev->hw_pkg[i] = dev->hw_pkg[i - 1] + 1; + } + dev->hw_pkg[i - 1]->next = dev->hw_phys_pkg; + + + dev->payload_base = dma_alloc_coherent(&pdev->dev, 2 * AES_KEYSIZE_128, + &dev->payload_base_dma, GFP_KERNEL); + if (!dev->payload_base) { + dev_err(&pdev->dev, "Could not allocate memory for key\n"); + ret = -ENOMEM; + goto err_free_hw_packet; + } + tasklet_init(&dev->queue_task, dcp_queue_task, + (unsigned long) dev); + tasklet_init(&dev->done_task, dcp_done_task, + (unsigned long) dev); + spin_lock_init(&dev->queue_lock); + + crypto_init_queue(&dev->queue, 10); + + init_timer(&dev->watchdog); + dev->watchdog.function = &dcp_watchdog; + dev->watchdog.data = (unsigned long)dev; + + dev->dcp_bootstream_misc.minor = MISC_DYNAMIC_MINOR, + dev->dcp_bootstream_misc.name = "dcpboot", + dev->dcp_bootstream_misc.fops = &dcp_bootstream_fops, + ret = misc_register(&dev->dcp_bootstream_misc); + if (ret != 0) { + dev_err(dev->dev, "Unable to register misc device\n"); + goto err_free_key_iv; + } + + for (i = 0; i < ARRAY_SIZE(algs); i++) { + algs[i].cra_priority = 300; + algs[i].cra_ctxsize = sizeof(struct dcp_op); + algs[i].cra_module = THIS_MODULE; + algs[i].cra_init = dcp_cra_init; + algs[i].cra_exit = dcp_cra_exit; + if (crypto_register_alg(&algs[i])) { + dev_err(&pdev->dev, "register algorithm failed\n"); + ret = -ENOMEM; + goto err_unregister; + } + } + dev_notice(&pdev->dev, "DCP crypto enabled.!\n"); + + return 0; + +err_unregister: + for (j = 0; j < i; j++) + crypto_unregister_alg(&algs[j]); +err_free_key_iv: + dma_free_coherent(&pdev->dev, 2 * AES_KEYSIZE_128, dev->payload_base, + dev->payload_base_dma); +err_free_hw_packet: + dma_free_coherent(&pdev->dev, DCP_MAX_PKG * + sizeof(struct dcp_hw_packet), dev->hw_pkg[0], + dev->hw_phys_pkg); +err_free_irq1: + free_irq(dev->dcp_irq, dev); +err_free_irq0: + free_irq(dev->dcp_vmi_irq, dev); +err_unmap_mem: + iounmap((void *) dev->dcp_regs_base); +err_dev: + kfree(dev); +err: + return ret; +} + +static int dcp_remove(struct platform_device *pdev) +{ + struct dcp_dev *dev; + int j; + dev = platform_get_drvdata(pdev); + platform_set_drvdata(pdev, NULL); + + dma_free_coherent(&pdev->dev, + DCP_MAX_PKG * sizeof(struct dcp_hw_packet), + dev->hw_pkg[0], dev->hw_phys_pkg); + + dma_free_coherent(&pdev->dev, 2 * AES_KEYSIZE_128, dev->payload_base, + dev->payload_base_dma); + + free_irq(dev->dcp_irq, dev); + free_irq(dev->dcp_vmi_irq, dev); + + tasklet_kill(&dev->done_task); + tasklet_kill(&dev->queue_task); + + iounmap((void *) dev->dcp_regs_base); + + for (j = 0; j < ARRAY_SIZE(algs); j++) + crypto_unregister_alg(&algs[j]); + + misc_deregister(&dev->dcp_bootstream_misc); + + kfree(dev); + return 0; +} + +static struct of_device_id fs_dcp_of_match[] = { + { .compatible = "fsl-dcp"}, + {}, +}; + +static struct platform_driver fs_dcp_driver = { + .probe = dcp_probe, + .remove = dcp_remove, + .driver = { + .name = "fsl-dcp", + .owner = THIS_MODULE, + .of_match_table = fs_dcp_of_match + } +}; + +module_platform_driver(fs_dcp_driver); + + +MODULE_AUTHOR("Tobias Rauter "); +MODULE_DESCRIPTION("Freescale DCP Crypto Driver"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3-59-g8ed1b From b02266531f3e7f9b3ce8fc95c06a15b99fd13b7f Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Mon, 20 May 2013 19:14:50 +0200 Subject: crypto: hifn_795x - Pass correct pointer to free_irq() free_irq() expects the same pointer that was passed to request_irq(), otherwise the IRQ is not freed. The issue was found using the following coccinelle script: @r1@ type T; T devid; @@ request_irq(..., devid) @r2@ type r1.T; T devid; position p; @@ free_irq@p(..., devid) @@ position p != r2.p; @@ *free_irq@p(...) Signed-off-by: Lars-Peter Clausen Signed-off-by: Herbert Xu --- drivers/crypto/hifn_795x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/hifn_795x.c b/drivers/crypto/hifn_795x.c index ebf130e894b5..12fea3e22348 100644 --- a/drivers/crypto/hifn_795x.c +++ b/drivers/crypto/hifn_795x.c @@ -2676,7 +2676,7 @@ err_out_stop_device: hifn_reset_dma(dev, 1); hifn_stop_device(dev); err_out_free_irq: - free_irq(dev->irq, dev->name); + free_irq(dev->irq, dev); tasklet_kill(&dev->tasklet); err_out_free_desc: pci_free_consistent(pdev, sizeof(struct hifn_dma), @@ -2711,7 +2711,7 @@ static void hifn_remove(struct pci_dev *pdev) hifn_reset_dma(dev, 1); hifn_stop_device(dev); - free_irq(dev->irq, dev->name); + free_irq(dev->irq, dev); tasklet_kill(&dev->tasklet); hifn_flush(dev); -- cgit v1.2.3-59-g8ed1b From d329581493802cede83ca672093588f399d02a65 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 21 May 2013 17:10:10 +0300 Subject: crypto: sha512_generic - set cra_driver_name 'sha512_generic' should set driver name now that there is alternative sha512 provider (sha512_ssse3). Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- crypto/sha512_generic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crypto/sha512_generic.c b/crypto/sha512_generic.c index 4c5862095679..6ed124f3ea0f 100644 --- a/crypto/sha512_generic.c +++ b/crypto/sha512_generic.c @@ -251,6 +251,7 @@ static struct shash_alg sha512_algs[2] = { { .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha512", + .cra_driver_name = "sha512-generic", .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, @@ -263,6 +264,7 @@ static struct shash_alg sha512_algs[2] = { { .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha384", + .cra_driver_name = "sha384-generic", .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, -- cgit v1.2.3-59-g8ed1b From 340991e30ccef7b983cf2814ecea610504f5d059 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 21 May 2013 17:10:44 +0300 Subject: crypto: sha512_ssse3 - add sha384 support Add sha384 implementation to sha512_ssse3 module. This also fixes sha512_ssse3 module autoloading issue when 'sha384' is used before 'sha512'. Previously in such case, just sha512_generic was loaded and not sha512_ssse3 (since it did not provide sha384). Now if 'sha512' was used after 'sha384' usage, sha512_ssse3 would remain unloaded. For example, this happens with tcrypt testing module since it tests 'sha384' before 'sha512'. Cc: Tim Chen Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/sha512_ssse3_glue.c | 58 +++++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 5 deletions(-) diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 6cbd8df348d2..f30cd10293f0 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -194,7 +194,37 @@ static int sha512_ssse3_import(struct shash_desc *desc, const void *in) return 0; } -static struct shash_alg alg = { +static int sha384_ssse3_init(struct shash_desc *desc) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA384_H0; + sctx->state[1] = SHA384_H1; + sctx->state[2] = SHA384_H2; + sctx->state[3] = SHA384_H3; + sctx->state[4] = SHA384_H4; + sctx->state[5] = SHA384_H5; + sctx->state[6] = SHA384_H6; + sctx->state[7] = SHA384_H7; + + sctx->count[0] = sctx->count[1] = 0; + + return 0; +} + +static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash) +{ + u8 D[SHA512_DIGEST_SIZE]; + + sha512_ssse3_final(desc, D); + + memcpy(hash, D, SHA384_DIGEST_SIZE); + memset(D, 0, SHA512_DIGEST_SIZE); + + return 0; +} + +static struct shash_alg algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_ssse3_init, .update = sha512_ssse3_update, @@ -211,7 +241,24 @@ static struct shash_alg alg = { .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } -}; +}, { + .digestsize = SHA384_DIGEST_SIZE, + .init = sha384_ssse3_init, + .update = sha512_ssse3_update, + .final = sha384_ssse3_final, + .export = sha512_ssse3_export, + .import = sha512_ssse3_import, + .descsize = sizeof(struct sha512_state), + .statesize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha384", + .cra_driver_name = "sha384-ssse3", + .cra_priority = 150, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA384_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; #ifdef CONFIG_AS_AVX static bool __init avx_usable(void) @@ -234,7 +281,7 @@ static bool __init avx_usable(void) static int __init sha512_ssse3_mod_init(void) { - /* test for SSE3 first */ + /* test for SSSE3 first */ if (cpu_has_ssse3) sha512_transform_asm = sha512_transform_ssse3; @@ -261,7 +308,7 @@ static int __init sha512_ssse3_mod_init(void) else #endif pr_info("Using SSSE3 optimized SHA-512 implementation\n"); - return crypto_register_shash(&alg); + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); } pr_info("Neither AVX nor SSSE3 is available/usable.\n"); @@ -270,7 +317,7 @@ static int __init sha512_ssse3_mod_init(void) static void __exit sha512_ssse3_mod_fini(void) { - crypto_unregister_shash(&alg); + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); } module_init(sha512_ssse3_mod_init); @@ -280,3 +327,4 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_ALIAS("sha512"); +MODULE_ALIAS("sha384"); -- cgit v1.2.3-59-g8ed1b From a710f761fc9ae5728765a5917f8beabb49f98483 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 21 May 2013 17:10:49 +0300 Subject: crypto: sha256_ssse3 - add sha224 support Add sha224 implementation to sha256_ssse3 module. This also fixes sha256_ssse3 module autoloading issue when 'sha224' is used before 'sha256'. Previously in such case, just sha256_generic was loaded and not sha256_ssse3 (since it did not provide sha224). Now if 'sha256' was used after 'sha224' usage, sha256_ssse3 would remain unloaded. Cc: Tim Chen Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/sha256_ssse3_glue.c | 57 +++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 597d4da69656..50226c4b86ed 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -187,7 +187,36 @@ static int sha256_ssse3_import(struct shash_desc *desc, const void *in) return 0; } -static struct shash_alg alg = { +static int sha224_ssse3_init(struct shash_desc *desc) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA224_H0; + sctx->state[1] = SHA224_H1; + sctx->state[2] = SHA224_H2; + sctx->state[3] = SHA224_H3; + sctx->state[4] = SHA224_H4; + sctx->state[5] = SHA224_H5; + sctx->state[6] = SHA224_H6; + sctx->state[7] = SHA224_H7; + sctx->count = 0; + + return 0; +} + +static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash) +{ + u8 D[SHA256_DIGEST_SIZE]; + + sha256_ssse3_final(desc, D); + + memcpy(hash, D, SHA224_DIGEST_SIZE); + memset(D, 0, SHA256_DIGEST_SIZE); + + return 0; +} + +static struct shash_alg algs[] = { { .digestsize = SHA256_DIGEST_SIZE, .init = sha256_ssse3_init, .update = sha256_ssse3_update, @@ -204,7 +233,24 @@ static struct shash_alg alg = { .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } -}; +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_ssse3_init, + .update = sha256_ssse3_update, + .final = sha224_ssse3_final, + .export = sha256_ssse3_export, + .import = sha256_ssse3_import, + .descsize = sizeof(struct sha256_state), + .statesize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-ssse3", + .cra_priority = 150, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; #ifdef CONFIG_AS_AVX static bool __init avx_usable(void) @@ -227,7 +273,7 @@ static bool __init avx_usable(void) static int __init sha256_ssse3_mod_init(void) { - /* test for SSE3 first */ + /* test for SSSE3 first */ if (cpu_has_ssse3) sha256_transform_asm = sha256_transform_ssse3; @@ -254,7 +300,7 @@ static int __init sha256_ssse3_mod_init(void) else #endif pr_info("Using SSSE3 optimized SHA-256 implementation\n"); - return crypto_register_shash(&alg); + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); } pr_info("Neither AVX nor SSSE3 is available/usable.\n"); @@ -263,7 +309,7 @@ static int __init sha256_ssse3_mod_init(void) static void __exit sha256_ssse3_mod_fini(void) { - crypto_unregister_shash(&alg); + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); } module_init(sha256_ssse3_mod_init); @@ -273,3 +319,4 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_ALIAS("sha256"); +MODULE_ALIAS("sha384"); -- cgit v1.2.3-59-g8ed1b From bbb85b25688fcdc70b895711870c23de7b12721b Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Wed, 22 May 2013 21:38:40 +0200 Subject: crypto: ux500 - Cocci spatch "resource_size.spatch" Signed-off-by: Thomas Meyer Signed-off-by: Herbert Xu --- drivers/crypto/ux500/cryp/cryp_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/ux500/cryp/cryp_core.c b/drivers/crypto/ux500/cryp/cryp_core.c index 32f480622b97..3f1f9825fadb 100644 --- a/drivers/crypto/ux500/cryp/cryp_core.c +++ b/drivers/crypto/ux500/cryp/cryp_core.c @@ -1600,7 +1600,7 @@ static int ux500_cryp_remove(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (res) - release_mem_region(res->start, res->end - res->start + 1); + release_mem_region(res->start, resource_size(res)); kfree(device_data); -- cgit v1.2.3-59-g8ed1b From 67822649d7305caf3dd50ed46c27b99c94eff996 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 5 Jun 2013 16:27:51 +0800 Subject: crypto: crct10dif - Use PTR_RET lib/crc-t10dif.c:42:1-3: WARNING: PTR_RET can be used Use PTR_RET rather than if(IS_ERR(...)) + PTR_ERR Generated by: coccinelle/api/ptr_ret.cocci Reported-by: Fengguang Wu Signed-off-by: Herbert Xu --- lib/crc-t10dif.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/crc-t10dif.c b/lib/crc-t10dif.c index 0cb64634f513..fe3428c07b47 100644 --- a/lib/crc-t10dif.c +++ b/lib/crc-t10dif.c @@ -39,9 +39,7 @@ EXPORT_SYMBOL(crc_t10dif); static int __init crc_t10dif_mod_init(void) { crct10dif_tfm = crypto_alloc_shash("crct10dif", 0, 0); - if (IS_ERR(crct10dif_tfm)) - return PTR_ERR(crct10dif_tfm); - return 0; + return PTR_RET(crct10dif_tfm); } static void __exit crc_t10dif_mod_fini(void) -- cgit v1.2.3-59-g8ed1b From a3485e685faa37ba92462bec2f18b98c3c825c76 Mon Sep 17 00:00:00 2001 From: Joel A Fernandes Date: Tue, 28 May 2013 19:02:55 -0500 Subject: crypto: omap-aes - Don't idle/start AES device between Encrypt operations Calling runtime PM API for every block causes serious perf hit to crypto operations that are done on a long buffer. As crypto is performed on a page boundary, encrypting large buffers can cause a series of crypto operations divided by page. The runtime PM API is also called those many times. We call runtime_pm_get_sync only at beginning on the session (cra_init) and runtime_pm_put at the end. This result in upto a 50% speedup as below. This doesn't make the driver to keep the system awake as runtime get/put is only called during a crypto session which completes usually quickly. Before: root@beagleboard:~# time -v openssl speed -evp aes-128-cbc Doing aes-128-cbc for 3s on 16 size blocks: 13310 aes-128-cbc's in 0.01s Doing aes-128-cbc for 3s on 64 size blocks: 13040 aes-128-cbc's in 0.04s Doing aes-128-cbc for 3s on 256 size blocks: 9134 aes-128-cbc's in 0.03s Doing aes-128-cbc for 3s on 1024 size blocks: 8939 aes-128-cbc's in 0.01s Doing aes-128-cbc for 3s on 8192 size blocks: 4299 aes-128-cbc's in 0.00s After: root@beagleboard:~# time -v openssl speed -evp aes-128-cbc Doing aes-128-cbc for 3s on 16 size blocks: 18911 aes-128-cbc's in 0.02s Doing aes-128-cbc for 3s on 64 size blocks: 18878 aes-128-cbc's in 0.02s Doing aes-128-cbc for 3s on 256 size blocks: 11878 aes-128-cbc's in 0.10s Doing aes-128-cbc for 3s on 1024 size blocks: 11538 aes-128-cbc's in 0.05s Doing aes-128-cbc for 3s on 8192 size blocks: 4857 aes-128-cbc's in 0.03s While at it, also drop enter and exit pr_debugs, in related code. tracers can be used for that. Tested on a Beaglebone (AM335x SoC) board. Signed-off-by: Joel A Fernandes Acked-by: Kevin Hilman Signed-off-by: Herbert Xu --- drivers/crypto/omap-aes.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/drivers/crypto/omap-aes.c b/drivers/crypto/omap-aes.c index 305a2aacf9b3..5f7980586850 100644 --- a/drivers/crypto/omap-aes.c +++ b/drivers/crypto/omap-aes.c @@ -203,13 +203,6 @@ static void omap_aes_write_n(struct omap_aes_dev *dd, u32 offset, static int omap_aes_hw_init(struct omap_aes_dev *dd) { - /* - * clocks are enabled when request starts and disabled when finished. - * It may be long delays between requests. - * Device might go to off mode to save power. - */ - pm_runtime_get_sync(dd->dev); - if (!(dd->flags & FLAGS_INIT)) { dd->flags |= FLAGS_INIT; dd->err = 0; @@ -636,7 +629,6 @@ static void omap_aes_finish_req(struct omap_aes_dev *dd, int err) pr_debug("err: %d\n", err); - pm_runtime_put(dd->dev); dd->flags &= ~FLAGS_BUSY; req->base.complete(&req->base, err); @@ -837,8 +829,16 @@ static int omap_aes_ctr_decrypt(struct ablkcipher_request *req) static int omap_aes_cra_init(struct crypto_tfm *tfm) { - pr_debug("enter\n"); + struct omap_aes_dev *dd = NULL; + + /* Find AES device, currently picks the first device */ + spin_lock_bh(&list_lock); + list_for_each_entry(dd, &dev_list, list) { + break; + } + spin_unlock_bh(&list_lock); + pm_runtime_get_sync(dd->dev); tfm->crt_ablkcipher.reqsize = sizeof(struct omap_aes_reqctx); return 0; @@ -846,7 +846,16 @@ static int omap_aes_cra_init(struct crypto_tfm *tfm) static void omap_aes_cra_exit(struct crypto_tfm *tfm) { - pr_debug("enter\n"); + struct omap_aes_dev *dd = NULL; + + /* Find AES device, currently picks the first device */ + spin_lock_bh(&list_lock); + list_for_each_entry(dd, &dev_list, list) { + break; + } + spin_unlock_bh(&list_lock); + + pm_runtime_put_sync(dd->dev); } /* ********************** ALGS ************************************ */ -- cgit v1.2.3-59-g8ed1b From 1f539bcb13120213f266d5506fe7339bc78bd953 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 29 May 2013 09:47:29 +0900 Subject: hwrng: use platform_{get,set}_drvdata() Use the wrapper functions for getting and setting the driver data using platform_device instead of using dev_{get,set}_drvdata() with &pdev->dev, so we can directly pass a struct platform_device. Also, unnecessary dev_set_drvdata() is removed, because the driver core clears the driver data to NULL after device_release or on probe failure. Signed-off-by: Jingoo Han Signed-off-by: Herbert Xu --- drivers/char/hw_random/n2-drv.c | 6 ++---- drivers/char/hw_random/octeon-rng.c | 4 ++-- drivers/char/hw_random/omap-rng.c | 6 +++--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/char/hw_random/n2-drv.c b/drivers/char/hw_random/n2-drv.c index 20b962e1d832..f9beed54d0c8 100644 --- a/drivers/char/hw_random/n2-drv.c +++ b/drivers/char/hw_random/n2-drv.c @@ -700,7 +700,7 @@ static int n2rng_probe(struct platform_device *op) if (err) goto out_free_units; - dev_set_drvdata(&op->dev, np); + platform_set_drvdata(op, np); schedule_delayed_work(&np->work, 0); @@ -721,7 +721,7 @@ out: static int n2rng_remove(struct platform_device *op) { - struct n2rng *np = dev_get_drvdata(&op->dev); + struct n2rng *np = platform_get_drvdata(op); np->flags |= N2RNG_FLAG_SHUTDOWN; @@ -736,8 +736,6 @@ static int n2rng_remove(struct platform_device *op) kfree(np); - dev_set_drvdata(&op->dev, NULL); - return 0; } diff --git a/drivers/char/hw_random/octeon-rng.c b/drivers/char/hw_random/octeon-rng.c index 1eada566ca70..f2885dbe1849 100644 --- a/drivers/char/hw_random/octeon-rng.c +++ b/drivers/char/hw_random/octeon-rng.c @@ -96,7 +96,7 @@ static int octeon_rng_probe(struct platform_device *pdev) rng->ops = ops; - dev_set_drvdata(&pdev->dev, &rng->ops); + platform_set_drvdata(pdev, &rng->ops); ret = hwrng_register(&rng->ops); if (ret) return -ENOENT; @@ -108,7 +108,7 @@ static int octeon_rng_probe(struct platform_device *pdev) static int __exit octeon_rng_remove(struct platform_device *pdev) { - struct hwrng *rng = dev_get_drvdata(&pdev->dev); + struct hwrng *rng = platform_get_drvdata(pdev); hwrng_unregister(rng); diff --git a/drivers/char/hw_random/omap-rng.c b/drivers/char/hw_random/omap-rng.c index 749dc16ca2cc..e5deb99feb28 100644 --- a/drivers/char/hw_random/omap-rng.c +++ b/drivers/char/hw_random/omap-rng.c @@ -116,7 +116,7 @@ static int omap_rng_probe(struct platform_device *pdev) }; omap_rng_ops.priv = (unsigned long)priv; - dev_set_drvdata(&pdev->dev, priv); + platform_set_drvdata(pdev, priv); priv->mem_res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!priv->mem_res) { @@ -129,7 +129,7 @@ static int omap_rng_probe(struct platform_device *pdev) ret = PTR_ERR(priv->base); goto err_ioremap; } - dev_set_drvdata(&pdev->dev, priv); + platform_set_drvdata(pdev, priv); pm_runtime_enable(&pdev->dev); pm_runtime_get_sync(&pdev->dev); @@ -156,7 +156,7 @@ err_ioremap: static int __exit omap_rng_remove(struct platform_device *pdev) { - struct omap_rng_private_data *priv = dev_get_drvdata(&pdev->dev); + struct omap_rng_private_data *priv = platform_get_drvdata(pdev); hwrng_unregister(&omap_rng_ops); -- cgit v1.2.3-59-g8ed1b From d6dfd683e6b196db4cf92cf226c9f63ee66a5eb1 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Wed, 29 May 2013 17:09:09 +0530 Subject: crypto: dcp - Remove redundant platform_set_drvdata() Commit 0998d06310 (device-core: Ensure drvdata = NULL when no driver is bound) removes the need to set driver data field to NULL. Signed-off-by: Sachin Kamat Tested-by: Tobias Rauter Signed-off-by: Herbert Xu --- drivers/crypto/dcp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/crypto/dcp.c b/drivers/crypto/dcp.c index eea194c34527..83eaefcefb64 100644 --- a/drivers/crypto/dcp.c +++ b/drivers/crypto/dcp.c @@ -876,7 +876,6 @@ static int dcp_remove(struct platform_device *pdev) struct dcp_dev *dev; int j; dev = platform_get_drvdata(pdev); - platform_set_drvdata(pdev, NULL); dma_free_coherent(&pdev->dev, DCP_MAX_PKG * sizeof(struct dcp_hw_packet), -- cgit v1.2.3-59-g8ed1b From 0bc8a7188ff1b9b1cca2e49fd78f0e1797a75631 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Wed, 29 May 2013 17:09:10 +0530 Subject: crypto: dcp - Use devm_* APIs devm_* APIs are device managed and make cleanup and exit code simpler. Signed-off-by: Sachin Kamat Tested-by: Tobias Rauter Signed-off-by: Herbert Xu --- drivers/crypto/dcp.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/drivers/crypto/dcp.c b/drivers/crypto/dcp.c index 83eaefcefb64..9e91c2cf3bf8 100644 --- a/drivers/crypto/dcp.c +++ b/drivers/crypto/dcp.c @@ -723,12 +723,10 @@ static int dcp_probe(struct platform_device *pdev) struct resource *r; int i, ret, j; - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (dev == NULL) { - dev_err(&pdev->dev, "Failed to allocate structure\n"); - ret = -ENOMEM; - goto err; - } + dev = devm_kzalloc(&pdev->dev, sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + global_dev = dev; dev->dev = &pdev->dev; @@ -737,11 +735,10 @@ static int dcp_probe(struct platform_device *pdev) r = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!r) { dev_err(&pdev->dev, "failed to get IORESOURCE_MEM\n"); - ret = -ENXIO; - goto err_dev; + return -ENXIO; } - dev->dcp_regs_base = ioremap(r->start, resource_size(r)); - + dev->dcp_regs_base = devm_ioremap(&pdev->dev, r->start, + resource_size(r)); dcp_set(dev, DCP_CTRL_SFRST, DCP_REG_CTRL); udelay(10); @@ -762,15 +759,13 @@ static int dcp_probe(struct platform_device *pdev) r = platform_get_resource(pdev, IORESOURCE_IRQ, 0); if (!r) { dev_err(&pdev->dev, "can't get IRQ resource (0)\n"); - ret = -EIO; - goto err_unmap_mem; + return -EIO; } dev->dcp_vmi_irq = r->start; ret = request_irq(dev->dcp_vmi_irq, dcp_vmi_irq, 0, "dcp", dev); if (ret != 0) { dev_err(&pdev->dev, "can't request_irq (0)\n"); - ret = -EIO; - goto err_unmap_mem; + return -EIO; } r = platform_get_resource(pdev, IORESOURCE_IRQ, 1); @@ -863,11 +858,7 @@ err_free_irq1: free_irq(dev->dcp_irq, dev); err_free_irq0: free_irq(dev->dcp_vmi_irq, dev); -err_unmap_mem: - iounmap((void *) dev->dcp_regs_base); -err_dev: - kfree(dev); -err: + return ret; } @@ -890,14 +881,11 @@ static int dcp_remove(struct platform_device *pdev) tasklet_kill(&dev->done_task); tasklet_kill(&dev->queue_task); - iounmap((void *) dev->dcp_regs_base); - for (j = 0; j < ARRAY_SIZE(algs); j++) crypto_unregister_alg(&algs[j]); misc_deregister(&dev->dcp_bootstream_misc); - kfree(dev); return 0; } -- cgit v1.2.3-59-g8ed1b From c2415471702e29a1a39a2e8b46996ecc0f95788c Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Wed, 29 May 2013 17:09:11 +0530 Subject: crypto: dcp - Use NULL instead of 0 Use NULL instead of 0 for pointer variables. Signed-off-by: Sachin Kamat Tested-by: Tobias Rauter Signed-off-by: Herbert Xu --- drivers/crypto/dcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/dcp.c b/drivers/crypto/dcp.c index 9e91c2cf3bf8..49dd560c0152 100644 --- a/drivers/crypto/dcp.c +++ b/drivers/crypto/dcp.c @@ -408,7 +408,7 @@ static void dcp_done_task(unsigned long data) ablkcipher_walk_complete(&dev->ctx->walk); dev->ctx->req->base.complete(&dev->ctx->req->base, dev->ctx->stat); - dev->ctx->req = 0; + dev->ctx->req = NULL; /* in case there are other requests in the queue */ tasklet_schedule(&dev->queue_task); } -- cgit v1.2.3-59-g8ed1b From 9a733019aa46c37db6ef4686d23a4f5cdbf6fd90 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Wed, 29 May 2013 17:09:12 +0530 Subject: crypto: dcp - Staticize local symbols These symbols are referenced only in this file and hence should be static. Signed-off-by: Sachin Kamat Tested-by: Tobias Rauter Signed-off-by: Herbert Xu --- drivers/crypto/dcp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/dcp.c b/drivers/crypto/dcp.c index 49dd560c0152..a8a7dd4b0d25 100644 --- a/drivers/crypto/dcp.c +++ b/drivers/crypto/dcp.c @@ -195,7 +195,7 @@ struct dcp_hw_packet { uint32_t stat; }; -struct dcp_dev *global_dev; +static struct dcp_dev *global_dev; static inline u32 dcp_chan_reg(u32 reg, int chan) { @@ -227,14 +227,14 @@ static inline unsigned int dcp_read(struct dcp_dev *dev, u32 reg) return readl(dev->dcp_regs_base + reg); } -void dcp_dma_unmap(struct dcp_dev *dev, struct dcp_hw_packet *pkt) +static void dcp_dma_unmap(struct dcp_dev *dev, struct dcp_hw_packet *pkt) { dma_unmap_page(dev->dev, pkt->src, pkt->size, DMA_TO_DEVICE); dma_unmap_page(dev->dev, pkt->dst, pkt->size, DMA_FROM_DEVICE); dev_dbg(dev->dev, "unmap packet %x", (unsigned int) pkt); } -int dcp_dma_map(struct dcp_dev *dev, +static int dcp_dma_map(struct dcp_dev *dev, struct ablkcipher_walk *walk, struct dcp_hw_packet *pkt) { dev_dbg(dev->dev, "map packet %x", (unsigned int) pkt); @@ -413,7 +413,7 @@ static void dcp_done_task(unsigned long data) tasklet_schedule(&dev->queue_task); } -void dcp_watchdog(unsigned long data) +static void dcp_watchdog(unsigned long data) { struct dcp_dev *dev = (struct dcp_dev *)data; dev->ctx->stat |= dcp_read(dev, -- cgit v1.2.3-59-g8ed1b From 61e2d1a9b2e8105631ac7ff5b048342fcfd3d622 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Sat, 1 Jun 2013 16:05:57 +0900 Subject: crypto: picoxcell - replace strict_strtoul() with kstrtoul() The usage of strict_strtoul() is not preferred, because strict_strtoul() is obsolete. Thus, kstrtoul() should be used. Signed-off-by: Jingoo Han Signed-off-by: Herbert Xu --- drivers/crypto/picoxcell_crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/picoxcell_crypto.c b/drivers/crypto/picoxcell_crypto.c index ac30724d923d..888f7f4a6d3f 100644 --- a/drivers/crypto/picoxcell_crypto.c +++ b/drivers/crypto/picoxcell_crypto.c @@ -1298,7 +1298,7 @@ static ssize_t spacc_stat_irq_thresh_store(struct device *dev, struct spacc_engine *engine = spacc_dev_to_engine(dev); unsigned long thresh; - if (strict_strtoul(buf, 0, &thresh)) + if (kstrtoul(buf, 0, &thresh)) return -EINVAL; thresh = clamp(thresh, 1UL, engine->fifo_sz - 1); -- cgit v1.2.3-59-g8ed1b From beca35d05cc224d7434b0d1b5911c7b25febd336 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 3 Jun 2013 02:02:09 +0200 Subject: hwrng: nomadik - use clk_prepare_enable() The Nomadik HW RNG driver has seen some rust and is not preparing the clock before use. Fix this up so we get rid of runtime complaints from the clock subsystem. Signed-off-by: Linus Walleij Signed-off-by: Herbert Xu --- drivers/char/hw_random/nomadik-rng.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/hw_random/nomadik-rng.c b/drivers/char/hw_random/nomadik-rng.c index 96de0249e595..232b87fb5fc9 100644 --- a/drivers/char/hw_random/nomadik-rng.c +++ b/drivers/char/hw_random/nomadik-rng.c @@ -51,7 +51,7 @@ static int nmk_rng_probe(struct amba_device *dev, const struct amba_id *id) return ret; } - clk_enable(rng_clk); + clk_prepare_enable(rng_clk); ret = amba_request_regions(dev, dev->dev.init_name); if (ret) -- cgit v1.2.3-59-g8ed1b From 046174d766ad40148ecf3564755055fd91ed2569 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 1 Jun 2013 00:22:42 +0200 Subject: hwrng: bcm2835 - fix MODULE_LICENSE tag The MODULE_LICENSE macro invocation must use either "GPL" or "GPL v2", but not "GPLv2" in order to be detected by the module loader. This fixes the allmodconfig build error: FATAL: modpost: GPL-incompatible module bcm2835-rng.ko uses GPL-only symbol 'platform_driver_unregister' Signed-off-by: Arnd Bergmann Cc: Dom Cobley Cc: Lubomir Rintel Cc: Stephen Warren Cc: Matt Mackall Cc: linux-rpi-kernel@lists.infradead.org Acked-by: Lubomir Rintel Signed-off-by: Herbert Xu --- drivers/char/hw_random/bcm2835-rng.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/hw_random/bcm2835-rng.c b/drivers/char/hw_random/bcm2835-rng.c index eb7f14725ebd..43577ca780e3 100644 --- a/drivers/char/hw_random/bcm2835-rng.c +++ b/drivers/char/hw_random/bcm2835-rng.c @@ -110,4 +110,4 @@ module_platform_driver(bcm2835_rng_driver); MODULE_AUTHOR("Lubomir Rintel "); MODULE_DESCRIPTION("BCM2835 Random Number Generator (RNG) driver"); -MODULE_LICENSE("GPLv2"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3-59-g8ed1b From acfffdb803b6ab5d520bf9a816bfb155ba3a263d Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Sat, 8 Jun 2013 12:00:59 +0300 Subject: crypto: camellia-aesni-avx2 - tune assembly code for more performance Add implementation tuned for more performance on real hardware. Changes are mostly around the part mixing 128-bit extract and insert instructions and AES-NI instructions. Also 'vpbroadcastb' instructions have been change to 'vpshufb with zero mask'. Tests on Intel Core i5-4570: tcrypt ECB results, old-AVX2 vs new-AVX2: size 128bit key 256bit key enc dec enc dec 256 1.00x 1.00x 1.00x 1.00x 1k 1.08x 1.09x 1.05x 1.06x 8k 1.06x 1.06x 1.06x 1.06x tcrypt ECB results, AVX vs new-AVX2: size 128bit key 256bit key enc dec enc dec 256 1.00x 1.00x 1.00x 1.00x 1k 1.51x 1.50x 1.52x 1.50x 8k 1.47x 1.48x 1.48x 1.48x Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 160 +++++++++++++++------------ 1 file changed, 89 insertions(+), 71 deletions(-) diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 91a1878fcc3e..0e0b8863a34b 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -51,16 +51,6 @@ #define ymm14_x xmm14 #define ymm15_x xmm15 -/* - * AES-NI instructions do not support ymmX registers, so we need splitting and - * merging. - */ -#define vaesenclast256(zero, yreg, tmp) \ - vextracti128 $1, yreg, tmp##_x; \ - vaesenclast zero##_x, yreg##_x, yreg##_x; \ - vaesenclast zero##_x, tmp##_x, tmp##_x; \ - vinserti128 $1, tmp##_x, yreg, yreg; - /********************************************************************** 32-way camellia **********************************************************************/ @@ -79,46 +69,70 @@ * S-function with AES subbytes \ */ \ vbroadcasti128 .Linv_shift_row, t4; \ - vpbroadcastb .L0f0f0f0f, t7; \ - vbroadcasti128 .Lpre_tf_lo_s1, t0; \ - vbroadcasti128 .Lpre_tf_hi_s1, t1; \ + vpbroadcastd .L0f0f0f0f, t7; \ + vbroadcasti128 .Lpre_tf_lo_s1, t5; \ + vbroadcasti128 .Lpre_tf_hi_s1, t6; \ + vbroadcasti128 .Lpre_tf_lo_s4, t2; \ + vbroadcasti128 .Lpre_tf_hi_s4, t3; \ \ /* AES inverse shift rows */ \ vpshufb t4, x0, x0; \ vpshufb t4, x7, x7; \ - vpshufb t4, x1, x1; \ - vpshufb t4, x4, x4; \ - vpshufb t4, x2, x2; \ - vpshufb t4, x5, x5; \ vpshufb t4, x3, x3; \ vpshufb t4, x6, x6; \ + vpshufb t4, x2, x2; \ + vpshufb t4, x5, x5; \ + vpshufb t4, x1, x1; \ + vpshufb t4, x4, x4; \ \ /* prefilter sboxes 1, 2 and 3 */ \ - vbroadcasti128 .Lpre_tf_lo_s4, t2; \ - vbroadcasti128 .Lpre_tf_hi_s4, t3; \ - filter_8bit(x0, t0, t1, t7, t6); \ - filter_8bit(x7, t0, t1, t7, t6); \ - filter_8bit(x1, t0, t1, t7, t6); \ - filter_8bit(x4, t0, t1, t7, t6); \ - filter_8bit(x2, t0, t1, t7, t6); \ - filter_8bit(x5, t0, t1, t7, t6); \ - \ /* prefilter sbox 4 */ \ + filter_8bit(x0, t5, t6, t7, t4); \ + filter_8bit(x7, t5, t6, t7, t4); \ + vextracti128 $1, x0, t0##_x; \ + vextracti128 $1, x7, t1##_x; \ + filter_8bit(x3, t2, t3, t7, t4); \ + filter_8bit(x6, t2, t3, t7, t4); \ + vextracti128 $1, x3, t3##_x; \ + vextracti128 $1, x6, t2##_x; \ + filter_8bit(x2, t5, t6, t7, t4); \ + filter_8bit(x5, t5, t6, t7, t4); \ + filter_8bit(x1, t5, t6, t7, t4); \ + filter_8bit(x4, t5, t6, t7, t4); \ + \ vpxor t4##_x, t4##_x, t4##_x; \ - filter_8bit(x3, t2, t3, t7, t6); \ - filter_8bit(x6, t2, t3, t7, t6); \ \ /* AES subbytes + AES shift rows */ \ + vextracti128 $1, x2, t6##_x; \ + vextracti128 $1, x5, t5##_x; \ + vaesenclast t4##_x, x0##_x, x0##_x; \ + vaesenclast t4##_x, t0##_x, t0##_x; \ + vinserti128 $1, t0##_x, x0, x0; \ + vaesenclast t4##_x, x7##_x, x7##_x; \ + vaesenclast t4##_x, t1##_x, t1##_x; \ + vinserti128 $1, t1##_x, x7, x7; \ + vaesenclast t4##_x, x3##_x, x3##_x; \ + vaesenclast t4##_x, t3##_x, t3##_x; \ + vinserti128 $1, t3##_x, x3, x3; \ + vaesenclast t4##_x, x6##_x, x6##_x; \ + vaesenclast t4##_x, t2##_x, t2##_x; \ + vinserti128 $1, t2##_x, x6, x6; \ + vextracti128 $1, x1, t3##_x; \ + vextracti128 $1, x4, t2##_x; \ vbroadcasti128 .Lpost_tf_lo_s1, t0; \ vbroadcasti128 .Lpost_tf_hi_s1, t1; \ - vaesenclast256(t4, x0, t5); \ - vaesenclast256(t4, x7, t5); \ - vaesenclast256(t4, x1, t5); \ - vaesenclast256(t4, x4, t5); \ - vaesenclast256(t4, x2, t5); \ - vaesenclast256(t4, x5, t5); \ - vaesenclast256(t4, x3, t5); \ - vaesenclast256(t4, x6, t5); \ + vaesenclast t4##_x, x2##_x, x2##_x; \ + vaesenclast t4##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x2, x2; \ + vaesenclast t4##_x, x5##_x, x5##_x; \ + vaesenclast t4##_x, t5##_x, t5##_x; \ + vinserti128 $1, t5##_x, x5, x5; \ + vaesenclast t4##_x, x1##_x, x1##_x; \ + vaesenclast t4##_x, t3##_x, t3##_x; \ + vinserti128 $1, t3##_x, x1, x1; \ + vaesenclast t4##_x, x4##_x, x4##_x; \ + vaesenclast t4##_x, t2##_x, t2##_x; \ + vinserti128 $1, t2##_x, x4, x4; \ \ /* postfilter sboxes 1 and 4 */ \ vbroadcasti128 .Lpost_tf_lo_s3, t2; \ @@ -139,22 +153,12 @@ /* postfilter sbox 2 */ \ filter_8bit(x1, t4, t5, t7, t2); \ filter_8bit(x4, t4, t5, t7, t2); \ + vpxor t7, t7, t7; \ \ vpsrldq $1, t0, t1; \ vpsrldq $2, t0, t2; \ + vpshufb t7, t1, t1; \ vpsrldq $3, t0, t3; \ - vpsrldq $4, t0, t4; \ - vpsrldq $5, t0, t5; \ - vpsrldq $6, t0, t6; \ - vpsrldq $7, t0, t7; \ - vpbroadcastb t0##_x, t0; \ - vpbroadcastb t1##_x, t1; \ - vpbroadcastb t2##_x, t2; \ - vpbroadcastb t3##_x, t3; \ - vpbroadcastb t4##_x, t4; \ - vpbroadcastb t6##_x, t6; \ - vpbroadcastb t5##_x, t5; \ - vpbroadcastb t7##_x, t7; \ \ /* P-function */ \ vpxor x5, x0, x0; \ @@ -162,11 +166,21 @@ vpxor x7, x2, x2; \ vpxor x4, x3, x3; \ \ + vpshufb t7, t2, t2; \ + vpsrldq $4, t0, t4; \ + vpshufb t7, t3, t3; \ + vpsrldq $5, t0, t5; \ + vpshufb t7, t4, t4; \ + \ vpxor x2, x4, x4; \ vpxor x3, x5, x5; \ vpxor x0, x6, x6; \ vpxor x1, x7, x7; \ \ + vpsrldq $6, t0, t6; \ + vpshufb t7, t5, t5; \ + vpshufb t7, t6, t6; \ + \ vpxor x7, x0, x0; \ vpxor x4, x1, x1; \ vpxor x5, x2, x2; \ @@ -179,12 +193,16 @@ \ /* Add key material and result to CD (x becomes new CD) */ \ \ - vpxor t7, x0, x0; \ - vpxor 4 * 32(mem_cd), x0, x0; \ - \ vpxor t6, x1, x1; \ vpxor 5 * 32(mem_cd), x1, x1; \ \ + vpsrldq $7, t0, t6; \ + vpshufb t7, t0, t0; \ + vpshufb t7, t6, t7; \ + \ + vpxor t7, x0, x0; \ + vpxor 4 * 32(mem_cd), x0, x0; \ + \ vpxor t5, x2, x2; \ vpxor 6 * 32(mem_cd), x2, x2; \ \ @@ -204,7 +222,7 @@ vpxor 3 * 32(mem_cd), x7, x7; /* - * Size optimization... with inlined roundsm16 binary would be over 5 times + * Size optimization... with inlined roundsm32 binary would be over 5 times * larger and would only marginally faster. */ .align 8 @@ -324,13 +342,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) */ \ vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ vpxor tt0, tt0, tt0; \ - vpbroadcastb t0##_x, t3; \ + vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t2; \ + vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t1; \ + vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t0; \ + vpshufb tt0, t0, t0; \ \ vpand l0, t0, t0; \ vpand l1, t1, t1; \ @@ -340,6 +358,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ \ vpxor l4, t0, l4; \ + vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ vmovdqu l4, 4 * 32(l); \ vpxor l5, t1, l5; \ vmovdqu l5, 5 * 32(l); \ @@ -354,14 +373,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) * rl ^= t2; \ */ \ \ - vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ - vpbroadcastb t0##_x, t3; \ + vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t2; \ + vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t1; \ + vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t0; \ + vpshufb tt0, t0, t0; \ \ vpor 4 * 32(r), t0, t0; \ vpor 5 * 32(r), t1, t1; \ @@ -373,6 +391,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) vpxor 2 * 32(r), t2, t2; \ vpxor 3 * 32(r), t3, t3; \ vmovdqu t0, 0 * 32(r); \ + vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ vmovdqu t1, 1 * 32(r); \ vmovdqu t2, 2 * 32(r); \ vmovdqu t3, 3 * 32(r); \ @@ -382,14 +401,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) * t2 &= rl; \ * rr ^= rol32(t2, 1); \ */ \ - vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ - vpbroadcastb t0##_x, t3; \ + vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t2; \ + vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t1; \ + vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t0; \ + vpshufb tt0, t0, t0; \ \ vpand 0 * 32(r), t0, t0; \ vpand 1 * 32(r), t1, t1; \ @@ -403,6 +421,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) vpxor 6 * 32(r), t2, t2; \ vpxor 7 * 32(r), t3, t3; \ vmovdqu t0, 4 * 32(r); \ + vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ vmovdqu t1, 5 * 32(r); \ vmovdqu t2, 6 * 32(r); \ vmovdqu t3, 7 * 32(r); \ @@ -413,14 +432,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) * ll ^= t0; \ */ \ \ - vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ - vpbroadcastb t0##_x, t3; \ + vpshufb tt0, t0, t3; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t2; \ + vpshufb tt0, t0, t2; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t1; \ + vpshufb tt0, t0, t1; \ vpsrldq $1, t0, t0; \ - vpbroadcastb t0##_x, t0; \ + vpshufb tt0, t0, t0; \ \ vpor l4, t0, t0; \ vpor l5, t1, t1; \ -- cgit v1.2.3-59-g8ed1b From 3d387ef08c40382315b8e9baa4bc9a07f7c49fce Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Sat, 8 Jun 2013 12:17:42 +0300 Subject: Revert "crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher" This reverts commit 604880107010a1e5794552d184cd5471ea31b973. Instruction (vpgatherdd) that this implementation relied on turned out to be slow performer on real hardware (i5-4570). The previous 4-way blowfish implementation is therefore faster and this implementation should be removed. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 4 - arch/x86/crypto/blowfish-avx2-asm_64.S | 449 ------------------------- arch/x86/crypto/blowfish_avx2_glue.c | 585 --------------------------------- arch/x86/crypto/blowfish_glue.c | 32 +- arch/x86/include/asm/crypto/blowfish.h | 43 --- crypto/Kconfig | 18 - crypto/testmgr.c | 12 - 7 files changed, 24 insertions(+), 1119 deletions(-) delete mode 100644 arch/x86/crypto/blowfish-avx2-asm_64.S delete mode 100644 arch/x86/crypto/blowfish_avx2_glue.c delete mode 100644 arch/x86/include/asm/crypto/blowfish.h diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 94cb151adc1d..9ce341839f4a 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -3,8 +3,6 @@ # avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) -avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ - $(comma)4)$(comma)%ymm2,yes,no) obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o @@ -43,7 +41,6 @@ endif # These modules require assembler to support AVX2. ifeq ($(avx2_supported),yes) - obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o @@ -74,7 +71,6 @@ ifeq ($(avx_supported),yes) endif ifeq ($(avx2_supported),yes) - blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o diff --git a/arch/x86/crypto/blowfish-avx2-asm_64.S b/arch/x86/crypto/blowfish-avx2-asm_64.S deleted file mode 100644 index 784452e0d05d..000000000000 --- a/arch/x86/crypto/blowfish-avx2-asm_64.S +++ /dev/null @@ -1,449 +0,0 @@ -/* - * x86_64/AVX2 assembler optimized version of Blowfish - * - * Copyright © 2012-2013 Jussi Kivilinna - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#include - -.file "blowfish-avx2-asm_64.S" - -.data -.align 32 - -.Lprefetch_mask: -.long 0*64 -.long 1*64 -.long 2*64 -.long 3*64 -.long 4*64 -.long 5*64 -.long 6*64 -.long 7*64 - -.Lbswap32_mask: -.long 0x00010203 -.long 0x04050607 -.long 0x08090a0b -.long 0x0c0d0e0f - -.Lbswap128_mask: - .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.Lbswap_iv_mask: - .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0 - -.text -/* structure of crypto context */ -#define p 0 -#define s0 ((16 + 2) * 4) -#define s1 ((16 + 2 + (1 * 256)) * 4) -#define s2 ((16 + 2 + (2 * 256)) * 4) -#define s3 ((16 + 2 + (3 * 256)) * 4) - -/* register macros */ -#define CTX %rdi -#define RIO %rdx - -#define RS0 %rax -#define RS1 %r8 -#define RS2 %r9 -#define RS3 %r10 - -#define RLOOP %r11 -#define RLOOPd %r11d - -#define RXr0 %ymm8 -#define RXr1 %ymm9 -#define RXr2 %ymm10 -#define RXr3 %ymm11 -#define RXl0 %ymm12 -#define RXl1 %ymm13 -#define RXl2 %ymm14 -#define RXl3 %ymm15 - -/* temp regs */ -#define RT0 %ymm0 -#define RT0x %xmm0 -#define RT1 %ymm1 -#define RT1x %xmm1 -#define RIDX0 %ymm2 -#define RIDX1 %ymm3 -#define RIDX1x %xmm3 -#define RIDX2 %ymm4 -#define RIDX3 %ymm5 - -/* vpgatherdd mask and '-1' */ -#define RNOT %ymm6 - -/* byte mask, (-1 >> 24) */ -#define RBYTE %ymm7 - -/*********************************************************************** - * 32-way AVX2 blowfish - ***********************************************************************/ -#define F(xl, xr) \ - vpsrld $24, xl, RIDX0; \ - vpsrld $16, xl, RIDX1; \ - vpsrld $8, xl, RIDX2; \ - vpand RBYTE, RIDX1, RIDX1; \ - vpand RBYTE, RIDX2, RIDX2; \ - vpand RBYTE, xl, RIDX3; \ - \ - vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpcmpeqd RIDX0, RIDX0, RIDX0; \ - \ - vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \ - vpcmpeqd RIDX1, RIDX1, RIDX1; \ - vpaddd RT0, RT1, RT0; \ - \ - vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \ - vpxor RT0, RT1, RT0; \ - \ - vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpaddd RT0, RT1, RT0; \ - \ - vpxor RT0, xr, xr; - -#define add_roundkey(xl, nmem) \ - vpbroadcastd nmem, RT0; \ - vpxor RT0, xl ## 0, xl ## 0; \ - vpxor RT0, xl ## 1, xl ## 1; \ - vpxor RT0, xl ## 2, xl ## 2; \ - vpxor RT0, xl ## 3, xl ## 3; - -#define round_enc() \ - add_roundkey(RXr, p(CTX,RLOOP,4)); \ - F(RXl0, RXr0); \ - F(RXl1, RXr1); \ - F(RXl2, RXr2); \ - F(RXl3, RXr3); \ - \ - add_roundkey(RXl, p+4(CTX,RLOOP,4)); \ - F(RXr0, RXl0); \ - F(RXr1, RXl1); \ - F(RXr2, RXl2); \ - F(RXr3, RXl3); - -#define round_dec() \ - add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \ - F(RXl0, RXr0); \ - F(RXl1, RXr1); \ - F(RXl2, RXr2); \ - F(RXl3, RXr3); \ - \ - add_roundkey(RXl, p+4(CTX,RLOOP,4)); \ - F(RXr0, RXl0); \ - F(RXr1, RXl1); \ - F(RXr2, RXl2); \ - F(RXr3, RXl3); - -#define init_round_constants() \ - vpcmpeqd RNOT, RNOT, RNOT; \ - leaq s0(CTX), RS0; \ - leaq s1(CTX), RS1; \ - leaq s2(CTX), RS2; \ - leaq s3(CTX), RS3; \ - vpsrld $24, RNOT, RBYTE; - -#define transpose_2x2(x0, x1, t0) \ - vpunpckldq x0, x1, t0; \ - vpunpckhdq x0, x1, x1; \ - \ - vpunpcklqdq t0, x1, x0; \ - vpunpckhqdq t0, x1, x1; - -#define read_block(xl, xr) \ - vbroadcasti128 .Lbswap32_mask, RT1; \ - \ - vpshufb RT1, xl ## 0, xl ## 0; \ - vpshufb RT1, xr ## 0, xr ## 0; \ - vpshufb RT1, xl ## 1, xl ## 1; \ - vpshufb RT1, xr ## 1, xr ## 1; \ - vpshufb RT1, xl ## 2, xl ## 2; \ - vpshufb RT1, xr ## 2, xr ## 2; \ - vpshufb RT1, xl ## 3, xl ## 3; \ - vpshufb RT1, xr ## 3, xr ## 3; \ - \ - transpose_2x2(xl ## 0, xr ## 0, RT0); \ - transpose_2x2(xl ## 1, xr ## 1, RT0); \ - transpose_2x2(xl ## 2, xr ## 2, RT0); \ - transpose_2x2(xl ## 3, xr ## 3, RT0); - -#define write_block(xl, xr) \ - vbroadcasti128 .Lbswap32_mask, RT1; \ - \ - transpose_2x2(xl ## 0, xr ## 0, RT0); \ - transpose_2x2(xl ## 1, xr ## 1, RT0); \ - transpose_2x2(xl ## 2, xr ## 2, RT0); \ - transpose_2x2(xl ## 3, xr ## 3, RT0); \ - \ - vpshufb RT1, xl ## 0, xl ## 0; \ - vpshufb RT1, xr ## 0, xr ## 0; \ - vpshufb RT1, xl ## 1, xl ## 1; \ - vpshufb RT1, xr ## 1, xr ## 1; \ - vpshufb RT1, xl ## 2, xl ## 2; \ - vpshufb RT1, xr ## 2, xr ## 2; \ - vpshufb RT1, xl ## 3, xl ## 3; \ - vpshufb RT1, xr ## 3, xr ## 3; - -.align 8 -__blowfish_enc_blk32: - /* input: - * %rdi: ctx, CTX - * RXl0..4, RXr0..4: plaintext - * output: - * RXl0..4, RXr0..4: ciphertext (RXl <=> RXr swapped) - */ - init_round_constants(); - - read_block(RXl, RXr); - - movl $1, RLOOPd; - add_roundkey(RXl, p+4*(0)(CTX)); - -.align 4 -.L__enc_loop: - round_enc(); - - leal 2(RLOOPd), RLOOPd; - cmpl $17, RLOOPd; - jne .L__enc_loop; - - add_roundkey(RXr, p+4*(17)(CTX)); - - write_block(RXl, RXr); - - ret; -ENDPROC(__blowfish_enc_blk32) - -.align 8 -__blowfish_dec_blk32: - /* input: - * %rdi: ctx, CTX - * RXl0..4, RXr0..4: ciphertext - * output: - * RXl0..4, RXr0..4: plaintext (RXl <=> RXr swapped) - */ - init_round_constants(); - - read_block(RXl, RXr); - - movl $14, RLOOPd; - add_roundkey(RXl, p+4*(17)(CTX)); - -.align 4 -.L__dec_loop: - round_dec(); - - addl $-2, RLOOPd; - jns .L__dec_loop; - - add_roundkey(RXr, p+4*(0)(CTX)); - - write_block(RXl, RXr); - - ret; -ENDPROC(__blowfish_dec_blk32) - -ENTRY(blowfish_ecb_enc_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - - vmovdqu 0*32(%rdx), RXl0; - vmovdqu 1*32(%rdx), RXr0; - vmovdqu 2*32(%rdx), RXl1; - vmovdqu 3*32(%rdx), RXr1; - vmovdqu 4*32(%rdx), RXl2; - vmovdqu 5*32(%rdx), RXr2; - vmovdqu 6*32(%rdx), RXl3; - vmovdqu 7*32(%rdx), RXr3; - - call __blowfish_enc_blk32; - - vmovdqu RXr0, 0*32(%rsi); - vmovdqu RXl0, 1*32(%rsi); - vmovdqu RXr1, 2*32(%rsi); - vmovdqu RXl1, 3*32(%rsi); - vmovdqu RXr2, 4*32(%rsi); - vmovdqu RXl2, 5*32(%rsi); - vmovdqu RXr3, 6*32(%rsi); - vmovdqu RXl3, 7*32(%rsi); - - vzeroupper; - - ret; -ENDPROC(blowfish_ecb_enc_32way) - -ENTRY(blowfish_ecb_dec_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - - vmovdqu 0*32(%rdx), RXl0; - vmovdqu 1*32(%rdx), RXr0; - vmovdqu 2*32(%rdx), RXl1; - vmovdqu 3*32(%rdx), RXr1; - vmovdqu 4*32(%rdx), RXl2; - vmovdqu 5*32(%rdx), RXr2; - vmovdqu 6*32(%rdx), RXl3; - vmovdqu 7*32(%rdx), RXr3; - - call __blowfish_dec_blk32; - - vmovdqu RXr0, 0*32(%rsi); - vmovdqu RXl0, 1*32(%rsi); - vmovdqu RXr1, 2*32(%rsi); - vmovdqu RXl1, 3*32(%rsi); - vmovdqu RXr2, 4*32(%rsi); - vmovdqu RXl2, 5*32(%rsi); - vmovdqu RXr3, 6*32(%rsi); - vmovdqu RXl3, 7*32(%rsi); - - vzeroupper; - - ret; -ENDPROC(blowfish_ecb_dec_32way) - -ENTRY(blowfish_cbc_dec_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - - vmovdqu 0*32(%rdx), RXl0; - vmovdqu 1*32(%rdx), RXr0; - vmovdqu 2*32(%rdx), RXl1; - vmovdqu 3*32(%rdx), RXr1; - vmovdqu 4*32(%rdx), RXl2; - vmovdqu 5*32(%rdx), RXr2; - vmovdqu 6*32(%rdx), RXl3; - vmovdqu 7*32(%rdx), RXr3; - - call __blowfish_dec_blk32; - - /* xor with src */ - vmovq (%rdx), RT0x; - vpshufd $0x4f, RT0x, RT0x; - vinserti128 $1, 8(%rdx), RT0, RT0; - vpxor RT0, RXr0, RXr0; - vpxor 0*32+24(%rdx), RXl0, RXl0; - vpxor 1*32+24(%rdx), RXr1, RXr1; - vpxor 2*32+24(%rdx), RXl1, RXl1; - vpxor 3*32+24(%rdx), RXr2, RXr2; - vpxor 4*32+24(%rdx), RXl2, RXl2; - vpxor 5*32+24(%rdx), RXr3, RXr3; - vpxor 6*32+24(%rdx), RXl3, RXl3; - - vmovdqu RXr0, (0*32)(%rsi); - vmovdqu RXl0, (1*32)(%rsi); - vmovdqu RXr1, (2*32)(%rsi); - vmovdqu RXl1, (3*32)(%rsi); - vmovdqu RXr2, (4*32)(%rsi); - vmovdqu RXl2, (5*32)(%rsi); - vmovdqu RXr3, (6*32)(%rsi); - vmovdqu RXl3, (7*32)(%rsi); - - vzeroupper; - - ret; -ENDPROC(blowfish_cbc_dec_32way) - -ENTRY(blowfish_ctr_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (big endian, 64bit) - */ - - vzeroupper; - - vpcmpeqd RT0, RT0, RT0; - vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */ - - vpcmpeqd RT1x, RT1x, RT1x; - vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */ - vpxor RIDX0, RIDX0, RIDX0; - vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */ - - vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */ - - vpcmpeqd RT1, RT1, RT1; - vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */ - vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */ - - vbroadcasti128 .Lbswap_iv_mask, RIDX0; - vbroadcasti128 .Lbswap128_mask, RIDX1; - - /* load IV and byteswap */ - vmovq (%rcx), RT1x; - vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */ - vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */ - - /* construct IVs */ - vpsubq RT0, RT1, RT1; /* a: le1, b: le0, c: le3, d: le2 */ - vpshufb RIDX1, RT1, RXl0; /* a: be0, b: be1, c: be2, d: be3 */ - vpsubq RIDX2, RT1, RT1; /* le5, le4, le7, le6 */ - vpshufb RIDX1, RT1, RXr0; /* be4, be5, be6, be7 */ - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXl1; - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXr1; - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXl2; - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXr2; - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXl3; - vpsubq RIDX2, RT1, RT1; - vpshufb RIDX1, RT1, RXr3; - - /* store last IV */ - vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */ - vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */ - vmovq RT1x, (%rcx); - - call __blowfish_enc_blk32; - - /* dst = src ^ iv */ - vpxor 0*32(%rdx), RXr0, RXr0; - vpxor 1*32(%rdx), RXl0, RXl0; - vpxor 2*32(%rdx), RXr1, RXr1; - vpxor 3*32(%rdx), RXl1, RXl1; - vpxor 4*32(%rdx), RXr2, RXr2; - vpxor 5*32(%rdx), RXl2, RXl2; - vpxor 6*32(%rdx), RXr3, RXr3; - vpxor 7*32(%rdx), RXl3, RXl3; - vmovdqu RXr0, (0*32)(%rsi); - vmovdqu RXl0, (1*32)(%rsi); - vmovdqu RXr1, (2*32)(%rsi); - vmovdqu RXl1, (3*32)(%rsi); - vmovdqu RXr2, (4*32)(%rsi); - vmovdqu RXl2, (5*32)(%rsi); - vmovdqu RXr3, (6*32)(%rsi); - vmovdqu RXl3, (7*32)(%rsi); - - vzeroupper; - - ret; -ENDPROC(blowfish_ctr_32way) diff --git a/arch/x86/crypto/blowfish_avx2_glue.c b/arch/x86/crypto/blowfish_avx2_glue.c deleted file mode 100644 index 4417e9aea78d..000000000000 --- a/arch/x86/crypto/blowfish_avx2_glue.c +++ /dev/null @@ -1,585 +0,0 @@ -/* - * Glue Code for x86_64/AVX2 assembler optimized version of Blowfish - * - * Copyright © 2012-2013 Jussi Kivilinna - * - * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: - * Copyright (c) 2006 Herbert Xu - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 2007 - Joy Latten - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define BF_AVX2_PARALLEL_BLOCKS 32 - -/* 32-way AVX2 parallel cipher functions */ -asmlinkage void blowfish_ecb_enc_32way(struct bf_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void blowfish_ecb_dec_32way(struct bf_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void blowfish_cbc_dec_32way(struct bf_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void blowfish_ctr_32way(struct bf_ctx *ctx, u8 *dst, const u8 *src, - __be64 *iv); - -static inline bool bf_fpu_begin(bool fpu_enabled, unsigned int nbytes) -{ - if (fpu_enabled) - return true; - - /* FPU is only used when chunk to be processed is large enough, so - * do not enable FPU until it is necessary. - */ - if (nbytes < BF_BLOCK_SIZE * BF_AVX2_PARALLEL_BLOCKS) - return false; - - kernel_fpu_begin(); - return true; -} - -static inline void bf_fpu_end(bool fpu_enabled) -{ - if (fpu_enabled) - kernel_fpu_end(); -} - -static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, - bool enc) -{ - bool fpu_enabled = false; - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = BF_BLOCK_SIZE; - unsigned int nbytes; - int err; - - err = blkcipher_walk_virt(desc, walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - while ((nbytes = walk->nbytes)) { - u8 *wsrc = walk->src.virt.addr; - u8 *wdst = walk->dst.virt.addr; - - fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); - - /* Process multi-block AVX2 batch */ - if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { - do { - if (enc) - blowfish_ecb_enc_32way(ctx, wdst, wsrc); - else - blowfish_ecb_dec_32way(ctx, wdst, wsrc); - - wsrc += bsize * BF_AVX2_PARALLEL_BLOCKS; - wdst += bsize * BF_AVX2_PARALLEL_BLOCKS; - nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS; - } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Process multi-block batch */ - if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { - do { - if (enc) - blowfish_enc_blk_4way(ctx, wdst, wsrc); - else - blowfish_dec_blk_4way(ctx, wdst, wsrc); - - wsrc += bsize * BF_PARALLEL_BLOCKS; - wdst += bsize * BF_PARALLEL_BLOCKS; - nbytes -= bsize * BF_PARALLEL_BLOCKS; - } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - if (enc) - blowfish_enc_blk(ctx, wdst, wsrc); - else - blowfish_dec_blk(ctx, wdst, wsrc); - - wsrc += bsize; - wdst += bsize; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - err = blkcipher_walk_done(desc, walk, nbytes); - } - - bf_fpu_end(fpu_enabled); - return err; -} - -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, true); -} - -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, false); -} - -static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = BF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - u64 *iv = (u64 *)walk->iv; - - do { - *dst = *src ^ *iv; - blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - - *(u64 *)walk->iv = *iv; - return nbytes; -} - -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - - while ((nbytes = walk.nbytes)) { - nbytes = __cbc_encrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - return err; -} - -static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = BF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - u64 last_iv; - int i; - - /* Start of the last block. */ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - /* Process multi-block AVX2 batch */ - if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { - do { - nbytes -= bsize * (BF_AVX2_PARALLEL_BLOCKS - 1); - src -= BF_AVX2_PARALLEL_BLOCKS - 1; - dst -= BF_AVX2_PARALLEL_BLOCKS - 1; - - blowfish_cbc_dec_32way(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - *dst ^= *(src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Process multi-block batch */ - if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { - u64 ivs[BF_PARALLEL_BLOCKS - 1]; - - do { - nbytes -= bsize * (BF_PARALLEL_BLOCKS - 1); - src -= BF_PARALLEL_BLOCKS - 1; - dst -= BF_PARALLEL_BLOCKS - 1; - - for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++) - ivs[i] = src[i]; - - blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src); - - for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++) - dst[i + 1] ^= ivs[i]; - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - *dst ^= *(src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - for (;;) { - blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - break; - - *dst ^= *(src - 1); - src -= 1; - dst -= 1; - } - -done: - *dst ^= *(u64 *)walk->iv; - *(u64 *)walk->iv = last_iv; - - return nbytes; -} - -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - bool fpu_enabled = false; - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - while ((nbytes = walk.nbytes)) { - fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); - nbytes = __cbc_decrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - bf_fpu_end(fpu_enabled); - return err; -} - -static void ctr_crypt_final(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - u8 *ctrblk = walk->iv; - u8 keystream[BF_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - blowfish_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); - - crypto_inc(ctrblk, BF_BLOCK_SIZE); -} - -static unsigned int __ctr_crypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = BF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - int i; - - /* Process multi-block AVX2 batch */ - if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { - do { - blowfish_ctr_32way(ctx, (u8 *)dst, (u8 *)src, - (__be64 *)walk->iv); - - src += BF_AVX2_PARALLEL_BLOCKS; - dst += BF_AVX2_PARALLEL_BLOCKS; - nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS; - } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Process four block batch */ - if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { - __be64 ctrblocks[BF_PARALLEL_BLOCKS]; - u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); - - do { - /* create ctrblks for parallel encrypt */ - for (i = 0; i < BF_PARALLEL_BLOCKS; i++) { - if (dst != src) - dst[i] = src[i]; - - ctrblocks[i] = cpu_to_be64(ctrblk++); - } - - blowfish_enc_blk_xor_4way(ctx, (u8 *)dst, - (u8 *)ctrblocks); - - src += BF_PARALLEL_BLOCKS; - dst += BF_PARALLEL_BLOCKS; - nbytes -= bsize * BF_PARALLEL_BLOCKS; - } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); - - *(__be64 *)walk->iv = cpu_to_be64(ctrblk); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - u64 ctrblk; - - if (dst != src) - *dst = *src; - - ctrblk = *(u64 *)walk->iv; - be64_add_cpu((__be64 *)walk->iv, 1); - - blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); - - src += 1; - dst += 1; - } while ((nbytes -= bsize) >= bsize); - -done: - return nbytes; -} - -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - bool fpu_enabled = false; - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) { - fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); - nbytes = __ctr_crypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - bf_fpu_end(fpu_enabled); - - if (walk.nbytes) { - ctr_crypt_final(desc, &walk); - err = blkcipher_walk_done(desc, &walk, 0); - } - - return err; -} - -static struct crypto_alg bf_algs[6] = { { - .cra_name = "__ecb-blowfish-avx2", - .cra_driver_name = "__driver-ecb-blowfish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = BF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .setkey = blowfish_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "__cbc-blowfish-avx2", - .cra_driver_name = "__driver-cbc-blowfish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = BF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .setkey = blowfish_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "__ctr-blowfish-avx2", - .cra_driver_name = "__driver-ctr-blowfish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .ivsize = BF_BLOCK_SIZE, - .setkey = blowfish_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}, { - .cra_name = "ecb(blowfish)", - .cra_driver_name = "ecb-blowfish-avx2", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = BF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "cbc(blowfish)", - .cra_driver_name = "cbc-blowfish-avx2", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = BF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .ivsize = BF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = __ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "ctr(blowfish)", - .cra_driver_name = "ctr-blowfish-avx2", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .ivsize = BF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_encrypt, - .geniv = "chainiv", - }, - }, -} }; - - -static int __init init(void) -{ - u64 xcr0; - - if (!cpu_has_avx2 || !cpu_has_osxsave) { - pr_info("AVX2 instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); - return -ENODEV; - } - - return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs)); -} - -static void __exit fini(void) -{ - crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs)); -} - -module_init(init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Blowfish Cipher Algorithm, AVX2 optimized"); -MODULE_ALIAS("blowfish"); -MODULE_ALIAS("blowfish-asm"); diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 3548d76dbaa9..50ec333b70e6 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -1,7 +1,7 @@ /* * Glue Code for assembler optimized version of Blowfish * - * Copyright © 2011-2013 Jussi Kivilinna + * Copyright (c) 2011 Jussi Kivilinna * * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: * Copyright (c) 2006 Herbert Xu @@ -32,24 +32,40 @@ #include #include #include -#include /* regular block cipher functions */ asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, bool xor); -EXPORT_SYMBOL_GPL(__blowfish_enc_blk); - asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); -EXPORT_SYMBOL_GPL(blowfish_dec_blk); /* 4-way parallel cipher functions */ asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, const u8 *src, bool xor); -EXPORT_SYMBOL_GPL(__blowfish_enc_blk_4way); - asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, const u8 *src); -EXPORT_SYMBOL_GPL(blowfish_dec_blk_4way); + +static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src) +{ + __blowfish_enc_blk(ctx, dst, src, false); +} + +static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk(ctx, dst, src, true); +} + +static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk_4way(ctx, dst, src, false); +} + +static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk_4way(ctx, dst, src, true); +} static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { diff --git a/arch/x86/include/asm/crypto/blowfish.h b/arch/x86/include/asm/crypto/blowfish.h deleted file mode 100644 index f097b2face10..000000000000 --- a/arch/x86/include/asm/crypto/blowfish.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef ASM_X86_BLOWFISH_H -#define ASM_X86_BLOWFISH_H - -#include -#include - -#define BF_PARALLEL_BLOCKS 4 - -/* regular block cipher functions */ -asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, - bool xor); -asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); - -/* 4-way parallel cipher functions */ -asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, - const u8 *src); - -static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src) -{ - __blowfish_enc_blk(ctx, dst, src, false); -} - -static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst, - const u8 *src) -{ - __blowfish_enc_blk(ctx, dst, src, true); -} - -static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, - const u8 *src) -{ - __blowfish_enc_blk_4way(ctx, dst, src, false); -} - -static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst, - const u8 *src) -{ - __blowfish_enc_blk_4way(ctx, dst, src, true); -} - -#endif diff --git a/crypto/Kconfig b/crypto/Kconfig index d1ca6312d798..4ef0ee715171 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -839,24 +839,6 @@ config CRYPTO_BLOWFISH_X86_64 See also: -config CRYPTO_BLOWFISH_AVX2_X86_64 - tristate "Blowfish cipher algorithm (x86_64/AVX2)" - depends on X86 && 64BIT - select CRYPTO_ALGAPI - select CRYPTO_CRYPTD - select CRYPTO_ABLK_HELPER_X86 - select CRYPTO_BLOWFISH_COMMON - select CRYPTO_BLOWFISH_X86_64 - help - Blowfish cipher algorithm (x86_64/AVX2), by Bruce Schneier. - - This is a variable key length cipher which can use keys from 32 - bits to 448 bits in length. It's fast, simple and specifically - designed for use on "large microprocessors". - - See also: - - config CRYPTO_CAMELLIA tristate "Camellia cipher algorithms" depends on CRYPTO diff --git a/crypto/testmgr.c b/crypto/testmgr.c index f19a392ade78..27f111876523 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1660,9 +1660,6 @@ static const struct alg_test_desc alg_test_descs[] = { .alg = "__driver-cbc-aes-aesni", .test = alg_test_null, .fips_allowed = 1, - }, { - .alg = "__driver-cbc-blowfish-avx2", - .test = alg_test_null, }, { .alg = "__driver-cbc-camellia-aesni", .test = alg_test_null, @@ -1694,9 +1691,6 @@ static const struct alg_test_desc alg_test_descs[] = { .alg = "__driver-ecb-aes-aesni", .test = alg_test_null, .fips_allowed = 1, - }, { - .alg = "__driver-ecb-blowfish-avx2", - .test = alg_test_null, }, { .alg = "__driver-ecb-camellia-aesni", .test = alg_test_null, @@ -1987,9 +1981,6 @@ static const struct alg_test_desc alg_test_descs[] = { .alg = "cryptd(__driver-cbc-aes-aesni)", .test = alg_test_null, .fips_allowed = 1, - }, { - .alg = "cryptd(__driver-cbc-blowfish-avx2)", - .test = alg_test_null, }, { .alg = "cryptd(__driver-cbc-camellia-aesni)", .test = alg_test_null, @@ -2003,9 +1994,6 @@ static const struct alg_test_desc alg_test_descs[] = { .alg = "cryptd(__driver-ecb-aes-aesni)", .test = alg_test_null, .fips_allowed = 1, - }, { - .alg = "cryptd(__driver-ecb-blowfish-avx2)", - .test = alg_test_null, }, { .alg = "cryptd(__driver-ecb-camellia-aesni)", .test = alg_test_null, -- cgit v1.2.3-59-g8ed1b From 99f42f937a080995b34e1ed75ed6934b5f96f9ca Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Sat, 8 Jun 2013 12:17:47 +0300 Subject: Revert "crypto: twofish - add AVX2/x86_64 assembler implementation of twofish cipher" This reverts commit cf1521a1a5e21fd1e79a458605c4282fbfbbeee2. Instruction (vpgatherdd) that this implementation relied on turned out to be slow performer on real hardware (i5-4570). The previous 8-way twofish/AVX implementation is therefore faster and this implementation should be removed. Converting this implementation to use the same method as in twofish/AVX for table look-ups would give additional ~3% speed up vs twofish/AVX, but would hardly be worth of the added code and binary size. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 - arch/x86/crypto/twofish-avx2-asm_64.S | 600 ---------------------------------- arch/x86/crypto/twofish_avx2_glue.c | 584 --------------------------------- arch/x86/crypto/twofish_avx_glue.c | 14 +- arch/x86/include/asm/crypto/twofish.h | 18 - crypto/Kconfig | 24 -- crypto/testmgr.c | 12 - 7 files changed, 2 insertions(+), 1252 deletions(-) delete mode 100644 arch/x86/crypto/twofish-avx2-asm_64.S delete mode 100644 arch/x86/crypto/twofish_avx2_glue.c diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 9ce341839f4a..7d6ba9db1be9 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -43,7 +43,6 @@ endif ifeq ($(avx2_supported),yes) obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o - obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o endif aes-i586-y := aes-i586-asm_32.o aes_glue.o @@ -73,7 +72,6 @@ endif ifeq ($(avx2_supported),yes) camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o - twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o endif aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o diff --git a/arch/x86/crypto/twofish-avx2-asm_64.S b/arch/x86/crypto/twofish-avx2-asm_64.S deleted file mode 100644 index e1a83b9cd389..000000000000 --- a/arch/x86/crypto/twofish-avx2-asm_64.S +++ /dev/null @@ -1,600 +0,0 @@ -/* - * x86_64/AVX2 assembler optimized version of Twofish - * - * Copyright © 2012-2013 Jussi Kivilinna - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#include -#include "glue_helper-asm-avx2.S" - -.file "twofish-avx2-asm_64.S" - -.data -.align 16 - -.Lvpshufb_mask0: -.long 0x80808000 -.long 0x80808004 -.long 0x80808008 -.long 0x8080800c - -.Lbswap128_mask: - .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.Lxts_gf128mul_and_shl1_mask_0: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 -.Lxts_gf128mul_and_shl1_mask_1: - .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 - -.text - -/* structure of crypto context */ -#define s0 0 -#define s1 1024 -#define s2 2048 -#define s3 3072 -#define w 4096 -#define k 4128 - -/* register macros */ -#define CTX %rdi - -#define RS0 CTX -#define RS1 %r8 -#define RS2 %r9 -#define RS3 %r10 -#define RK %r11 -#define RW %rax -#define RROUND %r12 -#define RROUNDd %r12d - -#define RA0 %ymm8 -#define RB0 %ymm9 -#define RC0 %ymm10 -#define RD0 %ymm11 -#define RA1 %ymm12 -#define RB1 %ymm13 -#define RC1 %ymm14 -#define RD1 %ymm15 - -/* temp regs */ -#define RX0 %ymm0 -#define RY0 %ymm1 -#define RX1 %ymm2 -#define RY1 %ymm3 -#define RT0 %ymm4 -#define RIDX %ymm5 - -#define RX0x %xmm0 -#define RY0x %xmm1 -#define RX1x %xmm2 -#define RY1x %xmm3 -#define RT0x %xmm4 - -/* vpgatherdd mask and '-1' */ -#define RNOT %ymm6 - -/* byte mask, (-1 >> 24) */ -#define RBYTE %ymm7 - -/********************************************************************** - 16-way AVX2 twofish - **********************************************************************/ -#define init_round_constants() \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpsrld $24, RNOT, RBYTE; \ - leaq k(CTX), RK; \ - leaq w(CTX), RW; \ - leaq s1(CTX), RS1; \ - leaq s2(CTX), RS2; \ - leaq s3(CTX), RS3; \ - -#define g16(ab, rs0, rs1, rs2, rs3, xy) \ - vpand RBYTE, ab ## 0, RIDX; \ - vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - \ - vpand RBYTE, ab ## 1, RIDX; \ - vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - \ - vpsrld $8, ab ## 0, RIDX; \ - vpand RBYTE, RIDX, RIDX; \ - vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 0, xy ## 0; \ - \ - vpsrld $8, ab ## 1, RIDX; \ - vpand RBYTE, RIDX, RIDX; \ - vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 1, xy ## 1; \ - \ - vpsrld $16, ab ## 0, RIDX; \ - vpand RBYTE, RIDX, RIDX; \ - vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 0, xy ## 0; \ - \ - vpsrld $16, ab ## 1, RIDX; \ - vpand RBYTE, RIDX, RIDX; \ - vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 1, xy ## 1; \ - \ - vpsrld $24, ab ## 0, RIDX; \ - vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 0, xy ## 0; \ - \ - vpsrld $24, ab ## 1, RIDX; \ - vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ - vpcmpeqd RNOT, RNOT, RNOT; \ - vpxor RT0, xy ## 1, xy ## 1; - -#define g1_16(a, x) \ - g16(a, RS0, RS1, RS2, RS3, x); - -#define g2_16(b, y) \ - g16(b, RS1, RS2, RS3, RS0, y); - -#define encrypt_round_end16(a, b, c, d, nk) \ - vpaddd RY0, RX0, RX0; \ - vpaddd RX0, RY0, RY0; \ - vpbroadcastd nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RX0, RX0; \ - vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RY0, RY0; \ - \ - vpxor RY0, d ## 0, d ## 0; \ - \ - vpxor RX0, c ## 0, c ## 0; \ - vpsrld $1, c ## 0, RT0; \ - vpslld $31, c ## 0, c ## 0; \ - vpor RT0, c ## 0, c ## 0; \ - \ - vpaddd RY1, RX1, RX1; \ - vpaddd RX1, RY1, RY1; \ - vpbroadcastd nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RX1, RX1; \ - vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RY1, RY1; \ - \ - vpxor RY1, d ## 1, d ## 1; \ - \ - vpxor RX1, c ## 1, c ## 1; \ - vpsrld $1, c ## 1, RT0; \ - vpslld $31, c ## 1, c ## 1; \ - vpor RT0, c ## 1, c ## 1; \ - -#define encrypt_round16(a, b, c, d, nk) \ - g2_16(b, RY); \ - \ - vpslld $1, b ## 0, RT0; \ - vpsrld $31, b ## 0, b ## 0; \ - vpor RT0, b ## 0, b ## 0; \ - \ - vpslld $1, b ## 1, RT0; \ - vpsrld $31, b ## 1, b ## 1; \ - vpor RT0, b ## 1, b ## 1; \ - \ - g1_16(a, RX); \ - \ - encrypt_round_end16(a, b, c, d, nk); - -#define encrypt_round_first16(a, b, c, d, nk) \ - vpslld $1, d ## 0, RT0; \ - vpsrld $31, d ## 0, d ## 0; \ - vpor RT0, d ## 0, d ## 0; \ - \ - vpslld $1, d ## 1, RT0; \ - vpsrld $31, d ## 1, d ## 1; \ - vpor RT0, d ## 1, d ## 1; \ - \ - encrypt_round16(a, b, c, d, nk); - -#define encrypt_round_last16(a, b, c, d, nk) \ - g2_16(b, RY); \ - \ - g1_16(a, RX); \ - \ - encrypt_round_end16(a, b, c, d, nk); - -#define decrypt_round_end16(a, b, c, d, nk) \ - vpaddd RY0, RX0, RX0; \ - vpaddd RX0, RY0, RY0; \ - vpbroadcastd nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RX0, RX0; \ - vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RY0, RY0; \ - \ - vpxor RX0, c ## 0, c ## 0; \ - \ - vpxor RY0, d ## 0, d ## 0; \ - vpsrld $1, d ## 0, RT0; \ - vpslld $31, d ## 0, d ## 0; \ - vpor RT0, d ## 0, d ## 0; \ - \ - vpaddd RY1, RX1, RX1; \ - vpaddd RX1, RY1, RY1; \ - vpbroadcastd nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RX1, RX1; \ - vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ - vpaddd RT0, RY1, RY1; \ - \ - vpxor RX1, c ## 1, c ## 1; \ - \ - vpxor RY1, d ## 1, d ## 1; \ - vpsrld $1, d ## 1, RT0; \ - vpslld $31, d ## 1, d ## 1; \ - vpor RT0, d ## 1, d ## 1; - -#define decrypt_round16(a, b, c, d, nk) \ - g1_16(a, RX); \ - \ - vpslld $1, a ## 0, RT0; \ - vpsrld $31, a ## 0, a ## 0; \ - vpor RT0, a ## 0, a ## 0; \ - \ - vpslld $1, a ## 1, RT0; \ - vpsrld $31, a ## 1, a ## 1; \ - vpor RT0, a ## 1, a ## 1; \ - \ - g2_16(b, RY); \ - \ - decrypt_round_end16(a, b, c, d, nk); - -#define decrypt_round_first16(a, b, c, d, nk) \ - vpslld $1, c ## 0, RT0; \ - vpsrld $31, c ## 0, c ## 0; \ - vpor RT0, c ## 0, c ## 0; \ - \ - vpslld $1, c ## 1, RT0; \ - vpsrld $31, c ## 1, c ## 1; \ - vpor RT0, c ## 1, c ## 1; \ - \ - decrypt_round16(a, b, c, d, nk) - -#define decrypt_round_last16(a, b, c, d, nk) \ - g1_16(a, RX); \ - \ - g2_16(b, RY); \ - \ - decrypt_round_end16(a, b, c, d, nk); - -#define encrypt_cycle16() \ - encrypt_round16(RA, RB, RC, RD, 0); \ - encrypt_round16(RC, RD, RA, RB, 8); - -#define encrypt_cycle_first16() \ - encrypt_round_first16(RA, RB, RC, RD, 0); \ - encrypt_round16(RC, RD, RA, RB, 8); - -#define encrypt_cycle_last16() \ - encrypt_round16(RA, RB, RC, RD, 0); \ - encrypt_round_last16(RC, RD, RA, RB, 8); - -#define decrypt_cycle16(n) \ - decrypt_round16(RC, RD, RA, RB, 8); \ - decrypt_round16(RA, RB, RC, RD, 0); - -#define decrypt_cycle_first16(n) \ - decrypt_round_first16(RC, RD, RA, RB, 8); \ - decrypt_round16(RA, RB, RC, RD, 0); - -#define decrypt_cycle_last16(n) \ - decrypt_round16(RC, RD, RA, RB, 8); \ - decrypt_round_last16(RA, RB, RC, RD, 0); - -#define transpose_4x4(x0,x1,x2,x3,t1,t2) \ - vpunpckhdq x1, x0, t2; \ - vpunpckldq x1, x0, x0; \ - \ - vpunpckldq x3, x2, t1; \ - vpunpckhdq x3, x2, x2; \ - \ - vpunpckhqdq t1, x0, x1; \ - vpunpcklqdq t1, x0, x0; \ - \ - vpunpckhqdq x2, t2, x3; \ - vpunpcklqdq x2, t2, x2; - -#define read_blocks8(offs,a,b,c,d) \ - transpose_4x4(a, b, c, d, RX0, RY0); - -#define write_blocks8(offs,a,b,c,d) \ - transpose_4x4(a, b, c, d, RX0, RY0); - -#define inpack_enc8(a,b,c,d) \ - vpbroadcastd 4*0(RW), RT0; \ - vpxor RT0, a, a; \ - \ - vpbroadcastd 4*1(RW), RT0; \ - vpxor RT0, b, b; \ - \ - vpbroadcastd 4*2(RW), RT0; \ - vpxor RT0, c, c; \ - \ - vpbroadcastd 4*3(RW), RT0; \ - vpxor RT0, d, d; - -#define outunpack_enc8(a,b,c,d) \ - vpbroadcastd 4*4(RW), RX0; \ - vpbroadcastd 4*5(RW), RY0; \ - vpxor RX0, c, RX0; \ - vpxor RY0, d, RY0; \ - \ - vpbroadcastd 4*6(RW), RT0; \ - vpxor RT0, a, c; \ - vpbroadcastd 4*7(RW), RT0; \ - vpxor RT0, b, d; \ - \ - vmovdqa RX0, a; \ - vmovdqa RY0, b; - -#define inpack_dec8(a,b,c,d) \ - vpbroadcastd 4*4(RW), RX0; \ - vpbroadcastd 4*5(RW), RY0; \ - vpxor RX0, a, RX0; \ - vpxor RY0, b, RY0; \ - \ - vpbroadcastd 4*6(RW), RT0; \ - vpxor RT0, c, a; \ - vpbroadcastd 4*7(RW), RT0; \ - vpxor RT0, d, b; \ - \ - vmovdqa RX0, c; \ - vmovdqa RY0, d; - -#define outunpack_dec8(a,b,c,d) \ - vpbroadcastd 4*0(RW), RT0; \ - vpxor RT0, a, a; \ - \ - vpbroadcastd 4*1(RW), RT0; \ - vpxor RT0, b, b; \ - \ - vpbroadcastd 4*2(RW), RT0; \ - vpxor RT0, c, c; \ - \ - vpbroadcastd 4*3(RW), RT0; \ - vpxor RT0, d, d; - -#define read_blocks16(a,b,c,d) \ - read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ - read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); - -#define write_blocks16(a,b,c,d) \ - write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ - write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); - -#define xor_blocks16(a,b,c,d) \ - xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ - xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); - -#define inpack_enc16(a,b,c,d) \ - inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ - inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); - -#define outunpack_enc16(a,b,c,d) \ - outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ - outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); - -#define inpack_dec16(a,b,c,d) \ - inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ - inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); - -#define outunpack_dec16(a,b,c,d) \ - outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ - outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); - -.align 8 -__twofish_enc_blk16: - /* input: - * %rdi: ctx, CTX - * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext - * output: - * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext - */ - init_round_constants(); - - read_blocks16(RA, RB, RC, RD); - inpack_enc16(RA, RB, RC, RD); - - xorl RROUNDd, RROUNDd; - encrypt_cycle_first16(); - movl $2, RROUNDd; - -.align 4 -.L__enc_loop: - encrypt_cycle16(); - - addl $2, RROUNDd; - cmpl $14, RROUNDd; - jne .L__enc_loop; - - encrypt_cycle_last16(); - - outunpack_enc16(RA, RB, RC, RD); - write_blocks16(RA, RB, RC, RD); - - ret; -ENDPROC(__twofish_enc_blk16) - -.align 8 -__twofish_dec_blk16: - /* input: - * %rdi: ctx, CTX - * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext - * output: - * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext - */ - init_round_constants(); - - read_blocks16(RA, RB, RC, RD); - inpack_dec16(RA, RB, RC, RD); - - movl $14, RROUNDd; - decrypt_cycle_first16(); - movl $12, RROUNDd; - -.align 4 -.L__dec_loop: - decrypt_cycle16(); - - addl $-2, RROUNDd; - jnz .L__dec_loop; - - decrypt_cycle_last16(); - - outunpack_dec16(RA, RB, RC, RD); - write_blocks16(RA, RB, RC, RD); - - ret; -ENDPROC(__twofish_dec_blk16) - -ENTRY(twofish_ecb_enc_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - pushq %r12; - - load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - call __twofish_enc_blk16; - - store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - popq %r12; - vzeroupper; - - ret; -ENDPROC(twofish_ecb_enc_16way) - -ENTRY(twofish_ecb_dec_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - pushq %r12; - - load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - call __twofish_dec_blk16; - - store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - popq %r12; - vzeroupper; - - ret; -ENDPROC(twofish_ecb_dec_16way) - -ENTRY(twofish_cbc_dec_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - */ - - vzeroupper; - pushq %r12; - - load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - call __twofish_dec_blk16; - - store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1, - RX0); - - popq %r12; - vzeroupper; - - ret; -ENDPROC(twofish_cbc_dec_16way) - -ENTRY(twofish_ctr_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (little endian, 128bit) - */ - - vzeroupper; - pushq %r12; - - load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1, - RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT, - RBYTE); - - call __twofish_enc_blk16; - - store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - popq %r12; - vzeroupper; - - ret; -ENDPROC(twofish_ctr_16way) - -.align 8 -twofish_xts_crypt_16way: - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - * %r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16 - */ - - vzeroupper; - pushq %r12; - - load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, - RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT, - .Lxts_gf128mul_and_shl1_mask_0, - .Lxts_gf128mul_and_shl1_mask_1); - - call *%r8; - - store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); - - popq %r12; - vzeroupper; - - ret; -ENDPROC(twofish_xts_crypt_16way) - -ENTRY(twofish_xts_enc_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - leaq __twofish_enc_blk16, %r8; - jmp twofish_xts_crypt_16way; -ENDPROC(twofish_xts_enc_16way) - -ENTRY(twofish_xts_dec_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - leaq __twofish_dec_blk16, %r8; - jmp twofish_xts_crypt_16way; -ENDPROC(twofish_xts_dec_16way) diff --git a/arch/x86/crypto/twofish_avx2_glue.c b/arch/x86/crypto/twofish_avx2_glue.c deleted file mode 100644 index ce33b5be64ee..000000000000 --- a/arch/x86/crypto/twofish_avx2_glue.c +++ /dev/null @@ -1,584 +0,0 @@ -/* - * Glue Code for x86_64/AVX2 assembler optimized version of Twofish - * - * Copyright © 2012-2013 Jussi Kivilinna - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define TF_AVX2_PARALLEL_BLOCKS 16 - -/* 16-way AVX2 parallel cipher functions */ -asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src); - -asmlinkage void twofish_ctr_16way(void *ctx, u128 *dst, const u128 *src, - le128 *iv); - -asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); - -static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src) -{ - __twofish_enc_blk_3way(ctx, dst, src, false); -} - -static const struct common_glue_ctx twofish_enc = { - .num_funcs = 4, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_16way) } - }, { - .num_blocks = 8, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) } - }, { - .num_blocks = 3, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } - }, { - .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } - } } -}; - -static const struct common_glue_ctx twofish_ctr = { - .num_funcs = 4, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_16way) } - }, { - .num_blocks = 8, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) } - }, { - .num_blocks = 3, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } - }, { - .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) } - } } -}; - -static const struct common_glue_ctx twofish_enc_xts = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_16way) } - }, { - .num_blocks = 8, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) } - }, { - .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) } - } } -}; - -static const struct common_glue_ctx twofish_dec = { - .num_funcs = 4, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_16way) } - }, { - .num_blocks = 8, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) } - }, { - .num_blocks = 3, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } - }, { - .num_blocks = 1, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } - } } -}; - -static const struct common_glue_ctx twofish_dec_cbc = { - .num_funcs = 4, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_16way) } - }, { - .num_blocks = 8, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) } - }, { - .num_blocks = 3, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } - }, { - .num_blocks = 1, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } - } } -}; - -static const struct common_glue_ctx twofish_dec_xts = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_16way) } - }, { - .num_blocks = 8, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) } - }, { - .num_blocks = 1, - .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) } - } } -}; - -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes); -} - -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes); -} - -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc, - dst, src, nbytes); -} - -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src, - nbytes); -} - -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes); -} - -static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes) -{ - /* since reusing AVX functions, starts using FPU at 8 parallel blocks */ - return glue_fpu_begin(TF_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes); -} - -static inline void twofish_fpu_end(bool fpu_enabled) -{ - glue_fpu_end(fpu_enabled); -} - -struct crypt_priv { - struct twofish_ctx *ctx; - bool fpu_enabled; -}; - -static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) -{ - const unsigned int bsize = TF_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); - - while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) { - twofish_ecb_enc_16way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS; - nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS; - } - - while (nbytes >= 8 * bsize) { - twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * 8; - nbytes -= bsize * 8; - } - - while (nbytes >= 3 * bsize) { - twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * 3; - nbytes -= bsize * 3; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - twofish_enc_blk(ctx->ctx, srcdst, srcdst); -} - -static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) -{ - const unsigned int bsize = TF_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); - - while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) { - twofish_ecb_dec_16way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS; - nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS; - } - - while (nbytes >= 8 * bsize) { - twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * 8; - nbytes -= bsize * 8; - } - - while (nbytes >= 3 * bsize) { - twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * 3; - nbytes -= bsize * 3; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - twofish_dec_blk(ctx->ctx, srcdst, srcdst); -} - -static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[TF_AVX2_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->twofish_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - twofish_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[TF_AVX2_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->twofish_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = decrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - twofish_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - - return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(twofish_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); -} - -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - - return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(twofish_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); -} - -static struct crypto_alg tf_algs[10] = { { - .cra_name = "__ecb-twofish-avx2", - .cra_driver_name = "__driver-ecb-twofish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .setkey = twofish_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "__cbc-twofish-avx2", - .cra_driver_name = "__driver-cbc-twofish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .setkey = twofish_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "__ctr-twofish-avx2", - .cra_driver_name = "__driver-ctr-twofish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = twofish_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}, { - .cra_name = "__lrw-twofish-avx2", - .cra_driver_name = "__driver-lrw-twofish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_lrw_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_exit = lrw_twofish_exit_tfm, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE + - TF_BLOCK_SIZE, - .max_keysize = TF_MAX_KEY_SIZE + - TF_BLOCK_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = lrw_twofish_setkey, - .encrypt = lrw_encrypt, - .decrypt = lrw_decrypt, - }, - }, -}, { - .cra_name = "__xts-twofish-avx2", - .cra_driver_name = "__driver-xts-twofish-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_xts_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE * 2, - .max_keysize = TF_MAX_KEY_SIZE * 2, - .ivsize = TF_BLOCK_SIZE, - .setkey = xts_twofish_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, - }, -}, { - .cra_name = "ecb(twofish)", - .cra_driver_name = "ecb-twofish-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "cbc(twofish)", - .cra_driver_name = "cbc-twofish-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = __ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "ctr(twofish)", - .cra_driver_name = "ctr-twofish-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_encrypt, - .geniv = "chainiv", - }, - }, -}, { - .cra_name = "lrw(twofish)", - .cra_driver_name = "lrw-twofish-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE + - TF_BLOCK_SIZE, - .max_keysize = TF_MAX_KEY_SIZE + - TF_BLOCK_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "xts(twofish)", - .cra_driver_name = "xts-twofish-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE * 2, - .max_keysize = TF_MAX_KEY_SIZE * 2, - .ivsize = TF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -} }; - -static int __init init(void) -{ - u64 xcr0; - - if (!cpu_has_avx2 || !cpu_has_osxsave) { - pr_info("AVX2 instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX2 detected but unusable.\n"); - return -ENODEV; - } - - return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs)); -} - -static void __exit fini(void) -{ - crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs)); -} - -module_init(init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX2 optimized"); -MODULE_ALIAS("twofish"); -MODULE_ALIAS("twofish-asm"); diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 2047a562f6b3..a62ba541884e 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -50,26 +50,18 @@ /* 8-way parallel cipher functions */ asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src); -EXPORT_SYMBOL_GPL(twofish_ecb_enc_8way); - asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src); -EXPORT_SYMBOL_GPL(twofish_ecb_dec_8way); asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src); -EXPORT_SYMBOL_GPL(twofish_cbc_dec_8way); - asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); -EXPORT_SYMBOL_GPL(twofish_ctr_8way); asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); -EXPORT_SYMBOL_GPL(twofish_xts_enc_8way); asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); -EXPORT_SYMBOL_GPL(twofish_xts_dec_8way); static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, const u8 *src) @@ -77,19 +69,17 @@ static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, __twofish_enc_blk_3way(ctx, dst, src, false); } -void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) { glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(twofish_enc_blk)); } -EXPORT_SYMBOL_GPL(twofish_xts_enc); -void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +static void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) { glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(twofish_dec_blk)); } -EXPORT_SYMBOL_GPL(twofish_xts_dec); static const struct common_glue_ctx twofish_enc = { diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h index e655c6029b45..878c51ceebb5 100644 --- a/arch/x86/include/asm/crypto/twofish.h +++ b/arch/x86/include/asm/crypto/twofish.h @@ -28,20 +28,6 @@ asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, const u8 *src); -/* 8-way parallel cipher functions */ -asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); -asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, le128 *iv); - /* helpers from twofish_x86_64-3way module */ extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, @@ -57,8 +43,4 @@ extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm); extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen); -/* helpers from twofish-avx module */ -extern void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); -extern void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); - #endif /* ASM_X86_TWOFISH_H */ diff --git a/crypto/Kconfig b/crypto/Kconfig index 4ef0ee715171..904ffe838567 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1297,30 +1297,6 @@ config CRYPTO_TWOFISH_AVX_X86_64 See also: -config CRYPTO_TWOFISH_AVX2_X86_64 - tristate "Twofish cipher algorithm (x86_64/AVX2)" - depends on X86 && 64BIT - select CRYPTO_ALGAPI - select CRYPTO_CRYPTD - select CRYPTO_ABLK_HELPER_X86 - select CRYPTO_GLUE_HELPER_X86 - select CRYPTO_TWOFISH_COMMON - select CRYPTO_TWOFISH_X86_64 - select CRYPTO_TWOFISH_X86_64_3WAY - select CRYPTO_TWOFISH_AVX_X86_64 - select CRYPTO_LRW - select CRYPTO_XTS - help - Twofish cipher algorithm (x86_64/AVX2). - - Twofish was submitted as an AES (Advanced Encryption Standard) - candidate cipher by researchers at CounterPane Systems. It is a - 16 round block cipher supporting key sizes of 128, 192, and 256 - bits. - - See also: - - comment "Compression" config CRYPTO_DEFLATE diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 27f111876523..b2bc5334c170 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1653,9 +1653,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "__cbc-twofish-avx", .test = alg_test_null, - }, { - .alg = "__cbc-twofish-avx2", - .test = alg_test_null, }, { .alg = "__driver-cbc-aes-aesni", .test = alg_test_null, @@ -1684,9 +1681,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "__driver-cbc-twofish-avx", .test = alg_test_null, - }, { - .alg = "__driver-cbc-twofish-avx2", - .test = alg_test_null, }, { .alg = "__driver-ecb-aes-aesni", .test = alg_test_null, @@ -1715,9 +1709,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "__driver-ecb-twofish-avx", .test = alg_test_null, - }, { - .alg = "__driver-ecb-twofish-avx2", - .test = alg_test_null, }, { .alg = "__ghash-pclmulqdqni", .test = alg_test_null, @@ -2018,9 +2009,6 @@ static const struct alg_test_desc alg_test_descs[] = { }, { .alg = "cryptd(__driver-ecb-twofish-avx)", .test = alg_test_null, - }, { - .alg = "cryptd(__driver-ecb-twofish-avx2)", - .test = alg_test_null, }, { .alg = "cryptd(__driver-gcm-aes-aesni)", .test = alg_test_null, -- cgit v1.2.3-59-g8ed1b From 5714758b5c23dcca8d29a43590397e58d245732f Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Thu, 13 Jun 2013 17:37:40 +0300 Subject: crypto: testmgr - check that entries in alg_test_descs are in correct order Patch adds check for alg_test_descs list order, so that accidentically misplaced entries are found quicker. Duplicate entries are also checked for. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- crypto/testmgr.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index b2bc5334c170..a81c154e5d85 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -3054,6 +3054,35 @@ static const struct alg_test_desc alg_test_descs[] = { } }; +static bool alg_test_descs_checked; + +static void alg_test_descs_check_order(void) +{ + int i; + + /* only check once */ + if (alg_test_descs_checked) + return; + + alg_test_descs_checked = true; + + for (i = 1; i < ARRAY_SIZE(alg_test_descs); i++) { + int diff = strcmp(alg_test_descs[i - 1].alg, + alg_test_descs[i].alg); + + if (WARN_ON(diff > 0)) { + pr_warn("testmgr: alg_test_descs entries in wrong order: '%s' before '%s'\n", + alg_test_descs[i - 1].alg, + alg_test_descs[i].alg); + } + + if (WARN_ON(diff == 0)) { + pr_warn("testmgr: duplicate alg_test_descs entry: '%s'\n", + alg_test_descs[i].alg); + } + } +} + static int alg_find_test(const char *alg) { int start = 0; @@ -3085,6 +3114,8 @@ int alg_test(const char *driver, const char *alg, u32 type, u32 mask) int j; int rc; + alg_test_descs_check_order(); + if ((type & CRYPTO_ALG_TYPE_MASK) == CRYPTO_ALG_TYPE_CIPHER) { char nalg[CRYPTO_MAX_ALG_NAME]; -- cgit v1.2.3-59-g8ed1b From 3a338f20c3c5c33c45ab1c36c8eccd62289c6401 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Thu, 13 Jun 2013 17:37:45 +0300 Subject: crypto: testmgr - test skciphers with unaligned buffers This patch adds unaligned buffer tests for blkciphers. The first new test is with one byte offset and the second test checks if cra_alignmask for driver is big enough; for example, for testing a case where cra_alignmask is set to 7, but driver really needs buffers to be aligned to 16 bytes. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- crypto/testmgr.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index a81c154e5d85..8bd185f068b6 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -820,7 +820,7 @@ out_nobuf: static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc, struct cipher_testvec *template, unsigned int tcount, - const bool diff_dst) + const bool diff_dst, const int align_offset) { const char *algo = crypto_tfm_alg_driver_name(crypto_ablkcipher_tfm(tfm)); @@ -876,10 +876,12 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc, j++; ret = -EINVAL; - if (WARN_ON(template[i].ilen > PAGE_SIZE)) + if (WARN_ON(align_offset + template[i].ilen > + PAGE_SIZE)) goto out; data = xbuf[0]; + data += align_offset; memcpy(data, template[i].input, template[i].ilen); crypto_ablkcipher_clear_flags(tfm, ~0); @@ -900,6 +902,7 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc, sg_init_one(&sg[0], data, template[i].ilen); if (diff_dst) { data = xoutbuf[0]; + data += align_offset; sg_init_one(&sgout[0], data, template[i].ilen); } @@ -941,6 +944,9 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc, j = 0; for (i = 0; i < tcount; i++) { + /* alignment tests are only done with continuous buffers */ + if (align_offset != 0) + break; if (template[i].iv) memcpy(iv, template[i].iv, MAX_IVLEN); @@ -1075,15 +1081,34 @@ out_nobuf: static int test_skcipher(struct crypto_ablkcipher *tfm, int enc, struct cipher_testvec *template, unsigned int tcount) { + unsigned int alignmask; int ret; /* test 'dst == src' case */ - ret = __test_skcipher(tfm, enc, template, tcount, false); + ret = __test_skcipher(tfm, enc, template, tcount, false, 0); if (ret) return ret; /* test 'dst != src' case */ - return __test_skcipher(tfm, enc, template, tcount, true); + ret = __test_skcipher(tfm, enc, template, tcount, true, 0); + if (ret) + return ret; + + /* test unaligned buffers, check with one byte offset */ + ret = __test_skcipher(tfm, enc, template, tcount, true, 1); + if (ret) + return ret; + + alignmask = crypto_tfm_alg_alignmask(&tfm->base); + if (alignmask) { + /* Check if alignment mask for tfm is correctly set. */ + ret = __test_skcipher(tfm, enc, template, tcount, true, + alignmask + 1); + if (ret) + return ret; + } + + return 0; } static int test_comp(struct crypto_comp *tfm, struct comp_testvec *ctemplate, -- cgit v1.2.3-59-g8ed1b From 58dcf5484c0cbaddbba1d3ed074729f5078346bb Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Thu, 13 Jun 2013 17:37:50 +0300 Subject: crypto: testmgr - test AEADs with unaligned buffers This patch adds unaligned buffer tests for AEADs. The first new test is with one byte offset and the second test checks if cra_alignmask for driver is big enough; for example, for testing a case where cra_alignmask is set to 7, but driver really needs buffers to be aligned to 16 bytes. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- crypto/testmgr.c | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 8bd185f068b6..f20538616776 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -360,7 +360,7 @@ out_nobuf: static int __test_aead(struct crypto_aead *tfm, int enc, struct aead_testvec *template, unsigned int tcount, - const bool diff_dst) + const bool diff_dst, const int align_offset) { const char *algo = crypto_tfm_alg_driver_name(crypto_aead_tfm(tfm)); unsigned int i, j, k, n, temp; @@ -423,15 +423,16 @@ static int __test_aead(struct crypto_aead *tfm, int enc, if (!template[i].np) { j++; - /* some tepmplates have no input data but they will + /* some templates have no input data but they will * touch input */ input = xbuf[0]; + input += align_offset; assoc = axbuf[0]; ret = -EINVAL; - if (WARN_ON(template[i].ilen > PAGE_SIZE || - template[i].alen > PAGE_SIZE)) + if (WARN_ON(align_offset + template[i].ilen > + PAGE_SIZE || template[i].alen > PAGE_SIZE)) goto out; memcpy(input, template[i].input, template[i].ilen); @@ -470,6 +471,7 @@ static int __test_aead(struct crypto_aead *tfm, int enc, if (diff_dst) { output = xoutbuf[0]; + output += align_offset; sg_init_one(&sgout[0], output, template[i].ilen + (enc ? authsize : 0)); @@ -530,6 +532,10 @@ static int __test_aead(struct crypto_aead *tfm, int enc, } for (i = 0, j = 0; i < tcount; i++) { + /* alignment tests are only done with continuous buffers */ + if (align_offset != 0) + break; + if (template[i].np) { j++; @@ -732,15 +738,34 @@ out_noxbuf: static int test_aead(struct crypto_aead *tfm, int enc, struct aead_testvec *template, unsigned int tcount) { + unsigned int alignmask; int ret; /* test 'dst == src' case */ - ret = __test_aead(tfm, enc, template, tcount, false); + ret = __test_aead(tfm, enc, template, tcount, false, 0); if (ret) return ret; /* test 'dst != src' case */ - return __test_aead(tfm, enc, template, tcount, true); + ret = __test_aead(tfm, enc, template, tcount, true, 0); + if (ret) + return ret; + + /* test unaligned buffers, check with one byte offset */ + ret = __test_aead(tfm, enc, template, tcount, true, 1); + if (ret) + return ret; + + alignmask = crypto_tfm_alg_alignmask(&tfm->base); + if (alignmask) { + /* Check if alignment mask for tfm is correctly set. */ + ret = __test_aead(tfm, enc, template, tcount, true, + alignmask + 1); + if (ret) + return ret; + } + + return 0; } static int test_cipher(struct crypto_cipher *tfm, int enc, -- cgit v1.2.3-59-g8ed1b From da5ffe11342a0ecf2cce7000a9392c9ca959e9c8 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Thu, 13 Jun 2013 17:37:55 +0300 Subject: crypto: testmgr - test hash implementations with unaligned buffers This patch adds unaligned buffer tests for hashes. The first new test is with one byte offset and the second test checks if cra_alignmask for driver is big enough; for example, for testing a case where cra_alignmask is set to 7, but driver really needs buffers to be aligned to 16 bytes. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- crypto/testmgr.c | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index f20538616776..2f00607039e2 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -184,8 +184,9 @@ static int do_one_async_hash_op(struct ahash_request *req, return ret; } -static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, - unsigned int tcount, bool use_digest) +static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, + unsigned int tcount, bool use_digest, + const int align_offset) { const char *algo = crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm)); unsigned int i, j, k, temp; @@ -216,10 +217,15 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, if (template[i].np) continue; + ret = -EINVAL; + if (WARN_ON(align_offset + template[i].psize > PAGE_SIZE)) + goto out; + j++; memset(result, 0, 64); hash_buff = xbuf[0]; + hash_buff += align_offset; memcpy(hash_buff, template[i].plaintext, template[i].psize); sg_init_one(&sg[0], hash_buff, template[i].psize); @@ -281,6 +287,10 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, j = 0; for (i = 0; i < tcount; i++) { + /* alignment tests are only done with continuous buffers */ + if (align_offset != 0) + break; + if (template[i].np) { j++; memset(result, 0, 64); @@ -358,6 +368,33 @@ out_nobuf: return ret; } +static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, + unsigned int tcount, bool use_digest) +{ + unsigned int alignmask; + int ret; + + ret = __test_hash(tfm, template, tcount, use_digest, 0); + if (ret) + return ret; + + /* test unaligned buffers, check with one byte offset */ + ret = __test_hash(tfm, template, tcount, use_digest, 1); + if (ret) + return ret; + + alignmask = crypto_tfm_alg_alignmask(&tfm->base); + if (alignmask) { + /* Check if alignment mask for tfm is correctly set. */ + ret = __test_hash(tfm, template, tcount, use_digest, + alignmask + 1); + if (ret) + return ret; + } + + return 0; +} + static int __test_aead(struct crypto_aead *tfm, int enc, struct aead_testvec *template, unsigned int tcount, const bool diff_dst, const int align_offset) -- cgit v1.2.3-59-g8ed1b