From 12387a46bb150f5608de4aa9a90dfdddbf991e3f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 10 Mar 2010 18:28:55 +0800 Subject: crypto: aesni-intel - Add AES-NI accelerated CTR mode To take advantage of the hardware pipeline implementation of AES-NI instructions. CTR mode cryption is implemented in ASM to schedule multiple AES-NI instructions one after another. This way, some latency of AES-NI instruction can be eliminated. Performance testing based on dm-crypt should 50% reduction of ecryption/decryption time. Signed-off-by: Huang Ying Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_asm.S | 115 ++++++++++++++++++++++++++++++++ arch/x86/crypto/aesni-intel_glue.c | 130 +++++++++++++++++++++++++++++++++++-- 2 files changed, 238 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 20bb0e1ac681..822846a9eba9 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -32,6 +32,9 @@ #define IN IN1 #define KEY %xmm2 #define IV %xmm3 +#define BSWAP_MASK %xmm10 +#define CTR %xmm11 +#define INC %xmm12 #define KEYP %rdi #define OUTP %rsi @@ -42,6 +45,7 @@ #define T1 %r10 #define TKEYP T1 #define T2 %r11 +#define TCTR_LOW T2 _key_expansion_128: _key_expansion_256a: @@ -724,3 +728,114 @@ ENTRY(aesni_cbc_dec) movups IV, (IVP) .Lcbc_dec_just_ret: ret + +.align 16 +.Lbswap_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +/* + * _aesni_inc_init: internal ABI + * setup registers used by _aesni_inc + * input: + * IV + * output: + * CTR: == IV, in little endian + * TCTR_LOW: == lower qword of CTR + * INC: == 1, in little endian + * BSWAP_MASK == endian swapping mask + */ +_aesni_inc_init: + movaps .Lbswap_mask, BSWAP_MASK + movaps IV, CTR + PSHUFB_XMM BSWAP_MASK CTR + mov $1, TCTR_LOW + movq TCTR_LOW, INC + movq CTR, TCTR_LOW + ret + +/* + * _aesni_inc: internal ABI + * Increase IV by 1, IV is in big endian + * input: + * IV + * CTR: == IV, in little endian + * TCTR_LOW: == lower qword of CTR + * INC: == 1, in little endian + * BSWAP_MASK == endian swapping mask + * output: + * IV: Increase by 1 + * changed: + * CTR: == output IV, in little endian + * TCTR_LOW: == lower qword of CTR + */ +_aesni_inc: + paddq INC, CTR + add $1, TCTR_LOW + jnc .Linc_low + pslldq $8, INC + paddq INC, CTR + psrldq $8, INC +.Linc_low: + movaps CTR, IV + PSHUFB_XMM BSWAP_MASK IV + ret + +/* + * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, + * size_t len, u8 *iv) + */ +ENTRY(aesni_ctr_enc) + cmp $16, LEN + jb .Lctr_enc_just_ret + mov 480(KEYP), KLEN + movups (IVP), IV + call _aesni_inc_init + cmp $64, LEN + jb .Lctr_enc_loop1 +.align 4 +.Lctr_enc_loop4: + movaps IV, STATE1 + call _aesni_inc + movups (INP), IN1 + movaps IV, STATE2 + call _aesni_inc + movups 0x10(INP), IN2 + movaps IV, STATE3 + call _aesni_inc + movups 0x20(INP), IN3 + movaps IV, STATE4 + call _aesni_inc + movups 0x30(INP), IN4 + call _aesni_enc4 + pxor IN1, STATE1 + movups STATE1, (OUTP) + pxor IN2, STATE2 + movups STATE2, 0x10(OUTP) + pxor IN3, STATE3 + movups STATE3, 0x20(OUTP) + pxor IN4, STATE4 + movups STATE4, 0x30(OUTP) + sub $64, LEN + add $64, INP + add $64, OUTP + cmp $64, LEN + jge .Lctr_enc_loop4 + cmp $16, LEN + jb .Lctr_enc_ret +.align 4 +.Lctr_enc_loop1: + movaps IV, STATE + call _aesni_inc + movups (INP), IN + call _aesni_enc1 + pxor IN, STATE + movups STATE, (OUTP) + sub $16, LEN + add $16, INP + add $16, OUTP + cmp $16, LEN + jge .Lctr_enc_loop1 +.Lctr_enc_ret: + movups IV, (IVP) +.Lctr_enc_just_ret: + ret diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 49c552c060e9..2cb3dcc4490a 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -58,6 +59,8 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); +asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) { @@ -321,6 +324,72 @@ static struct crypto_alg blk_cbc_alg = { }, }; +static void ctr_crypt_final(struct crypto_aes_ctx *ctx, + struct blkcipher_walk *walk) +{ + u8 *ctrblk = walk->iv; + u8 keystream[AES_BLOCK_SIZE]; + u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + + aesni_enc(ctx, keystream, ctrblk); + crypto_xor(keystream, src, nbytes); + memcpy(dst, keystream, nbytes); + crypto_inc(ctrblk, AES_BLOCK_SIZE); +} + +static int ctr_crypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + kernel_fpu_begin(); + while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { + aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, walk.iv); + nbytes &= AES_BLOCK_SIZE - 1; + err = blkcipher_walk_done(desc, &walk, nbytes); + } + if (walk.nbytes) { + ctr_crypt_final(ctx, &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + kernel_fpu_end(); + + return err; +} + +static struct crypto_alg blk_ctr_alg = { + .cra_name = "__ctr-aes-aesni", + .cra_driver_name = "__driver-ctr-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1, + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aes_set_key, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}; + static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, unsigned int key_len) { @@ -467,13 +536,11 @@ static struct crypto_alg ablk_cbc_alg = { }, }; -#ifdef HAS_CTR static int ablk_ctr_init(struct crypto_tfm *tfm) { struct cryptd_ablkcipher *cryptd_tfm; - cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))", - 0, 0); + cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-aes-aesni", 0, 0); if (IS_ERR(cryptd_tfm)) return PTR_ERR(cryptd_tfm); ablk_init_common(tfm, cryptd_tfm); @@ -500,11 +567,50 @@ static struct crypto_alg ablk_ctr_alg = { .ivsize = AES_BLOCK_SIZE, .setkey = ablk_set_key, .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, + .decrypt = ablk_encrypt, .geniv = "chainiv", }, }, }; + +#ifdef HAS_CTR +static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher( + "rfc3686(__driver-ctr-aes-aesni)", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_rfc3686_ctr_alg = { + .cra_name = "rfc3686(ctr(aes))", + .cra_driver_name = "rfc3686-ctr-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_rfc3686_ctr_alg.cra_list), + .cra_init = ablk_rfc3686_ctr_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE+CTR_RFC3686_NONCE_SIZE, + .max_keysize = AES_MAX_KEY_SIZE+CTR_RFC3686_NONCE_SIZE, + .ivsize = CTR_RFC3686_IV_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + .geniv = "seqiv", + }, + }, +}; #endif #ifdef HAS_LRW @@ -640,13 +746,17 @@ static int __init aesni_init(void) goto blk_ecb_err; if ((err = crypto_register_alg(&blk_cbc_alg))) goto blk_cbc_err; + if ((err = crypto_register_alg(&blk_ctr_alg))) + goto blk_ctr_err; if ((err = crypto_register_alg(&ablk_ecb_alg))) goto ablk_ecb_err; if ((err = crypto_register_alg(&ablk_cbc_alg))) goto ablk_cbc_err; -#ifdef HAS_CTR if ((err = crypto_register_alg(&ablk_ctr_alg))) goto ablk_ctr_err; +#ifdef HAS_CTR + if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg))) + goto ablk_rfc3686_ctr_err; #endif #ifdef HAS_LRW if ((err = crypto_register_alg(&ablk_lrw_alg))) @@ -675,13 +785,17 @@ ablk_pcbc_err: ablk_lrw_err: #endif #ifdef HAS_CTR + crypto_unregister_alg(&ablk_rfc3686_ctr_alg); +ablk_rfc3686_ctr_err: +#endif crypto_unregister_alg(&ablk_ctr_alg); ablk_ctr_err: -#endif crypto_unregister_alg(&ablk_cbc_alg); ablk_cbc_err: crypto_unregister_alg(&ablk_ecb_alg); ablk_ecb_err: + crypto_unregister_alg(&blk_ctr_alg); +blk_ctr_err: crypto_unregister_alg(&blk_cbc_alg); blk_cbc_err: crypto_unregister_alg(&blk_ecb_alg); @@ -705,10 +819,12 @@ static void __exit aesni_exit(void) crypto_unregister_alg(&ablk_lrw_alg); #endif #ifdef HAS_CTR - crypto_unregister_alg(&ablk_ctr_alg); + crypto_unregister_alg(&ablk_rfc3686_ctr_alg); #endif + crypto_unregister_alg(&ablk_ctr_alg); crypto_unregister_alg(&ablk_cbc_alg); crypto_unregister_alg(&ablk_ecb_alg); + crypto_unregister_alg(&blk_ctr_alg); crypto_unregister_alg(&blk_cbc_alg); crypto_unregister_alg(&blk_ecb_alg); crypto_unregister_alg(&__aesni_alg); -- cgit v1.2.3-59-g8ed1b From 32cbd7dfce93382a70f155bf539871b4c55bed29 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Sat, 13 Mar 2010 16:28:42 +0800 Subject: crypto: aesni-intel - Fix CTR optimization build failure with gas 2.16.1 Andrew Morton reported that AES-NI CTR optimization failed to compile with gas 2.16.1, the error message is as follow: arch/x86/crypto/aesni-intel_asm.S: Assembler messages: arch/x86/crypto/aesni-intel_asm.S:752: Error: suffix or operands invalid for `movq' arch/x86/crypto/aesni-intel_asm.S:753: Error: suffix or operands invalid for `movq' To fix this, a gas macro is defined to assemble movq with 64bit general purpose registers and XMM registers. The macro will generate the raw .byte sequence for needed instructions. Reported-by: Andrew Morton Signed-off-by: Huang Ying Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_asm.S | 4 +- arch/x86/include/asm/inst.h | 96 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 95 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 822846a9eba9..ff16756a51c1 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -749,8 +749,8 @@ _aesni_inc_init: movaps IV, CTR PSHUFB_XMM BSWAP_MASK CTR mov $1, TCTR_LOW - movq TCTR_LOW, INC - movq CTR, TCTR_LOW + MOVQ_R64_XMM TCTR_LOW INC + MOVQ_R64_XMM CTR TCTR_LOW ret /* diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index 14cf526091f9..840a399701b2 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -7,7 +7,66 @@ #ifdef __ASSEMBLY__ +#define REG_NUM_INVALID 100 + +#define REG_TYPE_R64 0 +#define REG_TYPE_XMM 1 +#define REG_TYPE_INVALID 100 + + .macro R64_NUM opd r64 + \opd = REG_NUM_INVALID + .ifc \r64,%rax + \opd = 0 + .endif + .ifc \r64,%rcx + \opd = 1 + .endif + .ifc \r64,%rdx + \opd = 2 + .endif + .ifc \r64,%rbx + \opd = 3 + .endif + .ifc \r64,%rsp + \opd = 4 + .endif + .ifc \r64,%rbp + \opd = 5 + .endif + .ifc \r64,%rsi + \opd = 6 + .endif + .ifc \r64,%rdi + \opd = 7 + .endif + .ifc \r64,%r8 + \opd = 8 + .endif + .ifc \r64,%r9 + \opd = 9 + .endif + .ifc \r64,%r10 + \opd = 10 + .endif + .ifc \r64,%r11 + \opd = 11 + .endif + .ifc \r64,%r12 + \opd = 12 + .endif + .ifc \r64,%r13 + \opd = 13 + .endif + .ifc \r64,%r14 + \opd = 14 + .endif + .ifc \r64,%r15 + \opd = 15 + .endif + .endm + .macro XMM_NUM opd xmm + \opd = REG_NUM_INVALID .ifc \xmm,%xmm0 \opd = 0 .endif @@ -58,13 +117,25 @@ .endif .endm + .macro REG_TYPE type reg + R64_NUM reg_type_r64 \reg + XMM_NUM reg_type_xmm \reg + .if reg_type_r64 != REG_NUM_INVALID + \type = REG_TYPE_R64 + .elseif reg_type_xmm != REG_NUM_INVALID + \type = REG_TYPE_XMM + .else + \type = REG_TYPE_INVALID + .endif + .endm + .macro PFX_OPD_SIZE .byte 0x66 .endm - .macro PFX_REX opd1 opd2 - .if (\opd1 | \opd2) & 8 - .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) + .macro PFX_REX opd1 opd2 W=0 + .if ((\opd1 | \opd2) & 8) || \W + .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3) .endif .endm @@ -145,6 +216,25 @@ .byte 0x0f, 0x38, 0xdf MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2 .endm + + .macro MOVQ_R64_XMM opd1 opd2 + REG_TYPE movq_r64_xmm_opd1_type \opd1 + .if movq_r64_xmm_opd1_type == REG_TYPE_XMM + XMM_NUM movq_r64_xmm_opd1 \opd1 + R64_NUM movq_r64_xmm_opd2 \opd2 + .else + R64_NUM movq_r64_xmm_opd1 \opd1 + XMM_NUM movq_r64_xmm_opd2 \opd2 + .endif + PFX_OPD_SIZE + PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1 + .if movq_r64_xmm_opd1_type == REG_TYPE_XMM + .byte 0x0f, 0x7e + .else + .byte 0x0f, 0x6e + .endif + MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2 + .endm #endif #endif -- cgit v1.2.3-59-g8ed1b From 62e7bec49479e0c61e8cfd914f722a9ca6fd52e5 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 24 Mar 2010 21:37:57 +0800 Subject: crypto: aesni-intel - Fix another CTR build failure with gas 2.16.1 The previous AES-NI CTR optimization compiling failure gas 2.16.1 fix introduces another compiling failure by itself. This patch fixes that. Reported-by: Andrew Morton Signed-off-by: Huang Ying Signed-off-by: Herbert Xu --- arch/x86/include/asm/inst.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index 840a399701b2..280bf7fb6aba 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -120,9 +120,9 @@ .macro REG_TYPE type reg R64_NUM reg_type_r64 \reg XMM_NUM reg_type_xmm \reg - .if reg_type_r64 != REG_NUM_INVALID + .if reg_type_r64 <> REG_NUM_INVALID \type = REG_TYPE_R64 - .elseif reg_type_xmm != REG_NUM_INVALID + .elseif reg_type_xmm <> REG_NUM_INVALID \type = REG_TYPE_XMM .else \type = REG_TYPE_INVALID -- cgit v1.2.3-59-g8ed1b