aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto/aesni-intel_asm.S
diff options
context:
space:
mode:
authorArd Biesheuvel <ardb@kernel.org>2020-12-31 17:41:54 +0100
committerHerbert Xu <herbert@gondor.apana.org.au>2021-01-08 15:39:47 +1100
commit86ad60a65f29dd862a11c22bb4b5be28d6c5cef1 (patch)
treed0f396b57e398f50604e9e9fb20e793a02b9ccf0 /arch/x86/crypto/aesni-intel_asm.S
parentcrypto: picoxcell - Remove PicoXcell driver (diff)
downloadlinux-dev-86ad60a65f29dd862a11c22bb4b5be28d6c5cef1.tar.xz
linux-dev-86ad60a65f29dd862a11c22bb4b5be28d6c5cef1.zip
crypto: x86/aes-ni-xts - use direct calls to and 4-way stride
The XTS asm helper arrangement is a bit odd: the 8-way stride helper consists of back-to-back calls to the 4-way core transforms, which are called indirectly, based on a boolean that indicates whether we are performing encryption or decryption. Given how costly indirect calls are on x86, let's switch to direct calls, and given how the 8-way stride doesn't really add anything substantial, use a 4-way stride instead, and make the asm core routine deal with any multiple of 4 blocks. Since 512 byte sectors or 4 KB blocks are the typical quantities XTS operates on, increase the stride exported to the glue helper to 512 bytes as well. As a result, the number of indirect calls is reduced from 3 per 64 bytes of in/output to 1 per 512 bytes of in/output, which produces a 65% speedup when operating on 1 KB blocks (measured on a Intel(R) Core(TM) i7-8650U CPU) Fixes: 9697fa39efd3f ("x86/retpoline/crypto: Convert crypto assembler indirect jumps") Tested-by: Eric Biggers <ebiggers@google.com> # x86_64 Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/aesni-intel_asm.S')
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S115
1 files changed, 70 insertions, 45 deletions
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index a2710f76862f..84d8a156cdcd 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -2842,25 +2842,18 @@ SYM_FUNC_END(aesni_ctr_enc)
pxor CTR, IV;
/*
- * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
- * const u8 *src, bool enc, le128 *iv)
+ * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
*/
-SYM_FUNC_START(aesni_xts_crypt8)
+SYM_FUNC_START(aesni_xts_encrypt)
FRAME_BEGIN
- testb %cl, %cl
- movl $0, %ecx
- movl $240, %r10d
- leaq _aesni_enc4, %r11
- leaq _aesni_dec4, %rax
- cmovel %r10d, %ecx
- cmoveq %rax, %r11
movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
movups (IVP), IV
mov 480(KEYP), KLEN
- addq %rcx, KEYP
+.Lxts_enc_loop4:
movdqa IV, STATE1
movdqu 0x00(INP), INC
pxor INC, STATE1
@@ -2884,71 +2877,103 @@ SYM_FUNC_START(aesni_xts_crypt8)
pxor INC, STATE4
movdqu IV, 0x30(OUTP)
- CALL_NOSPEC r11
+ call _aesni_enc4
movdqu 0x00(OUTP), INC
pxor INC, STATE1
movdqu STATE1, 0x00(OUTP)
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE1
- movdqu 0x40(INP), INC
- pxor INC, STATE1
- movdqu IV, 0x40(OUTP)
-
movdqu 0x10(OUTP), INC
pxor INC, STATE2
movdqu STATE2, 0x10(OUTP)
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE2
- movdqu 0x50(INP), INC
- pxor INC, STATE2
- movdqu IV, 0x50(OUTP)
-
movdqu 0x20(OUTP), INC
pxor INC, STATE3
movdqu STATE3, 0x20(OUTP)
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE3
- movdqu 0x60(INP), INC
- pxor INC, STATE3
- movdqu IV, 0x60(OUTP)
-
movdqu 0x30(OUTP), INC
pxor INC, STATE4
movdqu STATE4, 0x30(OUTP)
_aesni_gf128mul_x_ble()
- movdqa IV, STATE4
- movdqu 0x70(INP), INC
- pxor INC, STATE4
- movdqu IV, 0x70(OUTP)
- _aesni_gf128mul_x_ble()
+ add $64, INP
+ add $64, OUTP
+ sub $64, LEN
+ ja .Lxts_enc_loop4
+
movups IV, (IVP)
- CALL_NOSPEC r11
+ FRAME_END
+ ret
+SYM_FUNC_END(aesni_xts_encrypt)
+
+/*
+ * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
+ */
+SYM_FUNC_START(aesni_xts_decrypt)
+ FRAME_BEGIN
+
+ movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+ movups (IVP), IV
+
+ mov 480(KEYP), KLEN
+ add $240, KEYP
- movdqu 0x40(OUTP), INC
+.Lxts_dec_loop4:
+ movdqa IV, STATE1
+ movdqu 0x00(INP), INC
pxor INC, STATE1
- movdqu STATE1, 0x40(OUTP)
+ movdqu IV, 0x00(OUTP)
- movdqu 0x50(OUTP), INC
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE2
+ movdqu 0x10(INP), INC
+ pxor INC, STATE2
+ movdqu IV, 0x10(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE3
+ movdqu 0x20(INP), INC
+ pxor INC, STATE3
+ movdqu IV, 0x20(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE4
+ movdqu 0x30(INP), INC
+ pxor INC, STATE4
+ movdqu IV, 0x30(OUTP)
+
+ call _aesni_dec4
+
+ movdqu 0x00(OUTP), INC
+ pxor INC, STATE1
+ movdqu STATE1, 0x00(OUTP)
+
+ movdqu 0x10(OUTP), INC
pxor INC, STATE2
- movdqu STATE2, 0x50(OUTP)
+ movdqu STATE2, 0x10(OUTP)
- movdqu 0x60(OUTP), INC
+ movdqu 0x20(OUTP), INC
pxor INC, STATE3
- movdqu STATE3, 0x60(OUTP)
+ movdqu STATE3, 0x20(OUTP)
- movdqu 0x70(OUTP), INC
+ movdqu 0x30(OUTP), INC
pxor INC, STATE4
- movdqu STATE4, 0x70(OUTP)
+ movdqu STATE4, 0x30(OUTP)
+
+ _aesni_gf128mul_x_ble()
+
+ add $64, INP
+ add $64, OUTP
+ sub $64, LEN
+ ja .Lxts_dec_loop4
+
+ movups IV, (IVP)
FRAME_END
ret
-SYM_FUNC_END(aesni_xts_crypt8)
+SYM_FUNC_END(aesni_xts_decrypt)
#endif