author	Eric Biggers <ebiggers@google.com>	2024-12-12 13:28:44 -0800
committer	Herbert Xu <herbert@gondor.apana.org.au>	2024-12-21 22:46:24 +0800
commit	68e95f5c6418ce1d0171fa756608a84170c56165 (patch)
tree	a58d31bc0b6eecfea00d88c907286bd314ba6c1d
parent	crypto: x86/aes-xts - change len parameter to int (diff)
crypto: x86/aes-xts - more code size optimizations
Prefer immediates of -128 to 128, since the former fits in a signed
byte, saving 3 bytes per instruction. Also prefer VEX-coded
instructions to EVEX where this is easy to do.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
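[Editor's note, not part of the commit: the encoding detail behind the first optimization is that x86 arithmetic instructions have a 1-byte immediate form (imm8, sign-extended) and a 4-byte form (imm32). 128 needs imm32, but -128 fits in imm8, so rewriting 'sub $128' as 'add $-128' (and the reverse) saves 3 bytes per instruction. A minimal standalone sketch; the file name, label, and register choice are made up, and the encodings in the comments can be checked with objdump:

	# demo.s -- inspect with: as demo.s -o demo.o && objdump -d demo.o
	.text
imm8_demo:
	sub	$128, %rcx	# 48 81 e9 80 00 00 00  -> 7 bytes (imm32)
	add	$-128, %rcx	# 48 83 c1 80           -> 4 bytes (imm8)
	add	$128, %rcx	# 48 81 c1 80 00 00 00  -> 7 bytes (imm32)
	sub	$-128, %rcx	# 48 83 e9 80           -> 4 bytes (imm8)

This is the rewrite the diff below applies to LEN, SRC, and DST, since 4*VL equals 128 when VL=32.]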
 arch/x86/crypto/aes-xts-avx-x86_64.S | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index c4e8ba6ed61d..0e6b9ae12e95 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -188,6 +188,7 @@
 .endm
 
 // Move a vector between memory and a register.
+// The register operand must be in the first 16 vector registers.
 .macro _vmovdqu	src, dst
 .if VL < 64
 	vmovdqu		\src, \dst
@@ -208,11 +209,12 @@
 .endm
 
 // XOR two vectors together.
+// Any register operands must be in the first 16 vector registers.
 .macro _vpxor	src1, src2, dst
-.if USE_AVX10
-	vpxord		\src1, \src2, \dst
-.else
+.if VL < 64
 	vpxor		\src1, \src2, \dst
+.else
+	vpxord		\src1, \src2, \dst
 .endif
 .endm
 
@@ -555,7 +557,7 @@
 	// Compute the first set of tweaks TWEAK[0-3].
 	_compute_first_set_of_tweaks
 
-	sub		$4*VL, LEN
+	add		$-4*VL, LEN	// shorter than 'sub 4*VL' when VL=32
 	jl		.Lhandle_remainder\@
 
 .Lmain_loop\@:
@@ -563,10 +565,10 @@
 
 	// XOR each source block with its tweak and the zero-th round key.
 .if USE_AVX10
-	vmovdqu8	0*VL(SRC), V0
-	vmovdqu8	1*VL(SRC), V1
-	vmovdqu8	2*VL(SRC), V2
-	vmovdqu8	3*VL(SRC), V3
+	_vmovdqu	0*VL(SRC), V0
+	_vmovdqu	1*VL(SRC), V1
+	_vmovdqu	2*VL(SRC), V2
+	_vmovdqu	3*VL(SRC), V3
 	vpternlogd	$0x96, TWEAK0, KEY0, V0
 	vpternlogd	$0x96, TWEAK1, KEY0, V1
 	vpternlogd	$0x96, TWEAK2, KEY0, V2
@@ -612,9 +614,9 @@
 	// Finish computing the next set of tweaks.
 	_tweak_step	1000
 
-	add		$4*VL, SRC
-	add		$4*VL, DST
-	sub		$4*VL, LEN
+	sub		$-4*VL, SRC	// shorter than 'add 4*VL' when VL=32
+	sub		$-4*VL, DST
+	add		$-4*VL, LEN
 	jge		.Lmain_loop\@
 
 	// Check for the uncommon case where the data length isn't a multiple of
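[Editor's note on the VEX/EVEX point, an illustrative sketch rather than part of the commit: vpxor has only a VEX encoding, which is 2 bytes shorter here than the EVEX-only vpxord, but VEX cannot address vector registers beyond (x/y)mm15; that is why the patch adds the "first 16 vector registers" comments to _vmovdqu and _vpxor. The label below is made up, and the last line needs AVX512VL-aware binutils to assemble:

vex_demo:
	vpxor	%ymm0, %ymm1, %ymm2	# VEX:  c5 f5 ef d0        -> 4 bytes
	vpxord	%ymm0, %ymm1, %ymm2	# EVEX: 62 f1 75 28 ef d0  -> 6 bytes
	vpxord	%ymm16, %ymm1, %ymm2	# ymm16 has no VEX form; EVEX required]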