author    Eric Biggers <ebiggers@google.com>    2024-12-12 13:28:44 -0800
committer Herbert Xu <herbert@gondor.apana.org.au>    2024-12-21 22:46:24 +0800
commit    68e95f5c6418ce1d0171fa756608a84170c56165 (patch)
tree      a58d31bc0b6eecfea00d88c907286bd314ba6c1d
parent    crypto: x86/aes-xts - change len parameter to int (diff)
download  wireguard-linux-68e95f5c6418ce1d0171fa756608a84170c56165.tar.xz
          wireguard-linux-68e95f5c6418ce1d0171fa756608a84170c56165.zip
crypto: x86/aes-xts - more code size optimizations
Prefer immediates of -128 to 128, since the former fits in a signed byte,
saving 3 bytes per instruction. Also prefer VEX-coded instructions to EVEX
where this is easy to do.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-rw-r--r--    arch/x86/crypto/aes-xts-avx-x86_64.S    24
1 file changed, 13 insertions, 11 deletions
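Both savings come from x86 encoding details: an immediate in the range [-128, 127] can use the sign-extended imm8 form while 128 needs a full imm32, and register-only AVX instructions can use the short VEX prefix whereas AVX-512 mnemonics such as vpxord force the 4-byte EVEX prefix. The standalone sketch below is illustrative only and is not taken from the patch; the register choices (%rsi, %ymm1-%ymm3) are arbitrary, and the byte sequences in the comments are the encodings a GNU assembler typically emits for these operands (check with objdump -d).

	// Immediate size: 128 needs imm32, -128 fits in imm8, saving 3 bytes.
	sub	$128, %rsi		// 48 81 ee 80 00 00 00   (imm32 form, 7 bytes)
	add	$-128, %rsi		// 48 83 c6 80            (imm8 form, 4 bytes)

	// Prefix size: EVEX-only vpxord vs VEX-coded vpxor, saving 2 bytes.
	vpxord	%ymm1, %ymm2, %ymm3	// 62 f1 6d 28 ef d9      (EVEX, 6 bytes)
	vpxor	%ymm1, %ymm2, %ymm3	// c5 ed ef d9            (VEX, 4 bytes)

This is why the patch below rewrites 'sub $4*VL, LEN' as 'add $-4*VL, LEN' (4*VL is 128 when VL=32) and uses the VEX-coded vpxor when VL < 64, which in turn requires the register operands to stay within the first 16 vector registers, since VEX cannot encode ymm16-ymm31.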
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index c4e8ba6ed61d..0e6b9ae12e95 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -188,6 +188,7 @@
.endm
// Move a vector between memory and a register.
+// The register operand must be in the first 16 vector registers.
.macro _vmovdqu src, dst
.if VL < 64
vmovdqu \src, \dst
@@ -208,11 +209,12 @@
.endm
// XOR two vectors together.
+// Any register operands must be in the first 16 vector registers.
.macro _vpxor src1, src2, dst
-.if USE_AVX10
- vpxord \src1, \src2, \dst
-.else
+.if VL < 64
vpxor \src1, \src2, \dst
+.else
+ vpxord \src1, \src2, \dst
.endif
.endm
@@ -555,7 +557,7 @@
// Compute the first set of tweaks TWEAK[0-3].
_compute_first_set_of_tweaks
- sub $4*VL, LEN
+ add $-4*VL, LEN // shorter than 'sub 4*VL' when VL=32
jl .Lhandle_remainder\@
.Lmain_loop\@:
@@ -563,10 +565,10 @@
// XOR each source block with its tweak and the zero-th round key.
.if USE_AVX10
- vmovdqu8 0*VL(SRC), V0
- vmovdqu8 1*VL(SRC), V1
- vmovdqu8 2*VL(SRC), V2
- vmovdqu8 3*VL(SRC), V3
+ _vmovdqu 0*VL(SRC), V0
+ _vmovdqu 1*VL(SRC), V1
+ _vmovdqu 2*VL(SRC), V2
+ _vmovdqu 3*VL(SRC), V3
vpternlogd $0x96, TWEAK0, KEY0, V0
vpternlogd $0x96, TWEAK1, KEY0, V1
vpternlogd $0x96, TWEAK2, KEY0, V2
@@ -612,9 +614,9 @@
// Finish computing the next set of tweaks.
_tweak_step 1000
- add $4*VL, SRC
- add $4*VL, DST
- sub $4*VL, LEN
+ sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32
+ sub $-4*VL, DST
+ add $-4*VL, LEN
jge .Lmain_loop\@
// Check for the uncommon case where the data length isn't a multiple of