aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r--arch/x86/crypto/.gitignore1
-rw-r--r--arch/x86/crypto/Kconfig484
-rw-r--r--arch/x86/crypto/Makefile181
-rw-r--r--arch/x86/crypto/aegis128-aesni-asm.S48
-rw-r--r--arch/x86/crypto/aes_ctrby8_avx-x86_64.S308
-rw-r--r--arch/x86/crypto/aes_glue.c1
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S1291
-rw-r--r--arch/x86/crypto/aesni-intel_avx-x86_64.S95
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c881
-rw-r--r--arch/x86/crypto/aria-aesni-avx-asm_64.S1303
-rw-r--r--arch/x86/crypto/aria-avx.h16
-rw-r--r--arch/x86/crypto/aria_aesni_avx_glue.c213
-rw-r--r--arch/x86/crypto/blake2s-core.S6
-rw-r--r--arch/x86/crypto/blake2s-glue.c180
-rw-r--r--arch/x86/crypto/blowfish-x86_64-asm_64.S17
-rw-r--r--arch/x86/crypto/blowfish_glue.c131
-rw-r--r--arch/x86/crypto/camellia-aesni-avx-asm_64.S312
-rw-r--r--arch/x86/crypto/camellia-aesni-avx2-asm_64.S370
-rw-r--r--arch/x86/crypto/camellia-x86_64-asm_64.S12
-rw-r--r--arch/x86/crypto/camellia.h67
-rw-r--r--arch/x86/crypto/camellia_aesni_avx2_glue.c198
-rw-r--r--arch/x86/crypto/camellia_aesni_avx_glue.c216
-rw-r--r--arch/x86/crypto/camellia_glue.c153
-rw-r--r--arch/x86/crypto/cast5-avx-x86_64-asm_64.S12
-rw-r--r--arch/x86/crypto/cast5_avx_glue.c287
-rw-r--r--arch/x86/crypto/cast6-avx-x86_64-asm_64.S94
-rw-r--r--arch/x86/crypto/cast6_avx_glue.c207
-rw-r--r--arch/x86/crypto/chacha-avx2-x86_64.S6
-rw-r--r--arch/x86/crypto/chacha-avx512vl-x86_64.S10
-rw-r--r--arch/x86/crypto/chacha-ssse3-x86_64.S24
-rw-r--r--arch/x86/crypto/chacha_glue.c38
-rw-r--r--arch/x86/crypto/crc32-pclmul_asm.S73
-rw-r--r--arch/x86/crypto/crc32-pclmul_glue.c4
-rw-r--r--arch/x86/crypto/crc32c-intel_glue.c20
-rw-r--r--arch/x86/crypto/crc32c-pcl-intel-asm_64.S35
-rw-r--r--arch/x86/crypto/crct10dif-pcl-asm_64.S2
-rw-r--r--arch/x86/crypto/crct10dif-pclmul_glue.c2
-rw-r--r--arch/x86/crypto/curve25519-x86_64.c3754
-rw-r--r--arch/x86/crypto/des3_ede-asm_64.S4
-rw-r--r--arch/x86/crypto/des3_ede_glue.c116
-rw-r--r--arch/x86/crypto/ecb_cbc_helpers.h76
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_asm.S23
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c2
-rw-r--r--arch/x86/crypto/glue_helper-asm-avx.S104
-rw-r--r--arch/x86/crypto/glue_helper-asm-avx2.S136
-rw-r--r--arch/x86/crypto/glue_helper.c381
-rw-r--r--arch/x86/crypto/nh-avx2-x86_64.S2
-rw-r--r--arch/x86/crypto/nh-sse2-x86_64.S2
-rw-r--r--arch/x86/crypto/nhpoly1305-avx2-glue.c3
-rw-r--r--arch/x86/crypto/nhpoly1305-sse2-glue.c3
-rw-r--r--arch/x86/crypto/poly1305-x86_64-cryptogams.pl64
-rw-r--r--arch/x86/crypto/poly1305_glue.c35
-rw-r--r--arch/x86/crypto/polyval-clmulni_asm.S321
-rw-r--r--arch/x86/crypto/polyval-clmulni_glue.c212
-rw-r--r--arch/x86/crypto/serpent-avx-x86_64-asm_64.S78
-rw-r--r--arch/x86/crypto/serpent-avx.h21
-rw-r--r--arch/x86/crypto/serpent-avx2-asm_64.S97
-rw-r--r--arch/x86/crypto/serpent-sse2-i586-asm_32.S6
-rw-r--r--arch/x86/crypto/serpent-sse2-x86_64-asm_64.S6
-rw-r--r--arch/x86/crypto/serpent-sse2.h60
-rw-r--r--arch/x86/crypto/serpent_avx2_glue.c193
-rw-r--r--arch/x86/crypto/serpent_avx_glue.c215
-rw-r--r--arch/x86/crypto/serpent_sse2_glue.c150
-rw-r--r--arch/x86/crypto/sha1_avx2_x86_64_asm.S10
-rw-r--r--arch/x86/crypto/sha1_ni_asm.S10
-rw-r--r--arch/x86/crypto/sha1_ssse3_asm.S6
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c16
-rw-r--r--arch/x86/crypto/sha256-avx-asm.S5
-rw-r--r--arch/x86/crypto/sha256-avx2-asm.S18
-rw-r--r--arch/x86/crypto/sha256-ssse3-asm.S2
-rw-r--r--arch/x86/crypto/sha256_ni_asm.S2
-rw-r--r--arch/x86/crypto/sha256_ssse3_glue.c15
-rw-r--r--arch/x86/crypto/sha512-avx-asm.S47
-rw-r--r--arch/x86/crypto/sha512-avx2-asm.S47
-rw-r--r--arch/x86/crypto/sha512-ssse3-asm.S45
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c23
-rw-r--r--arch/x86/crypto/sm3-avx-asm_64.S517
-rw-r--r--arch/x86/crypto/sm3_avx_glue.c134
-rw-r--r--arch/x86/crypto/sm4-aesni-avx-asm_64.S594
-rw-r--r--arch/x86/crypto/sm4-aesni-avx2-asm_64.S501
-rw-r--r--arch/x86/crypto/sm4-avx.h24
-rw-r--r--arch/x86/crypto/sm4_aesni_avx2_glue.c169
-rw-r--r--arch/x86/crypto/sm4_aesni_avx_glue.c487
-rw-r--r--arch/x86/crypto/twofish-avx-x86_64-asm_64.S90
-rw-r--r--arch/x86/crypto/twofish-i586-asm_32.S4
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64-3way.S8
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64.S4
-rw-r--r--arch/x86/crypto/twofish.h21
-rw-r--r--arch/x86/crypto/twofish_avx_glue.c211
-rw-r--r--arch/x86/crypto/twofish_glue.c8
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c170
91 files changed, 8939 insertions, 7611 deletions
diff --git a/arch/x86/crypto/.gitignore b/arch/x86/crypto/.gitignore
index 30be0400a439..580c839bb177 100644
--- a/arch/x86/crypto/.gitignore
+++ b/arch/x86/crypto/.gitignore
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
poly1305-x86_64-cryptogams.S
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
new file mode 100644
index 000000000000..71c4c473d34b
--- /dev/null
+++ b/arch/x86/crypto/Kconfig
@@ -0,0 +1,484 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menu "Accelerated Cryptographic Algorithms for CPU (x86)"
+
+config CRYPTO_CURVE25519_X86
+ tristate "Public key crypto: Curve25519 (ADX)"
+ depends on X86 && 64BIT
+ select CRYPTO_LIB_CURVE25519_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+ help
+ Curve25519 algorithm
+
+ Architecture: x86_64 using:
+ - ADX (large integer arithmetic)
+
+config CRYPTO_AES_NI_INTEL
+ tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTR, XTS, GCM (AES-NI)"
+ depends on X86
+ select CRYPTO_AEAD
+ select CRYPTO_LIB_AES
+ select CRYPTO_ALGAPI
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SIMD
+ help
+ Block cipher: AES cipher algorithms
+ AEAD cipher: AES with GCM
+ Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XTR, XTS
+
+ Architecture: x86 (32-bit and 64-bit) using:
+ - AES-NI (AES new instructions)
+
+config CRYPTO_BLOWFISH_X86_64
+ tristate "Ciphers: Blowfish, modes: ECB, CBC"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_BLOWFISH_COMMON
+ imply CRYPTO_CTR
+ help
+ Block cipher: Blowfish cipher algorithm
+ Length-preserving ciphers: Blowfish with ECB and CBC modes
+
+ Architecture: x86_64
+
+config CRYPTO_CAMELLIA_X86_64
+ tristate "Ciphers: Camellia with modes: ECB, CBC"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ imply CRYPTO_CTR
+ help
+ Block cipher: Camellia cipher algorithms
+ Length-preserving ciphers: Camellia with ECB and CBC modes
+
+ Architecture: x86_64
+
+config CRYPTO_CAMELLIA_AESNI_AVX_X86_64
+ tristate "Ciphers: Camellia with modes: ECB, CBC (AES-NI/AVX)"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_CAMELLIA_X86_64
+ select CRYPTO_SIMD
+ imply CRYPTO_XTS
+ help
+ Length-preserving ciphers: Camellia with ECB and CBC modes
+
+ Architecture: x86_64 using:
+ - AES-NI (AES New Instructions)
+ - AVX (Advanced Vector Extensions)
+
+config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64
+ tristate "Ciphers: Camellia with modes: ECB, CBC (AES-NI/AVX2)"
+ depends on X86 && 64BIT
+ select CRYPTO_CAMELLIA_AESNI_AVX_X86_64
+ help
+ Length-preserving ciphers: Camellia with ECB and CBC modes
+
+ Architecture: x86_64 using:
+ - AES-NI (AES New Instructions)
+ - AVX2 (Advanced Vector Extensions 2)
+
+config CRYPTO_CAST5_AVX_X86_64
+ tristate "Ciphers: CAST5 with modes: ECB, CBC (AVX)"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_CAST5
+ select CRYPTO_CAST_COMMON
+ select CRYPTO_SIMD
+ imply CRYPTO_CTR
+ help
+ Length-preserving ciphers: CAST5 (CAST-128) cipher algorithm
+ (RFC2144) with ECB and CBC modes
+
+ Architecture: x86_64 using:
+ - AVX (Advanced Vector Extensions)
+
+ Processes 16 blocks in parallel.
+
+config CRYPTO_CAST6_AVX_X86_64
+ tristate "Ciphers: CAST6 with modes: ECB, CBC (AVX)"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_CAST6
+ select CRYPTO_CAST_COMMON
+ select CRYPTO_SIMD
+ imply CRYPTO_XTS
+ imply CRYPTO_CTR
+ help
+ Length-preserving ciphers: CAST6 (CAST-256) cipher algorithm
+ (RFC2612) with ECB and CBC modes
+
+ Architecture: x86_64 using:
+ - AVX (Advanced Vector Extensions)
+
+ Processes eight blocks in parallel.
+
+config CRYPTO_DES3_EDE_X86_64
+ tristate "Ciphers: Triple DES EDE with modes: ECB, CBC"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_DES
+ imply CRYPTO_CTR
+ help
+ Block cipher: Triple DES EDE (FIPS 46-3) cipher algorithm
+ Length-preserving ciphers: Triple DES EDE with ECB and CBC modes
+
+ Architecture: x86_64
+
+ Processes one or three blocks in parallel.
+
+config CRYPTO_SERPENT_SSE2_X86_64
+ tristate "Ciphers: Serpent with modes: ECB, CBC (SSE2)"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SERPENT
+ select CRYPTO_SIMD
+ imply CRYPTO_CTR
+ help
+ Length-preserving ciphers: Serpent cipher algorithm
+ with ECB and CBC modes
+
+ Architecture: x86_64 using:
+ - SSE2 (Streaming SIMD Extensions 2)
+
+ Processes eight blocks in parallel.
+
+config CRYPTO_SERPENT_SSE2_586
+ tristate "Ciphers: Serpent with modes: ECB, CBC (32-bit with SSE2)"
+ depends on X86 && !64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SERPENT
+ select CRYPTO_SIMD
+ imply CRYPTO_CTR
+ help
+ Length-preserving ciphers: Serpent cipher algorithm
+ with ECB and CBC modes
+
+ Architecture: x86 (32-bit) using:
+ - SSE2 (Streaming SIMD Extensions 2)
+
+ Processes four blocks in parallel.
+
+config CRYPTO_SERPENT_AVX_X86_64
+ tristate "Ciphers: Serpent with modes: ECB, CBC (AVX)"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SERPENT
+ select CRYPTO_SIMD
+ imply CRYPTO_XTS
+ imply CRYPTO_CTR
+ help
+ Length-preserving ciphers: Serpent cipher algorithm
+ with ECB and CBC modes
+
+ Architecture: x86_64 using:
+ - AVX (Advanced Vector Extensions)
+
+ Processes eight blocks in parallel.
+
+config CRYPTO_SERPENT_AVX2_X86_64
+ tristate "Ciphers: Serpent with modes: ECB, CBC (AVX2)"
+ depends on X86 && 64BIT
+ select CRYPTO_SERPENT_AVX_X86_64
+ help
+ Length-preserving ciphers: Serpent cipher algorithm
+ with ECB and CBC modes
+
+ Architecture: x86_64 using:
+ - AVX2 (Advanced Vector Extensions 2)
+
+ Processes 16 blocks in parallel.
+
+config CRYPTO_SM4_AESNI_AVX_X86_64
+ tristate "Ciphers: SM4 with modes: ECB, CBC, CFB, CTR (AES-NI/AVX)"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SIMD
+ select CRYPTO_ALGAPI
+ select CRYPTO_SM4
+ help
+ Length-preserving ciphers: SM4 cipher algorithms
+ (OSCCA GB/T 32907-2016) with ECB, CBC, CFB, and CTR modes
+
+ Architecture: x86_64 using:
+ - AES-NI (AES New Instructions)
+ - AVX (Advanced Vector Extensions)
+
+ Through two affine transforms,
+ we can use the AES S-Box to simulate the SM4 S-Box to achieve the
+ effect of instruction acceleration.
+
+ If unsure, say N.
+
+config CRYPTO_SM4_AESNI_AVX2_X86_64
+ tristate "Ciphers: SM4 with modes: ECB, CBC, CFB, CTR (AES-NI/AVX2)"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SIMD
+ select CRYPTO_ALGAPI
+ select CRYPTO_SM4
+ select CRYPTO_SM4_AESNI_AVX_X86_64
+ help
+ Length-preserving ciphers: SM4 cipher algorithms
+ (OSCCA GB/T 32907-2016) with ECB, CBC, CFB, and CTR modes
+
+ Architecture: x86_64 using:
+ - AES-NI (AES New Instructions)
+ - AVX2 (Advanced Vector Extensions 2)
+
+ Through two affine transforms,
+ we can use the AES S-Box to simulate the SM4 S-Box to achieve the
+ effect of instruction acceleration.
+
+ If unsure, say N.
+
+config CRYPTO_TWOFISH_586
+ tristate "Ciphers: Twofish (32-bit)"
+ depends on (X86 || UML_X86) && !64BIT
+ select CRYPTO_ALGAPI
+ select CRYPTO_TWOFISH_COMMON
+ imply CRYPTO_CTR
+ help
+ Block cipher: Twofish cipher algorithm
+
+ Architecture: x86 (32-bit)
+
+config CRYPTO_TWOFISH_X86_64
+ tristate "Ciphers: Twofish"
+ depends on (X86 || UML_X86) && 64BIT
+ select CRYPTO_ALGAPI
+ select CRYPTO_TWOFISH_COMMON
+ imply CRYPTO_CTR
+ help
+ Block cipher: Twofish cipher algorithm
+
+ Architecture: x86_64
+
+config CRYPTO_TWOFISH_X86_64_3WAY
+ tristate "Ciphers: Twofish with modes: ECB, CBC (3-way parallel)"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_TWOFISH_COMMON
+ select CRYPTO_TWOFISH_X86_64
+ help
+ Length-preserving cipher: Twofish cipher algorithm
+ with ECB and CBC modes
+
+ Architecture: x86_64
+
+ Processes three blocks in parallel, better utilizing resources of
+ out-of-order CPUs.
+
+config CRYPTO_TWOFISH_AVX_X86_64
+ tristate "Ciphers: Twofish with modes: ECB, CBC (AVX)"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SIMD
+ select CRYPTO_TWOFISH_COMMON
+ select CRYPTO_TWOFISH_X86_64
+ select CRYPTO_TWOFISH_X86_64_3WAY
+ imply CRYPTO_XTS
+ help
+ Length-preserving cipher: Twofish cipher algorithm
+ with ECB and CBC modes
+
+ Architecture: x86_64 using:
+ - AVX (Advanced Vector Extensions)
+
+ Processes eight blocks in parallel.
+
+config CRYPTO_ARIA_AESNI_AVX_X86_64
+ tristate "Ciphers: ARIA with modes: ECB, CTR (AES-NI/AVX/GFNI)"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SIMD
+ select CRYPTO_ALGAPI
+ select CRYPTO_ARIA
+ help
+ Length-preserving cipher: ARIA cipher algorithms
+ (RFC 5794) with ECB and CTR modes
+
+ Architecture: x86_64 using:
+ - AES-NI (AES New Instructions)
+ - AVX (Advanced Vector Extensions)
+ - GFNI (Galois Field New Instructions)
+
+ Processes 16 blocks in parallel.
+
+config CRYPTO_CHACHA20_X86_64
+ tristate "Ciphers: ChaCha20, XChaCha20, XChaCha12 (SSSE3/AVX2/AVX-512VL)"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_CHACHA_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
+ help
+ Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
+ stream cipher algorithms
+
+ Architecture: x86_64 using:
+ - SSSE3 (Supplemental SSE3)
+ - AVX2 (Advanced Vector Extensions 2)
+ - AVX-512VL (Advanced Vector Extensions-512VL)
+
+config CRYPTO_AEGIS128_AESNI_SSE2
+ tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE2)"
+ depends on X86 && 64BIT
+ select CRYPTO_AEAD
+ select CRYPTO_SIMD
+ help
+ AEGIS-128 AEAD algorithm
+
+ Architecture: x86_64 using:
+ - AES-NI (AES New Instructions)
+ - SSE2 (Streaming SIMD Extensions 2)
+
+config CRYPTO_NHPOLY1305_SSE2
+ tristate "Hash functions: NHPoly1305 (SSE2)"
+ depends on X86 && 64BIT
+ select CRYPTO_NHPOLY1305
+ help
+ NHPoly1305 hash function for Adiantum
+
+ Architecture: x86_64 using:
+ - SSE2 (Streaming SIMD Extensions 2)
+
+config CRYPTO_NHPOLY1305_AVX2
+ tristate "Hash functions: NHPoly1305 (AVX2)"
+ depends on X86 && 64BIT
+ select CRYPTO_NHPOLY1305
+ help
+ NHPoly1305 hash function for Adiantum
+
+ Architecture: x86_64 using:
+ - AVX2 (Advanced Vector Extensions 2)
+
+config CRYPTO_BLAKE2S_X86
+ bool "Hash functions: BLAKE2s (SSSE3/AVX-512)"
+ depends on X86 && 64BIT
+ select CRYPTO_LIB_BLAKE2S_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
+ help
+ BLAKE2s cryptographic hash function (RFC 7693)
+
+ Architecture: x86_64 using:
+ - SSSE3 (Supplemental SSE3)
+ - AVX-512 (Advanced Vector Extensions-512)
+
+config CRYPTO_POLYVAL_CLMUL_NI
+ tristate "Hash functions: POLYVAL (CLMUL-NI)"
+ depends on X86 && 64BIT
+ select CRYPTO_POLYVAL
+ help
+ POLYVAL hash function for HCTR2
+
+ Architecture: x86_64 using:
+ - CLMUL-NI (carry-less multiplication new instructions)
+
+config CRYPTO_POLY1305_X86_64
+ tristate "Hash functions: Poly1305 (SSE2/AVX2)"
+ depends on X86 && 64BIT
+ select CRYPTO_LIB_POLY1305_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_POLY1305
+ help
+ Poly1305 authenticator algorithm (RFC7539)
+
+ Architecture: x86_64 using:
+ - SSE2 (Streaming SIMD Extensions 2)
+ - AVX2 (Advanced Vector Extensions 2)
+
+config CRYPTO_SHA1_SSSE3
+ tristate "Hash functions: SHA-1 (SSSE3/AVX/AVX2/SHA-NI)"
+ depends on X86 && 64BIT
+ select CRYPTO_SHA1
+ select CRYPTO_HASH
+ help
+ SHA-1 secure hash algorithm (FIPS 180)
+
+ Architecture: x86_64 using:
+ - SSSE3 (Supplemental SSE3)
+ - AVX (Advanced Vector Extensions)
+ - AVX2 (Advanced Vector Extensions 2)
+ - SHA-NI (SHA Extensions New Instructions)
+
+config CRYPTO_SHA256_SSSE3
+ tristate "Hash functions: SHA-224 and SHA-256 (SSSE3/AVX/AVX2/SHA-NI)"
+ depends on X86 && 64BIT
+ select CRYPTO_SHA256
+ select CRYPTO_HASH
+ help
+ SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
+
+ Architecture: x86_64 using:
+ - SSSE3 (Supplemental SSE3)
+ - AVX (Advanced Vector Extensions)
+ - AVX2 (Advanced Vector Extensions 2)
+ - SHA-NI (SHA Extensions New Instructions)
+
+config CRYPTO_SHA512_SSSE3
+ tristate "Hash functions: SHA-384 and SHA-512 (SSSE3/AVX/AVX2)"
+ depends on X86 && 64BIT
+ select CRYPTO_SHA512
+ select CRYPTO_HASH
+ help
+ SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
+
+ Architecture: x86_64 using:
+ - SSSE3 (Supplemental SSE3)
+ - AVX (Advanced Vector Extensions)
+ - AVX2 (Advanced Vector Extensions 2)
+
+config CRYPTO_SM3_AVX_X86_64
+ tristate "Hash functions: SM3 (AVX)"
+ depends on X86 && 64BIT
+ select CRYPTO_HASH
+ select CRYPTO_SM3
+ help
+ SM3 secure hash function as defined by OSCCA GM/T 0004-2012 SM3
+
+ Architecture: x86_64 using:
+ - AVX (Advanced Vector Extensions)
+
+ If unsure, say N.
+
+config CRYPTO_GHASH_CLMUL_NI_INTEL
+ tristate "Hash functions: GHASH (CLMUL-NI)"
+ depends on X86 && 64BIT
+ select CRYPTO_CRYPTD
+ help
+ GCM GHASH hash function (NIST SP800-38D)
+
+ Architecture: x86_64 using:
+ - CLMUL-NI (carry-less multiplication new instructions)
+
+config CRYPTO_CRC32C_INTEL
+ tristate "CRC32c (SSE4.2/PCLMULQDQ)"
+ depends on X86
+ select CRYPTO_HASH
+ help
+ CRC32c CRC algorithm with the iSCSI polynomial (RFC 3385 and RFC 3720)
+
+ Architecture: x86 (32-bit and 64-bit) using:
+ - SSE4.2 (Streaming SIMD Extensions 4.2) CRC32 instruction
+ - PCLMULQDQ (carry-less multiplication)
+
+config CRYPTO_CRC32_PCLMUL
+ tristate "CRC32 (PCLMULQDQ)"
+ depends on X86
+ select CRYPTO_HASH
+ select CRC32
+ help
+ CRC32 CRC algorithm (IEEE 802.3)
+
+ Architecture: x86 (32-bit and 64-bit) using:
+ - PCLMULQDQ (carry-less multiplication)
+
+config CRYPTO_CRCT10DIF_PCLMUL
+ tristate "CRCT10DIF (PCLMULQDQ)"
+ depends on X86 && 64BIT && CRC_T10DIF
+ select CRYPTO_HASH
+ help
+ CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF)
+
+ Architecture: x86_64 using:
+ - PCLMULQDQ (carry-less multiplication)
+
+endmenu
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 8c2e9eadee8a..3b1d701a4f6c 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -1,130 +1,107 @@
# SPDX-License-Identifier: GPL-2.0
#
-# Arch-specific CryptoAPI modules.
-#
-
-OBJECT_FILES_NON_STANDARD := y
-
-avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
-avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
- $(comma)4)$(comma)%ymm2,yes,no)
-avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
-sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
-sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
-adx_supported := $(call as-instr,adox %r10$(comma)%r10,yes,no)
-
-obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
+# x86 crypto algorithms
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
+twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
+obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
+twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
+obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
+twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
+obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
+twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o
+
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
+serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
+obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
+serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
+obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
+serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
+obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
+serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
obj-$(CONFIG_CRYPTO_DES3_EDE_X86_64) += des3_ede-x86_64.o
+des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o
+
obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
+camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
+camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o camellia_aesni_avx_glue.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
+camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
+
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
-obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
-obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
+blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
+
+obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
+cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
+
+obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
+cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
+
+obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o
+aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
+
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
-obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
+chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha_glue.o
+chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o
+
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
-obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
+aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
+aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
-obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
-obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
+sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o
+sha1-ssse3-$(CONFIG_AS_SHA1_NI) += sha1_ni_asm.o
+
obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
+sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
+sha256-ssse3-$(CONFIG_AS_SHA256_NI) += sha256_ni_asm.o
+
obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
-obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
-obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
+sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
-obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o
+obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o
+libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
-obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
-obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
+obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
+ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
-# These modules require the assembler to support ADX.
-ifeq ($(adx_supported),yes)
- obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
-endif
-
-# These modules require assembler to support AVX.
-ifeq ($(avx_supported),yes)
- obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
- camellia-aesni-avx-x86_64.o
- obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
- obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
- obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
- obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
- obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
-endif
-
-# These modules require assembler to support AVX2.
-ifeq ($(avx2_supported),yes)
- obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
- obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
-endif
+obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o
+polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o
-twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
-serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
+obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
+crc32c-intel-y := crc32c-intel_glue.o
+crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
-des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o
-camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
-blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
-twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
-twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o
-serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
+obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
+crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
-aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
+obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
+crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
-nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
-blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
+obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
-ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
targets += poly1305-x86_64-cryptogams.S
-endif
-
-ifeq ($(avx_supported),yes)
- camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
- camellia_aesni_avx_glue.o
- cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
- cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
- twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o \
- twofish_avx_glue.o
- serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o \
- serpent_avx_glue.o
-endif
-
-ifeq ($(avx2_supported),yes)
- camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
- chacha-x86_64-y += chacha-avx2-x86_64.o
- serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
-
- nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o
-endif
-
-ifeq ($(avx512_supported),yes)
- chacha-x86_64-y += chacha-avx512vl-x86_64.o
-endif
-aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
-ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
-sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
-ifeq ($(avx2_supported),yes)
-sha1-ssse3-y += sha1_avx2_x86_64_asm.o
-endif
-ifeq ($(sha1_ni_supported),yes)
-sha1-ssse3-y += sha1_ni_asm.o
-endif
-crc32c-intel-y := crc32c-intel_glue.o
-crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
-crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
-sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
-ifeq ($(sha256_ni_supported),yes)
-sha256-ssse3-y += sha256_ni_asm.o
-endif
-sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
-crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
+nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
+nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o
+
+obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
+
+obj-$(CONFIG_CRYPTO_SM3_AVX_X86_64) += sm3-avx-x86_64.o
+sm3-avx-x86_64-y := sm3-avx-asm_64.o sm3_avx_glue.o
+
+obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX_X86_64) += sm4-aesni-avx-x86_64.o
+sm4-aesni-avx-x86_64-y := sm4-aesni-avx-asm_64.o sm4_aesni_avx_glue.o
+
+obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX2_X86_64) += sm4-aesni-avx2-x86_64.o
+sm4-aesni-avx2-x86_64-y := sm4-aesni-avx2-asm_64.o sm4_aesni_avx2_glue.o
+
+obj-$(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) += aria-aesni-avx-x86_64.o
+aria-aesni-avx-x86_64-y := aria-aesni-avx-asm_64.o aria_aesni_avx_glue.o
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $< > $@
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 51d46d93efbc..b48ddebb4748 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -122,7 +122,7 @@ SYM_FUNC_START_LOCAL(__load_partial)
pxor T0, MSG
.Lld_partial_8:
- ret
+ RET
SYM_FUNC_END(__load_partial)
/*
@@ -180,7 +180,7 @@ SYM_FUNC_START_LOCAL(__store_partial)
mov %r10b, (%r9)
.Lst_partial_1:
- ret
+ RET
SYM_FUNC_END(__store_partial)
/*
@@ -225,7 +225,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_init)
movdqu STATE4, 0x40(STATEP)
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_init)
/*
@@ -337,7 +337,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out_1:
movdqu STATE4, 0x00(STATEP)
@@ -346,7 +346,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out_2:
movdqu STATE3, 0x00(STATEP)
@@ -355,7 +355,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out_3:
movdqu STATE2, 0x00(STATEP)
@@ -364,7 +364,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out_4:
movdqu STATE1, 0x00(STATEP)
@@ -373,11 +373,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out:
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_ad)
.macro encrypt_block a s0 s1 s2 s3 s4 i
@@ -452,7 +452,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out_1:
movdqu STATE3, 0x00(STATEP)
@@ -461,7 +461,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out_2:
movdqu STATE2, 0x00(STATEP)
@@ -470,7 +470,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out_3:
movdqu STATE1, 0x00(STATEP)
@@ -479,7 +479,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out_4:
movdqu STATE0, 0x00(STATEP)
@@ -488,11 +488,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out:
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_enc)
/*
@@ -532,7 +532,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc_tail)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
.macro decrypt_block a s0 s1 s2 s3 s4 i
@@ -606,7 +606,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out_1:
movdqu STATE3, 0x00(STATEP)
@@ -615,7 +615,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out_2:
movdqu STATE2, 0x00(STATEP)
@@ -624,7 +624,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out_3:
movdqu STATE1, 0x00(STATEP)
@@ -633,7 +633,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out_4:
movdqu STATE0, 0x00(STATEP)
@@ -642,11 +642,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out:
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_dec)
/*
@@ -696,7 +696,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec_tail)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_dec_tail)
/*
@@ -743,5 +743,5 @@ SYM_FUNC_START(crypto_aegis128_aesni_final)
movdqu MSG, (%rsi)
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_final)
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
index ec437db1fa54..2402b9418cd7 100644
--- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -1,72 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
/*
- * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
- *
- * This is AES128/192/256 CTR mode optimization implementation. It requires
- * the support of Intel(R) AESNI and AVX instructions.
- *
- * This work was inspired by the AES CTR mode optimization published
- * in Intel Optimized IPSEC Cryptograhpic library.
- * Additional information on it can be found at:
- * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
+ * AES CTR mode by8 optimization with AVX instructions. (x86_64)
*
* Copyright(c) 2014 Intel Corporation.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
* Contact Information:
* James Guilford <james.guilford@intel.com>
* Sean Gulley <sean.m.gulley@intel.com>
* Chandramouli Narayanan <mouli@linux.intel.com>
+ */
+/*
+ * This is AES128/192/256 CTR mode optimization implementation. It requires
+ * the support of Intel(R) AESNI and AVX instructions.
*
- * BSD LICENSE
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
+ * This work was inspired by the AES CTR mode optimization published
+ * in Intel Optimized IPSEC Cryptographic library.
+ * Additional information on it can be found at:
+ * https://github.com/intel/intel-ipsec-mb
*/
#include <linux/linkage.h>
-#include <asm/inst.h>
#define VMOVDQ vmovdqu
+/*
+ * Note: the "x" prefix in these aliases means "this is an xmm register". The
+ * alias prefixes have no relation to XCTR where the "X" prefix means "XOR
+ * counter".
+ */
#define xdata0 %xmm0
#define xdata1 %xmm1
#define xdata2 %xmm2
@@ -75,8 +36,10 @@
#define xdata5 %xmm5
#define xdata6 %xmm6
#define xdata7 %xmm7
-#define xcounter %xmm8
-#define xbyteswap %xmm9
+#define xcounter %xmm8 // CTR mode only
+#define xiv %xmm8 // XCTR mode only
+#define xbyteswap %xmm9 // CTR mode only
+#define xtmp %xmm9 // XCTR mode only
#define xkey0 %xmm10
#define xkey4 %xmm11
#define xkey8 %xmm12
@@ -89,7 +52,7 @@
#define p_keys %rdx
#define p_out %rcx
#define num_bytes %r8
-
+#define counter %r9 // XCTR mode only
#define tmp %r10
#define DDQ_DATA 0
#define XDATA 1
@@ -127,10 +90,6 @@ ddq_add_8:
/* generate a unique variable for ddq_add_x */
-.macro setddq n
- var_ddq_add = ddq_add_\n
-.endm
-
/* generate a unique variable for xmm register */
.macro setxdata n
var_xdata = %xmm\n
@@ -140,9 +99,7 @@ ddq_add_8:
.macro club name, id
.altmacro
- .if \name == DDQ_DATA
- setddq %\id
- .elseif \name == XDATA
+ .if \name == XDATA
setxdata %\id
.endif
.noaltmacro
@@ -152,7 +109,7 @@ ddq_add_8:
* do_aes num_in_par load_keys key_len
* This increments p_in, but not p_out
*/
-.macro do_aes b, k, key_len
+.macro do_aes b, k, key_len, xctr
.set by, \b
.set load_keys, \k
.set klen, \key_len
@@ -161,31 +118,48 @@ ddq_add_8:
vmovdqa 0*16(p_keys), xkey0
.endif
- vpshufb xbyteswap, xcounter, xdata0
-
- .set i, 1
- .rept (by - 1)
- club DDQ_DATA, i
- club XDATA, i
- vpaddq var_ddq_add(%rip), xcounter, var_xdata
- vptest ddq_low_msk(%rip), var_xdata
- jnz 1f
- vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
- vpaddq ddq_high_add_1(%rip), xcounter, xcounter
- 1:
- vpshufb xbyteswap, var_xdata, var_xdata
- .set i, (i +1)
- .endr
+ .if \xctr
+ movq counter, xtmp
+ .set i, 0
+ .rept (by)
+ club XDATA, i
+ vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
+ .set i, (i +1)
+ .endr
+ .set i, 0
+ .rept (by)
+ club XDATA, i
+ vpxor xiv, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+ .else
+ vpshufb xbyteswap, xcounter, xdata0
+ .set i, 1
+ .rept (by - 1)
+ club XDATA, i
+ vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
+ vptest ddq_low_msk(%rip), var_xdata
+ jnz 1f
+ vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
+ vpaddq ddq_high_add_1(%rip), xcounter, xcounter
+ 1:
+ vpshufb xbyteswap, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+ .endif
vmovdqa 1*16(p_keys), xkeyA
vpxor xkey0, xdata0, xdata0
- club DDQ_DATA, by
- vpaddq var_ddq_add(%rip), xcounter, xcounter
- vptest ddq_low_msk(%rip), xcounter
- jnz 1f
- vpaddq ddq_high_add_1(%rip), xcounter, xcounter
- 1:
+ .if \xctr
+ add $by, counter
+ .else
+ vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
+ vptest ddq_low_msk(%rip), xcounter
+ jnz 1f
+ vpaddq ddq_high_add_1(%rip), xcounter, xcounter
+ 1:
+ .endif
.set i, 1
.rept (by - 1)
@@ -423,94 +397,99 @@ ddq_add_8:
.endr
.endm
-.macro do_aes_load val, key_len
- do_aes \val, 1, \key_len
+.macro do_aes_load val, key_len, xctr
+ do_aes \val, 1, \key_len, \xctr
.endm
-.macro do_aes_noload val, key_len
- do_aes \val, 0, \key_len
+.macro do_aes_noload val, key_len, xctr
+ do_aes \val, 0, \key_len, \xctr
.endm
/* main body of aes ctr load */
-.macro do_aes_ctrmain key_len
+.macro do_aes_ctrmain key_len, xctr
cmp $16, num_bytes
- jb .Ldo_return2\key_len
+ jb .Ldo_return2\xctr\key_len
- vmovdqa byteswap_const(%rip), xbyteswap
- vmovdqu (p_iv), xcounter
- vpshufb xbyteswap, xcounter, xcounter
+ .if \xctr
+ shr $4, counter
+ vmovdqu (p_iv), xiv
+ .else
+ vmovdqa byteswap_const(%rip), xbyteswap
+ vmovdqu (p_iv), xcounter
+ vpshufb xbyteswap, xcounter, xcounter
+ .endif
mov num_bytes, tmp
and $(7*16), tmp
- jz .Lmult_of_8_blks\key_len
+ jz .Lmult_of_8_blks\xctr\key_len
/* 1 <= tmp <= 7 */
cmp $(4*16), tmp
- jg .Lgt4\key_len
- je .Leq4\key_len
+ jg .Lgt4\xctr\key_len
+ je .Leq4\xctr\key_len
-.Llt4\key_len:
+.Llt4\xctr\key_len:
cmp $(2*16), tmp
- jg .Leq3\key_len
- je .Leq2\key_len
+ jg .Leq3\xctr\key_len
+ je .Leq2\xctr\key_len
-.Leq1\key_len:
- do_aes_load 1, \key_len
+.Leq1\xctr\key_len:
+ do_aes_load 1, \key_len, \xctr
add $(1*16), p_out
and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
-.Leq2\key_len:
- do_aes_load 2, \key_len
+.Leq2\xctr\key_len:
+ do_aes_load 2, \key_len, \xctr
add $(2*16), p_out
and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
-.Leq3\key_len:
- do_aes_load 3, \key_len
+.Leq3\xctr\key_len:
+ do_aes_load 3, \key_len, \xctr
add $(3*16), p_out
and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
-.Leq4\key_len:
- do_aes_load 4, \key_len
+.Leq4\xctr\key_len:
+ do_aes_load 4, \key_len, \xctr
add $(4*16), p_out
and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
-.Lgt4\key_len:
+.Lgt4\xctr\key_len:
cmp $(6*16), tmp
- jg .Leq7\key_len
- je .Leq6\key_len
+ jg .Leq7\xctr\key_len
+ je .Leq6\xctr\key_len
-.Leq5\key_len:
- do_aes_load 5, \key_len
+.Leq5\xctr\key_len:
+ do_aes_load 5, \key_len, \xctr
add $(5*16), p_out
and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
-.Leq6\key_len:
- do_aes_load 6, \key_len
+.Leq6\xctr\key_len:
+ do_aes_load 6, \key_len, \xctr
add $(6*16), p_out
and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
-.Leq7\key_len:
- do_aes_load 7, \key_len
+.Leq7\xctr\key_len:
+ do_aes_load 7, \key_len, \xctr
add $(7*16), p_out
and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
-.Lmult_of_8_blks\key_len:
+.Lmult_of_8_blks\xctr\key_len:
.if (\key_len != KEY_128)
vmovdqa 0*16(p_keys), xkey0
vmovdqa 4*16(p_keys), xkey4
@@ -523,18 +502,20 @@ ddq_add_8:
vmovdqa 9*16(p_keys), xkey12
.endif
.align 16
-.Lmain_loop2\key_len:
+.Lmain_loop2\xctr\key_len:
/* num_bytes is a multiple of 8 and >0 */
- do_aes_noload 8, \key_len
+ do_aes_noload 8, \key_len, \xctr
add $(8*16), p_out
sub $(8*16), num_bytes
- jne .Lmain_loop2\key_len
+ jne .Lmain_loop2\xctr\key_len
-.Ldo_return2\key_len:
- /* return updated IV */
- vpshufb xbyteswap, xcounter, xcounter
- vmovdqu xcounter, (p_iv)
- ret
+.Ldo_return2\xctr\key_len:
+ .if !\xctr
+ /* return updated IV */
+ vpshufb xbyteswap, xcounter, xcounter
+ vmovdqu xcounter, (p_iv)
+ .endif
+ RET
.endm
/*
@@ -546,7 +527,7 @@ ddq_add_8:
*/
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
/* call the aes main loop */
- do_aes_ctrmain KEY_128
+ do_aes_ctrmain KEY_128 0
SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
@@ -559,7 +540,7 @@ SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
*/
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
/* call the aes main loop */
- do_aes_ctrmain KEY_192
+ do_aes_ctrmain KEY_192 0
SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
@@ -572,6 +553,45 @@ SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
*/
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
/* call the aes main loop */
- do_aes_ctrmain KEY_256
+ do_aes_ctrmain KEY_256 0
SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
+
+/*
+ * routine to do AES128 XCTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
+ * u8* out, unsigned int num_bytes, unsigned int byte_ctr)
+ */
+SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_128 1
+
+SYM_FUNC_END(aes_xctr_enc_128_avx_by8)
+
+/*
+ * routine to do AES192 XCTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
+ * u8* out, unsigned int num_bytes, unsigned int byte_ctr)
+ */
+SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_192 1
+
+SYM_FUNC_END(aes_xctr_enc_192_avx_by8)
+
+/*
+ * routine to do AES256 XCTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
+ * u8* out, unsigned int num_bytes, unsigned int byte_ctr)
+ */
+SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_256 1
+
+SYM_FUNC_END(aes_xctr_enc_256_avx_by8)
diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c
deleted file mode 100644
index 7b7dc05fa1a4..000000000000
--- a/arch/x86/crypto/aes_glue.c
+++ /dev/null
@@ -1 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index cad6e1bfa7d5..837c1e0aa021 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -26,7 +26,6 @@
*/
#include <linux/linkage.h>
-#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>
@@ -44,10 +43,6 @@
#ifdef __x86_64__
# constants in mergeable sections, linker can reorder and merge
-.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
-.align 16
-.Lgf128mul_x_ble_mask:
- .octa 0x00000000000000010000000000000087
.section .rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY: .octa 0xC2000000000000000000000000000001
@@ -147,7 +142,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
#define CTR %xmm11
#define INC %xmm12
-#define GF128MUL_MASK %xmm10
+#define GF128MUL_MASK %xmm7
#ifdef __x86_64__
#define AREG %rax
@@ -201,7 +196,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
mov \SUBKEY, %r12
movdqu (%r12), \TMP3
movdqa SHUF_MASK(%rip), \TMP2
- PSHUFB_XMM \TMP2, \TMP3
+ pshufb \TMP2, \TMP3
# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
@@ -263,10 +258,10 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
movdqa SHUF_MASK(%rip), %xmm2
- PSHUFB_XMM %xmm2, %xmm0
+ pshufb %xmm2, %xmm0
movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
- PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
movdqu HashKey(%arg2), %xmm13
CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
@@ -319,7 +314,7 @@ _initial_blocks_\@:
# Main loop - Encrypt/Decrypt remaining blocks
- cmp $0, %r13
+ test %r13, %r13
je _zero_cipher_left_\@
sub $64, %r13
je _four_cipher_left_\@
@@ -347,7 +342,7 @@ _zero_cipher_left_\@:
paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
movdqu %xmm0, CurCount(%arg2)
movdqa SHUF_MASK(%rip), %xmm10
- PSHUFB_XMM %xmm10, %xmm0
+ pshufb %xmm10, %xmm0
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
movdqu %xmm0, PBlockEncKey(%arg2)
@@ -377,7 +372,7 @@ _large_enough_update_\@:
# get the appropriate shuffle mask
movdqu (%r12), %xmm2
# shift right 16-r13 bytes
- PSHUFB_XMM %xmm2, %xmm1
+ pshufb %xmm2, %xmm1
_data_read_\@:
lea ALL_F+16(%rip), %r12
@@ -393,12 +388,12 @@ _data_read_\@:
.ifc \operation, dec
pand %xmm1, %xmm2
movdqa SHUF_MASK(%rip), %xmm10
- PSHUFB_XMM %xmm10 ,%xmm2
+ pshufb %xmm10 ,%xmm2
pxor %xmm2, %xmm8
.else
movdqa SHUF_MASK(%rip), %xmm10
- PSHUFB_XMM %xmm10,%xmm0
+ pshufb %xmm10,%xmm0
pxor %xmm0, %xmm8
.endif
@@ -408,17 +403,17 @@ _data_read_\@:
# GHASH computation for the last <16 byte block
movdqa SHUF_MASK(%rip), %xmm10
# shuffle xmm0 back to output as ciphertext
- PSHUFB_XMM %xmm10, %xmm0
+ pshufb %xmm10, %xmm0
.endif
# Output %r13 bytes
- MOVQ_R64_XMM %xmm0, %rax
+ movq %xmm0, %rax
cmp $8, %r13
jle _less_than_8_bytes_left_\@
mov %rax, (%arg3 , %r11, 1)
add $8, %r11
psrldq $8, %xmm0
- MOVQ_R64_XMM %xmm0, %rax
+ movq %xmm0, %rax
sub $8, %r13
_less_than_8_bytes_left_\@:
mov %al, (%arg3, %r11, 1)
@@ -438,7 +433,7 @@ _multiple_of_16_bytes_\@:
mov PBlockLen(%arg2), %r12
- cmp $0, %r12
+ test %r12, %r12
je _partial_done\@
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
@@ -449,7 +444,7 @@ _partial_done\@:
movd %r12d, %xmm15 # len(A) in %xmm15
mov InLen(%arg2), %r12
shl $3, %r12 # len(C) in bits (*128)
- MOVQ_R64_XMM %r12, %xmm1
+ movq %r12, %xmm1
pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
@@ -457,7 +452,7 @@ _partial_done\@:
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
# final GHASH computation
movdqa SHUF_MASK(%rip), %xmm10
- PSHUFB_XMM %xmm10, %xmm8
+ pshufb %xmm10, %xmm8
movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
@@ -470,12 +465,12 @@ _return_T_\@:
cmp $8, %r11
jl _T_4_\@
_T_8_\@:
- MOVQ_R64_XMM %xmm0, %rax
+ movq %xmm0, %rax
mov %rax, (%r10)
add $8, %r10
sub $8, %r11
psrldq $8, %xmm0
- cmp $0, %r11
+ test %r11, %r11
je _return_T_done_\@
_T_4_\@:
movd %xmm0, %eax
@@ -483,7 +478,7 @@ _T_4_\@:
add $4, %r10
sub $4, %r11
psrldq $4, %xmm0
- cmp $0, %r11
+ test %r11, %r11
je _return_T_done_\@
_T_123_\@:
movd %xmm0, %eax
@@ -518,9 +513,9 @@ _return_T_done_\@:
pshufd $78, \HK, \TMP3
pxor \GH, \TMP2 # TMP2 = a1+a0
pxor \HK, \TMP3 # TMP3 = b1+b0
- PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
- PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
- PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
+ pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1
+ pclmulqdq $0x00, \HK, \GH # GH = a0*b0
+ pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
pxor \GH, \TMP2
pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
movdqa \TMP2, \TMP3
@@ -570,7 +565,7 @@ _return_T_done_\@:
cmp $8, \DLEN
jl _read_lt8_\@
mov (\DPTR), %rax
- MOVQ_R64_XMM %rax, \XMMDst
+ movq %rax, \XMMDst
sub $8, \DLEN
jz _done_read_partial_block_\@
xor %eax, %eax
@@ -579,7 +574,7 @@ _read_next_byte_\@:
mov 7(\DPTR, \DLEN, 1), %al
dec \DLEN
jnz _read_next_byte_\@
- MOVQ_R64_XMM %rax, \XMM1
+ movq %rax, \XMM1
pslldq $8, \XMM1
por \XMM1, \XMMDst
jmp _done_read_partial_block_\@
@@ -590,7 +585,7 @@ _read_next_byte_lt8_\@:
mov -1(\DPTR, \DLEN, 1), %al
dec \DLEN
jnz _read_next_byte_lt8_\@
- MOVQ_R64_XMM %rax, \XMMDst
+ movq %rax, \XMMDst
_done_read_partial_block_\@:
.endm
@@ -608,7 +603,7 @@ _done_read_partial_block_\@:
jl _get_AAD_rest\@
_get_AAD_blocks\@:
movdqu (%r10), \TMP7
- PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
+ pshufb %xmm14, \TMP7 # byte-reflect the AAD data
pxor \TMP7, \TMP6
GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
add $16, %r10
@@ -620,11 +615,11 @@ _get_AAD_blocks\@:
/* read the last <16B of AAD */
_get_AAD_rest\@:
- cmp $0, %r11
+ test %r11, %r11
je _get_AAD_done\@
READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
- PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
+ pshufb %xmm14, \TMP7 # byte-reflect the AAD data
pxor \TMP6, \TMP7
GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
movdqu \TMP7, \TMP6
@@ -641,7 +636,7 @@ _get_AAD_done\@:
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
AAD_HASH operation
mov PBlockLen(%arg2), %r13
- cmp $0, %r13
+ test %r13, %r13
je _partial_block_done_\@ # Leave Macro if no partial blocks
# Read in input data without over reading
cmp $16, \PLAIN_CYPH_LEN
@@ -667,7 +662,7 @@ _data_read_\@: # Finished reading in data
# r16-r13 is the number of bytes in plaintext mod 16)
add %r13, %r12
movdqu (%r12), %xmm2 # get the appropriate shuffle mask
- PSHUFB_XMM %xmm2, %xmm9 # shift right r13 bytes
+ pshufb %xmm2, %xmm9 # shift right r13 bytes
.ifc \operation, dec
movdqa %xmm1, %xmm3
@@ -689,11 +684,11 @@ _no_extra_mask_1_\@:
pand %xmm1, %xmm3
movdqa SHUF_MASK(%rip), %xmm10
- PSHUFB_XMM %xmm10, %xmm3
- PSHUFB_XMM %xmm2, %xmm3
+ pshufb %xmm10, %xmm3
+ pshufb %xmm2, %xmm3
pxor %xmm3, \AAD_HASH
- cmp $0, %r10
+ test %r10, %r10
jl _partial_incomplete_1_\@
# GHASH computation for the last <16 Byte block
@@ -724,11 +719,11 @@ _no_extra_mask_2_\@:
pand %xmm1, %xmm9
movdqa SHUF_MASK(%rip), %xmm1
- PSHUFB_XMM %xmm1, %xmm9
- PSHUFB_XMM %xmm2, %xmm9
+ pshufb %xmm1, %xmm9
+ pshufb %xmm2, %xmm9
pxor %xmm9, \AAD_HASH
- cmp $0, %r10
+ test %r10, %r10
jl _partial_incomplete_2_\@
# GHASH computation for the last <16 Byte block
@@ -744,11 +739,11 @@ _encode_done_\@:
movdqa SHUF_MASK(%rip), %xmm10
# shuffle xmm9 back to output as ciphertext
- PSHUFB_XMM %xmm10, %xmm9
- PSHUFB_XMM %xmm2, %xmm9
+ pshufb %xmm10, %xmm9
+ pshufb %xmm2, %xmm9
.endif
# output encrypted Bytes
- cmp $0, %r10
+ test %r10, %r10
jl _partial_fill_\@
mov %r13, %r12
mov $16, %r13
@@ -759,14 +754,14 @@ _partial_fill_\@:
mov \PLAIN_CYPH_LEN, %r13
_count_set_\@:
movdqa %xmm9, %xmm0
- MOVQ_R64_XMM %xmm0, %rax
+ movq %xmm0, %rax
cmp $8, %r13
jle _less_than_8_bytes_left_\@
mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
add $8, \DATA_OFFSET
psrldq $8, %xmm0
- MOVQ_R64_XMM %xmm0, %rax
+ movq %xmm0, %rax
sub $8, %r13
_less_than_8_bytes_left_\@:
movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
@@ -810,7 +805,7 @@ _partial_block_done_\@:
.else
MOVADQ \XMM0, %xmm\index
.endif
- PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
+ pshufb %xmm14, %xmm\index # perform a 16 byte swap
pxor \TMP2, %xmm\index
.endr
lea 0x10(%arg1),%r10
@@ -821,7 +816,7 @@ _partial_block_done_\@:
aes_loop_initial_\@:
MOVADQ (%r10),\TMP1
.irpc index, \i_seq
- AESENC \TMP1, %xmm\index
+ aesenc \TMP1, %xmm\index
.endr
add $16,%r10
sub $1,%eax
@@ -829,7 +824,7 @@ aes_loop_initial_\@:
MOVADQ (%r10), \TMP1
.irpc index, \i_seq
- AESENCLAST \TMP1, %xmm\index # Last Round
+ aesenclast \TMP1, %xmm\index # Last Round
.endr
.irpc index, \i_seq
movdqu (%arg4 , %r11, 1), \TMP1
@@ -841,7 +836,7 @@ aes_loop_initial_\@:
.ifc \operation, dec
movdqa \TMP1, %xmm\index
.endif
- PSHUFB_XMM %xmm14, %xmm\index
+ pshufb %xmm14, %xmm\index
# prepare plaintext/ciphertext for GHASH computation
.endr
@@ -876,19 +871,19 @@ aes_loop_initial_\@:
MOVADQ ONE(%RIP),\TMP1
paddd \TMP1, \XMM0 # INCR Y0
MOVADQ \XMM0, \XMM1
- PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+ pshufb %xmm14, \XMM1 # perform a 16 byte swap
paddd \TMP1, \XMM0 # INCR Y0
MOVADQ \XMM0, \XMM2
- PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+ pshufb %xmm14, \XMM2 # perform a 16 byte swap
paddd \TMP1, \XMM0 # INCR Y0
MOVADQ \XMM0, \XMM3
- PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+ pshufb %xmm14, \XMM3 # perform a 16 byte swap
paddd \TMP1, \XMM0 # INCR Y0
MOVADQ \XMM0, \XMM4
- PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+ pshufb %xmm14, \XMM4 # perform a 16 byte swap
MOVADQ 0(%arg1),\TMP1
pxor \TMP1, \XMM1
@@ -897,17 +892,17 @@ aes_loop_initial_\@:
pxor \TMP1, \XMM4
.irpc index, 1234 # do 4 rounds
movaps 0x10*\index(%arg1), \TMP1
- AESENC \TMP1, \XMM1
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
+ aesenc \TMP1, \XMM1
+ aesenc \TMP1, \XMM2
+ aesenc \TMP1, \XMM3
+ aesenc \TMP1, \XMM4
.endr
.irpc index, 56789 # do next 5 rounds
movaps 0x10*\index(%arg1), \TMP1
- AESENC \TMP1, \XMM1
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
+ aesenc \TMP1, \XMM1
+ aesenc \TMP1, \XMM2
+ aesenc \TMP1, \XMM3
+ aesenc \TMP1, \XMM4
.endr
lea 0xa0(%arg1),%r10
mov keysize,%eax
@@ -918,7 +913,7 @@ aes_loop_initial_\@:
aes_loop_pre_\@:
MOVADQ (%r10),\TMP2
.irpc index, 1234
- AESENC \TMP2, %xmm\index
+ aesenc \TMP2, %xmm\index
.endr
add $16,%r10
sub $1,%eax
@@ -926,10 +921,10 @@ aes_loop_pre_\@:
aes_loop_pre_done\@:
MOVADQ (%r10), \TMP2
- AESENCLAST \TMP2, \XMM1
- AESENCLAST \TMP2, \XMM2
- AESENCLAST \TMP2, \XMM3
- AESENCLAST \TMP2, \XMM4
+ aesenclast \TMP2, \XMM1
+ aesenclast \TMP2, \XMM2
+ aesenclast \TMP2, \XMM3
+ aesenclast \TMP2, \XMM4
movdqu 16*0(%arg4 , %r11 , 1), \TMP1
pxor \TMP1, \XMM1
.ifc \operation, dec
@@ -961,12 +956,12 @@ aes_loop_pre_done\@:
.endif
add $64, %r11
- PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+ pshufb %xmm14, \XMM1 # perform a 16 byte swap
pxor \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
- PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
- PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
- PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+ pshufb %xmm14, \XMM2 # perform a 16 byte swap
+ pshufb %xmm14, \XMM3 # perform a 16 byte swap
+ pshufb %xmm14, \XMM4 # perform a 16 byte swap
_initial_blocks_done\@:
@@ -978,7 +973,7 @@ _initial_blocks_done\@:
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
-.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
+.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM1, \XMM5
@@ -994,7 +989,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
pxor \XMM5, \TMP6
paddd ONE(%rip), \XMM0 # INCR CNT
movdqu HashKey_4(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
+ pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
movdqa \XMM0, \XMM1
paddd ONE(%rip), \XMM0 # INCR CNT
movdqa \XMM0, \XMM2
@@ -1002,51 +997,51 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM0, \XMM3
paddd ONE(%rip), \XMM0 # INCR CNT
movdqa \XMM0, \XMM4
- PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
- PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
- PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+ pshufb %xmm15, \XMM1 # perform a 16 byte swap
+ pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
+ pshufb %xmm15, \XMM2 # perform a 16 byte swap
+ pshufb %xmm15, \XMM3 # perform a 16 byte swap
+ pshufb %xmm15, \XMM4 # perform a 16 byte swap
pxor (%arg1), \XMM1
pxor (%arg1), \XMM2
pxor (%arg1), \XMM3
pxor (%arg1), \XMM4
movdqu HashKey_4_k(%arg2), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
movaps 0x10(%arg1), \TMP1
- AESENC \TMP1, \XMM1 # Round 1
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
+ aesenc \TMP1, \XMM1 # Round 1
+ aesenc \TMP1, \XMM2
+ aesenc \TMP1, \XMM3
+ aesenc \TMP1, \XMM4
movaps 0x20(%arg1), \TMP1
- AESENC \TMP1, \XMM1 # Round 2
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
+ aesenc \TMP1, \XMM1 # Round 2
+ aesenc \TMP1, \XMM2
+ aesenc \TMP1, \XMM3
+ aesenc \TMP1, \XMM4
movdqa \XMM6, \TMP1
pshufd $78, \XMM6, \TMP2
pxor \XMM6, \TMP2
movdqu HashKey_3(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
movaps 0x30(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 3
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
+ aesenc \TMP3, \XMM1 # Round 3
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
+ pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
movaps 0x40(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 4
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
+ aesenc \TMP3, \XMM1 # Round 4
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
movdqu HashKey_3_k(%arg2), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x50(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 5
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
+ aesenc \TMP3, \XMM1 # Round 5
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
pxor \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
pxor \XMM6, \XMM5
@@ -1058,25 +1053,25 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# Multiply TMP5 * HashKey using karatsuba
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
movaps 0x60(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 6
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
+ aesenc \TMP3, \XMM1 # Round 6
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
+ pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
movaps 0x70(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 7
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
+ aesenc \TMP3, \XMM1 # Round 7
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
movdqu HashKey_2_k(%arg2), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x80(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 8
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
+ aesenc \TMP3, \XMM1 # Round 8
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
pxor \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
pxor \XMM7, \XMM5
@@ -1089,13 +1084,13 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
pshufd $78, \XMM8, \TMP2
pxor \XMM8, \TMP2
movdqu HashKey(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
movaps 0x90(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 9
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
+ aesenc \TMP3, \XMM1 # Round 9
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
+ pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
lea 0xa0(%arg1),%r10
mov keysize,%eax
shr $2,%eax # 128->4, 192->6, 256->8
@@ -1105,7 +1100,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
aes_loop_par_enc\@:
MOVADQ (%r10),\TMP3
.irpc index, 1234
- AESENC \TMP3, %xmm\index
+ aesenc \TMP3, %xmm\index
.endr
add $16,%r10
sub $1,%eax
@@ -1113,12 +1108,12 @@ aes_loop_par_enc\@:
aes_loop_par_enc_done\@:
MOVADQ (%r10), \TMP3
- AESENCLAST \TMP3, \XMM1 # Round 10
- AESENCLAST \TMP3, \XMM2
- AESENCLAST \TMP3, \XMM3
- AESENCLAST \TMP3, \XMM4
+ aesenclast \TMP3, \XMM1 # Round 10
+ aesenclast \TMP3, \XMM2
+ aesenclast \TMP3, \XMM3
+ aesenclast \TMP3, \XMM4
movdqu HashKey_k(%arg2), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movdqu (%arg4,%r11,1), \TMP3
pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
movdqu 16(%arg4,%r11,1), \TMP3
@@ -1131,10 +1126,10 @@ aes_loop_par_enc_done\@:
movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
- PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+ pshufb %xmm15, \XMM1 # perform a 16 byte swap
+ pshufb %xmm15, \XMM2 # perform a 16 byte swap
+ pshufb %xmm15, \XMM3 # perform a 16 byte swap
+ pshufb %xmm15, \XMM4 # perform a 16 byte swap
pxor \TMP4, \TMP1
pxor \XMM8, \XMM5
@@ -1186,7 +1181,7 @@ aes_loop_par_enc_done\@:
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
-.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
+.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM1, \XMM5
@@ -1202,7 +1197,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
pxor \XMM5, \TMP6
paddd ONE(%rip), \XMM0 # INCR CNT
movdqu HashKey_4(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
+ pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
movdqa \XMM0, \XMM1
paddd ONE(%rip), \XMM0 # INCR CNT
movdqa \XMM0, \XMM2
@@ -1210,51 +1205,51 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM0, \XMM3
paddd ONE(%rip), \XMM0 # INCR CNT
movdqa \XMM0, \XMM4
- PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
- PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
- PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+ pshufb %xmm15, \XMM1 # perform a 16 byte swap
+ pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
+ pshufb %xmm15, \XMM2 # perform a 16 byte swap
+ pshufb %xmm15, \XMM3 # perform a 16 byte swap
+ pshufb %xmm15, \XMM4 # perform a 16 byte swap
pxor (%arg1), \XMM1
pxor (%arg1), \XMM2
pxor (%arg1), \XMM3
pxor (%arg1), \XMM4
movdqu HashKey_4_k(%arg2), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
movaps 0x10(%arg1), \TMP1
- AESENC \TMP1, \XMM1 # Round 1
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
+ aesenc \TMP1, \XMM1 # Round 1
+ aesenc \TMP1, \XMM2
+ aesenc \TMP1, \XMM3
+ aesenc \TMP1, \XMM4
movaps 0x20(%arg1), \TMP1
- AESENC \TMP1, \XMM1 # Round 2
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
+ aesenc \TMP1, \XMM1 # Round 2
+ aesenc \TMP1, \XMM2
+ aesenc \TMP1, \XMM3
+ aesenc \TMP1, \XMM4
movdqa \XMM6, \TMP1
pshufd $78, \XMM6, \TMP2
pxor \XMM6, \TMP2
movdqu HashKey_3(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
movaps 0x30(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 3
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
+ aesenc \TMP3, \XMM1 # Round 3
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
+ pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
movaps 0x40(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 4
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
+ aesenc \TMP3, \XMM1 # Round 4
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
movdqu HashKey_3_k(%arg2), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x50(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 5
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
+ aesenc \TMP3, \XMM1 # Round 5
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
pxor \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
pxor \XMM6, \XMM5
@@ -1266,25 +1261,25 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# Multiply TMP5 * HashKey using karatsuba
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
movaps 0x60(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 6
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
+ aesenc \TMP3, \XMM1 # Round 6
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
+ pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
movaps 0x70(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 7
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
+ aesenc \TMP3, \XMM1 # Round 7
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
movdqu HashKey_2_k(%arg2), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x80(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 8
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
+ aesenc \TMP3, \XMM1 # Round 8
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
pxor \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
pxor \XMM7, \XMM5
@@ -1297,13 +1292,13 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
pshufd $78, \XMM8, \TMP2
pxor \XMM8, \TMP2
movdqu HashKey(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
movaps 0x90(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 9
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
+ aesenc \TMP3, \XMM1 # Round 9
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
+ pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
lea 0xa0(%arg1),%r10
mov keysize,%eax
shr $2,%eax # 128->4, 192->6, 256->8
@@ -1313,7 +1308,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
aes_loop_par_dec\@:
MOVADQ (%r10),\TMP3
.irpc index, 1234
- AESENC \TMP3, %xmm\index
+ aesenc \TMP3, %xmm\index
.endr
add $16,%r10
sub $1,%eax
@@ -1321,12 +1316,12 @@ aes_loop_par_dec\@:
aes_loop_par_dec_done\@:
MOVADQ (%r10), \TMP3
- AESENCLAST \TMP3, \XMM1 # last round
- AESENCLAST \TMP3, \XMM2
- AESENCLAST \TMP3, \XMM3
- AESENCLAST \TMP3, \XMM4
+ aesenclast \TMP3, \XMM1 # last round
+ aesenclast \TMP3, \XMM2
+ aesenclast \TMP3, \XMM3
+ aesenclast \TMP3, \XMM4
movdqu HashKey_k(%arg2), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movdqu (%arg4,%r11,1), \TMP3
pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
@@ -1343,10 +1338,10 @@ aes_loop_par_dec_done\@:
pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
movdqa \TMP3, \XMM4
- PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+ pshufb %xmm15, \XMM1 # perform a 16 byte swap
+ pshufb %xmm15, \XMM2 # perform a 16 byte swap
+ pshufb %xmm15, \XMM3 # perform a 16 byte swap
+ pshufb %xmm15, \XMM4 # perform a 16 byte swap
pxor \TMP4, \TMP1
pxor \XMM8, \XMM5
@@ -1402,10 +1397,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
pshufd $78, \XMM1, \TMP2
pxor \XMM1, \TMP2
movdqu HashKey_4(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
- PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
+ pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1
+ pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0
movdqu HashKey_4_k(%arg2), \TMP4
- PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movdqa \XMM1, \XMMDst
movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
@@ -1415,10 +1410,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
pshufd $78, \XMM2, \TMP2
pxor \XMM2, \TMP2
movdqu HashKey_3(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0
movdqu HashKey_3_k(%arg2), \TMP4
- PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
pxor \TMP1, \TMP6
pxor \XMM2, \XMMDst
pxor \TMP2, \XMM1
@@ -1430,10 +1425,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
pshufd $78, \XMM3, \TMP2
pxor \XMM3, \TMP2
movdqu HashKey_2(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0
movdqu HashKey_2_k(%arg2), \TMP4
- PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
pxor \TMP1, \TMP6
pxor \XMM3, \XMMDst
pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
@@ -1443,10 +1438,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
pshufd $78, \XMM4, \TMP2
pxor \XMM4, \TMP2
movdqu HashKey(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0
movdqu HashKey_k(%arg2), \TMP4
- PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
pxor \TMP1, \TMP6
pxor \XMM4, \XMMDst
pxor \XMM1, \TMP2
@@ -1504,13 +1499,13 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
_esb_loop_\@:
MOVADQ (%r10),\TMP1
- AESENC \TMP1,\XMM0
+ aesenc \TMP1,\XMM0
add $16,%r10
sub $1,%eax
jnz _esb_loop_\@
MOVADQ (%r10),\TMP1
- AESENCLAST \TMP1,\XMM0
+ aesenclast \TMP1,\XMM0
.endm
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
@@ -1599,7 +1594,7 @@ SYM_FUNC_START(aesni_gcm_dec)
GCM_ENC_DEC dec
GCM_COMPLETE arg10, arg11
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_dec)
@@ -1688,7 +1683,7 @@ SYM_FUNC_START(aesni_gcm_enc)
GCM_COMPLETE arg10, arg11
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_enc)
/*****************************************************************************
@@ -1706,7 +1701,7 @@ SYM_FUNC_START(aesni_gcm_init)
FUNC_SAVE
GCM_INIT %arg3, %arg4,%arg5, %arg6
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_init)
/*****************************************************************************
@@ -1721,7 +1716,7 @@ SYM_FUNC_START(aesni_gcm_enc_update)
FUNC_SAVE
GCM_ENC_DEC enc
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_enc_update)
/*****************************************************************************
@@ -1736,7 +1731,7 @@ SYM_FUNC_START(aesni_gcm_dec_update)
FUNC_SAVE
GCM_ENC_DEC dec
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_dec_update)
/*****************************************************************************
@@ -1751,13 +1746,11 @@ SYM_FUNC_START(aesni_gcm_finalize)
FUNC_SAVE
GCM_COMPLETE %arg3 %arg4
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_finalize)
#endif
-
-SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)
SYM_FUNC_START_LOCAL(_key_expansion_256a)
pshufd $0b11111111, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
@@ -1767,9 +1760,9 @@ SYM_FUNC_START_LOCAL(_key_expansion_256a)
pxor %xmm1, %xmm0
movaps %xmm0, (TKEYP)
add $0x10, TKEYP
- ret
+ RET
SYM_FUNC_END(_key_expansion_256a)
-SYM_FUNC_END_ALIAS(_key_expansion_128)
+SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
SYM_FUNC_START_LOCAL(_key_expansion_192a)
pshufd $0b01010101, %xmm1, %xmm1
@@ -1792,7 +1785,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_192a)
shufps $0b01001110, %xmm2, %xmm1
movaps %xmm1, 0x10(TKEYP)
add $0x20, TKEYP
- ret
+ RET
SYM_FUNC_END(_key_expansion_192a)
SYM_FUNC_START_LOCAL(_key_expansion_192b)
@@ -1811,7 +1804,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_192b)
movaps %xmm0, (TKEYP)
add $0x10, TKEYP
- ret
+ RET
SYM_FUNC_END(_key_expansion_192b)
SYM_FUNC_START_LOCAL(_key_expansion_256b)
@@ -1823,7 +1816,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_256b)
pxor %xmm1, %xmm2
movaps %xmm2, (TKEYP)
add $0x10, TKEYP
- ret
+ RET
SYM_FUNC_END(_key_expansion_256b)
/*
@@ -1849,72 +1842,72 @@ SYM_FUNC_START(aesni_set_key)
movups 0x10(UKEYP), %xmm2 # other user key
movaps %xmm2, (TKEYP)
add $0x10, TKEYP
- AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
+ aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
call _key_expansion_256a
- AESKEYGENASSIST 0x1 %xmm0 %xmm1
+ aeskeygenassist $0x1, %xmm0, %xmm1
call _key_expansion_256b
- AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
+ aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
call _key_expansion_256a
- AESKEYGENASSIST 0x2 %xmm0 %xmm1
+ aeskeygenassist $0x2, %xmm0, %xmm1
call _key_expansion_256b
- AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
+ aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
call _key_expansion_256a
- AESKEYGENASSIST 0x4 %xmm0 %xmm1
+ aeskeygenassist $0x4, %xmm0, %xmm1
call _key_expansion_256b
- AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
+ aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
call _key_expansion_256a
- AESKEYGENASSIST 0x8 %xmm0 %xmm1
+ aeskeygenassist $0x8, %xmm0, %xmm1
call _key_expansion_256b
- AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
+ aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
call _key_expansion_256a
- AESKEYGENASSIST 0x10 %xmm0 %xmm1
+ aeskeygenassist $0x10, %xmm0, %xmm1
call _key_expansion_256b
- AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
+ aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
call _key_expansion_256a
- AESKEYGENASSIST 0x20 %xmm0 %xmm1
+ aeskeygenassist $0x20, %xmm0, %xmm1
call _key_expansion_256b
- AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
+ aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
call _key_expansion_256a
jmp .Ldec_key
.Lenc_key192:
movq 0x10(UKEYP), %xmm2 # other user key
- AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
+ aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
call _key_expansion_192a
- AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
+ aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
call _key_expansion_192b
- AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
+ aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
call _key_expansion_192a
- AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
+ aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
call _key_expansion_192b
- AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
+ aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
call _key_expansion_192a
- AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
+ aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
call _key_expansion_192b
- AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
+ aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
call _key_expansion_192a
- AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
+ aeskeygenassist $0x80, %xmm2, %xmm1 # round 8
call _key_expansion_192b
jmp .Ldec_key
.Lenc_key128:
- AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
+ aeskeygenassist $0x1, %xmm0, %xmm1 # round 1
call _key_expansion_128
- AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
+ aeskeygenassist $0x2, %xmm0, %xmm1 # round 2
call _key_expansion_128
- AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
+ aeskeygenassist $0x4, %xmm0, %xmm1 # round 3
call _key_expansion_128
- AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
+ aeskeygenassist $0x8, %xmm0, %xmm1 # round 4
call _key_expansion_128
- AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
+ aeskeygenassist $0x10, %xmm0, %xmm1 # round 5
call _key_expansion_128
- AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
+ aeskeygenassist $0x20, %xmm0, %xmm1 # round 6
call _key_expansion_128
- AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
+ aeskeygenassist $0x40, %xmm0, %xmm1 # round 7
call _key_expansion_128
- AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
+ aeskeygenassist $0x80, %xmm0, %xmm1 # round 8
call _key_expansion_128
- AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
+ aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9
call _key_expansion_128
- AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
+ aeskeygenassist $0x36, %xmm0, %xmm1 # round 10
call _key_expansion_128
.Ldec_key:
sub $0x10, TKEYP
@@ -1927,7 +1920,7 @@ SYM_FUNC_START(aesni_set_key)
.align 4
.Ldec_key_loop:
movaps (KEYP), %xmm0
- AESIMC %xmm0 %xmm1
+ aesimc %xmm0, %xmm1
movaps %xmm1, (UKEYP)
add $0x10, KEYP
sub $0x10, UKEYP
@@ -1938,7 +1931,7 @@ SYM_FUNC_START(aesni_set_key)
popl KEYP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_set_key)
/*
@@ -1962,7 +1955,7 @@ SYM_FUNC_START(aesni_enc)
popl KEYP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_enc)
/*
@@ -1988,38 +1981,38 @@ SYM_FUNC_START_LOCAL(_aesni_enc1)
je .Lenc192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps -0x50(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
.align 4
.Lenc192:
movaps -0x40(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps -0x30(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
.align 4
.Lenc128:
movaps -0x20(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps -0x10(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps (TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps 0x10(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps 0x20(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps 0x30(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps 0x40(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps 0x50(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps 0x60(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps 0x70(TKEYP), KEY
- AESENCLAST KEY STATE
- ret
+ aesenclast KEY, STATE
+ RET
SYM_FUNC_END(_aesni_enc1)
/*
@@ -2054,80 +2047,80 @@ SYM_FUNC_START_LOCAL(_aesni_enc4)
je .L4enc192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps -0x50(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
#.align 4
.L4enc192:
movaps -0x40(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps -0x30(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
#.align 4
.L4enc128:
movaps -0x20(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps -0x10(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps (TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x10(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x20(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x30(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x40(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x50(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x60(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x70(TKEYP), KEY
- AESENCLAST KEY STATE1 # last round
- AESENCLAST KEY STATE2
- AESENCLAST KEY STATE3
- AESENCLAST KEY STATE4
- ret
+ aesenclast KEY, STATE1 # last round
+ aesenclast KEY, STATE2
+ aesenclast KEY, STATE3
+ aesenclast KEY, STATE4
+ RET
SYM_FUNC_END(_aesni_enc4)
/*
@@ -2152,7 +2145,7 @@ SYM_FUNC_START(aesni_dec)
popl KEYP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_dec)
/*
@@ -2178,38 +2171,38 @@ SYM_FUNC_START_LOCAL(_aesni_dec1)
je .Ldec192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps -0x50(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
.align 4
.Ldec192:
movaps -0x40(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps -0x30(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
.align 4
.Ldec128:
movaps -0x20(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps -0x10(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps (TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps 0x10(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps 0x20(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps 0x30(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps 0x40(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps 0x50(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps 0x60(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps 0x70(TKEYP), KEY
- AESDECLAST KEY STATE
- ret
+ aesdeclast KEY, STATE
+ RET
SYM_FUNC_END(_aesni_dec1)
/*
@@ -2244,80 +2237,80 @@ SYM_FUNC_START_LOCAL(_aesni_dec4)
je .L4dec192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps -0x50(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
.align 4
.L4dec192:
movaps -0x40(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps -0x30(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
.align 4
.L4dec128:
movaps -0x20(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps -0x10(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps (TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x10(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x20(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x30(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x40(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x50(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x60(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x70(TKEYP), KEY
- AESDECLAST KEY STATE1 # last round
- AESDECLAST KEY STATE2
- AESDECLAST KEY STATE3
- AESDECLAST KEY STATE4
- ret
+ aesdeclast KEY, STATE1 # last round
+ aesdeclast KEY, STATE2
+ aesdeclast KEY, STATE3
+ aesdeclast KEY, STATE4
+ RET
SYM_FUNC_END(_aesni_dec4)
/*
@@ -2377,7 +2370,7 @@ SYM_FUNC_START(aesni_ecb_enc)
popl LEN
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_ecb_enc)
/*
@@ -2438,7 +2431,7 @@ SYM_FUNC_START(aesni_ecb_dec)
popl LEN
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_ecb_dec)
/*
@@ -2482,7 +2475,7 @@ SYM_FUNC_START(aesni_cbc_enc)
popl IVP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_cbc_enc)
/*
@@ -2575,16 +2568,143 @@ SYM_FUNC_START(aesni_cbc_dec)
popl IVP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_cbc_dec)
-#ifdef __x86_64__
+/*
+ * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+SYM_FUNC_START(aesni_cts_cbc_enc)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
+ lea .Lcts_permute_table, T1
+#else
+ lea .Lcts_permute_table(%rip), T1
+#endif
+ mov 480(KEYP), KLEN
+ movups (IVP), STATE
+ sub $16, LEN
+ mov T1, IVP
+ add $32, IVP
+ add LEN, T1
+ sub LEN, IVP
+ movups (T1), %xmm4
+ movups (IVP), %xmm5
+
+ movups (INP), IN1
+ add LEN, INP
+ movups (INP), IN2
+
+ pxor IN1, STATE
+ call _aesni_enc1
+
+ pshufb %xmm5, IN2
+ pxor STATE, IN2
+ pshufb %xmm4, STATE
+ add OUTP, LEN
+ movups STATE, (LEN)
+
+ movaps IN2, STATE
+ call _aesni_enc1
+ movups STATE, (OUTP)
+
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ RET
+SYM_FUNC_END(aesni_cts_cbc_enc)
+
+/*
+ * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+SYM_FUNC_START(aesni_cts_cbc_dec)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
+ lea .Lcts_permute_table, T1
+#else
+ lea .Lcts_permute_table(%rip), T1
+#endif
+ mov 480(KEYP), KLEN
+ add $240, KEYP
+ movups (IVP), IV
+ sub $16, LEN
+ mov T1, IVP
+ add $32, IVP
+ add LEN, T1
+ sub LEN, IVP
+ movups (T1), %xmm4
+
+ movups (INP), STATE
+ add LEN, INP
+ movups (INP), IN1
+
+ call _aesni_dec1
+ movaps STATE, IN2
+ pshufb %xmm4, STATE
+ pxor IN1, STATE
+
+ add OUTP, LEN
+ movups STATE, (LEN)
+
+ movups (IVP), %xmm0
+ pshufb %xmm0, IN1
+ pblendvb IN2, IN1
+ movaps IN1, STATE
+ call _aesni_dec1
+
+ pxor IV, STATE
+ movups STATE, (OUTP)
+
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ RET
+SYM_FUNC_END(aesni_cts_cbc_dec)
+
.pushsection .rodata
.align 16
+.Lcts_permute_table:
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+#ifdef __x86_64__
.Lbswap_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+#endif
.popsection
+#ifdef __x86_64__
/*
* _aesni_inc_init: internal ABI
* setup registers used by _aesni_inc
@@ -2599,11 +2719,11 @@ SYM_FUNC_END(aesni_cbc_dec)
SYM_FUNC_START_LOCAL(_aesni_inc_init)
movaps .Lbswap_mask, BSWAP_MASK
movaps IV, CTR
- PSHUFB_XMM BSWAP_MASK CTR
+ pshufb BSWAP_MASK, CTR
mov $1, TCTR_LOW
- MOVQ_R64_XMM TCTR_LOW INC
- MOVQ_R64_XMM CTR TCTR_LOW
- ret
+ movq TCTR_LOW, INC
+ movq CTR, TCTR_LOW
+ RET
SYM_FUNC_END(_aesni_inc_init)
/*
@@ -2630,8 +2750,8 @@ SYM_FUNC_START_LOCAL(_aesni_inc)
psrldq $8, INC
.Linc_low:
movaps CTR, IV
- PSHUFB_XMM BSWAP_MASK IV
- ret
+ pshufb BSWAP_MASK, IV
+ RET
SYM_FUNC_END(_aesni_inc)
/*
@@ -2694,9 +2814,17 @@ SYM_FUNC_START(aesni_ctr_enc)
movups IV, (IVP)
.Lctr_enc_just_ret:
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_ctr_enc)
+#endif
+
+.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
+.align 16
+.Lgf128mul_x_ble_mask:
+ .octa 0x00000000000000010000000000000087
+.previous
+
/*
* _aesni_gf128mul_x_ble: internal ABI
* Multiply in GF(2^128) for XTS IVs
@@ -2709,120 +2837,325 @@ SYM_FUNC_END(aesni_ctr_enc)
* CTR: == temporary value
*/
#define _aesni_gf128mul_x_ble() \
- pshufd $0x13, IV, CTR; \
+ pshufd $0x13, IV, KEY; \
paddq IV, IV; \
- psrad $31, CTR; \
- pand GF128MUL_MASK, CTR; \
- pxor CTR, IV;
+ psrad $31, KEY; \
+ pand GF128MUL_MASK, KEY; \
+ pxor KEY, IV;
/*
- * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
- * const u8 *src, bool enc, le128 *iv)
+ * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
*/
-SYM_FUNC_START(aesni_xts_crypt8)
+SYM_FUNC_START(aesni_xts_encrypt)
FRAME_BEGIN
- cmpb $0, %cl
- movl $0, %ecx
- movl $240, %r10d
- leaq _aesni_enc4, %r11
- leaq _aesni_dec4, %rax
- cmovel %r10d, %ecx
- cmoveq %rax, %r11
-
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+#else
+ movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
+#endif
movups (IVP), IV
mov 480(KEYP), KLEN
- addq %rcx, KEYP
+
+.Lxts_enc_loop4:
+ sub $64, LEN
+ jl .Lxts_enc_1x
movdqa IV, STATE1
- movdqu 0x00(INP), INC
- pxor INC, STATE1
+ movdqu 0x00(INP), IN
+ pxor IN, STATE1
movdqu IV, 0x00(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE2
- movdqu 0x10(INP), INC
- pxor INC, STATE2
+ movdqu 0x10(INP), IN
+ pxor IN, STATE2
movdqu IV, 0x10(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE3
- movdqu 0x20(INP), INC
- pxor INC, STATE3
+ movdqu 0x20(INP), IN
+ pxor IN, STATE3
movdqu IV, 0x20(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE4
- movdqu 0x30(INP), INC
- pxor INC, STATE4
+ movdqu 0x30(INP), IN
+ pxor IN, STATE4
movdqu IV, 0x30(OUTP)
- CALL_NOSPEC %r11
+ call _aesni_enc4
- movdqu 0x00(OUTP), INC
- pxor INC, STATE1
+ movdqu 0x00(OUTP), IN
+ pxor IN, STATE1
movdqu STATE1, 0x00(OUTP)
+ movdqu 0x10(OUTP), IN
+ pxor IN, STATE2
+ movdqu STATE2, 0x10(OUTP)
+
+ movdqu 0x20(OUTP), IN
+ pxor IN, STATE3
+ movdqu STATE3, 0x20(OUTP)
+
+ movdqu 0x30(OUTP), IN
+ pxor IN, STATE4
+ movdqu STATE4, 0x30(OUTP)
+
_aesni_gf128mul_x_ble()
- movdqa IV, STATE1
- movdqu 0x40(INP), INC
- pxor INC, STATE1
- movdqu IV, 0x40(OUTP)
- movdqu 0x10(OUTP), INC
- pxor INC, STATE2
- movdqu STATE2, 0x10(OUTP)
+ add $64, INP
+ add $64, OUTP
+ test LEN, LEN
+ jnz .Lxts_enc_loop4
+
+.Lxts_enc_ret_iv:
+ movups IV, (IVP)
+.Lxts_enc_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ RET
+
+.Lxts_enc_1x:
+ add $64, LEN
+ jz .Lxts_enc_ret_iv
+ sub $16, LEN
+ jl .Lxts_enc_cts4
+
+.Lxts_enc_loop1:
+ movdqu (INP), STATE
+ pxor IV, STATE
+ call _aesni_enc1
+ pxor IV, STATE
_aesni_gf128mul_x_ble()
- movdqa IV, STATE2
- movdqu 0x50(INP), INC
- pxor INC, STATE2
- movdqu IV, 0x50(OUTP)
- movdqu 0x20(OUTP), INC
- pxor INC, STATE3
- movdqu STATE3, 0x20(OUTP)
+ test LEN, LEN
+ jz .Lxts_enc_out
+
+ add $16, INP
+ sub $16, LEN
+ jl .Lxts_enc_cts1
+
+ movdqu STATE, (OUTP)
+ add $16, OUTP
+ jmp .Lxts_enc_loop1
+
+.Lxts_enc_out:
+ movdqu STATE, (OUTP)
+ jmp .Lxts_enc_ret_iv
+
+.Lxts_enc_cts4:
+ movdqa STATE4, STATE
+ sub $16, OUTP
+
+.Lxts_enc_cts1:
+#ifndef __x86_64__
+ lea .Lcts_permute_table, T1
+#else
+ lea .Lcts_permute_table(%rip), T1
+#endif
+ add LEN, INP /* rewind input pointer */
+ add $16, LEN /* # bytes in final block */
+ movups (INP), IN1
+
+ mov T1, IVP
+ add $32, IVP
+ add LEN, T1
+ sub LEN, IVP
+ add OUTP, LEN
+
+ movups (T1), %xmm4
+ movaps STATE, IN2
+ pshufb %xmm4, STATE
+ movups STATE, (LEN)
+
+ movups (IVP), %xmm0
+ pshufb %xmm0, IN1
+ pblendvb IN2, IN1
+ movaps IN1, STATE
+
+ pxor IV, STATE
+ call _aesni_enc1
+ pxor IV, STATE
+
+ movups STATE, (OUTP)
+ jmp .Lxts_enc_ret
+SYM_FUNC_END(aesni_xts_encrypt)
+
+/*
+ * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
+ */
+SYM_FUNC_START(aesni_xts_decrypt)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
+ movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+#else
+ movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
+#endif
+ movups (IVP), IV
+
+ mov 480(KEYP), KLEN
+ add $240, KEYP
+
+ test $15, LEN
+ jz .Lxts_dec_loop4
+ sub $16, LEN
+
+.Lxts_dec_loop4:
+ sub $64, LEN
+ jl .Lxts_dec_1x
+
+ movdqa IV, STATE1
+ movdqu 0x00(INP), IN
+ pxor IN, STATE1
+ movdqu IV, 0x00(OUTP)
_aesni_gf128mul_x_ble()
- movdqa IV, STATE3
- movdqu 0x60(INP), INC
- pxor INC, STATE3
- movdqu IV, 0x60(OUTP)
+ movdqa IV, STATE2
+ movdqu 0x10(INP), IN
+ pxor IN, STATE2
+ movdqu IV, 0x10(OUTP)
- movdqu 0x30(OUTP), INC
- pxor INC, STATE4
- movdqu STATE4, 0x30(OUTP)
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE3
+ movdqu 0x20(INP), IN
+ pxor IN, STATE3
+ movdqu IV, 0x20(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE4
- movdqu 0x70(INP), INC
- pxor INC, STATE4
- movdqu IV, 0x70(OUTP)
+ movdqu 0x30(INP), IN
+ pxor IN, STATE4
+ movdqu IV, 0x30(OUTP)
+
+ call _aesni_dec4
+
+ movdqu 0x00(OUTP), IN
+ pxor IN, STATE1
+ movdqu STATE1, 0x00(OUTP)
+
+ movdqu 0x10(OUTP), IN
+ pxor IN, STATE2
+ movdqu STATE2, 0x10(OUTP)
+
+ movdqu 0x20(OUTP), IN
+ pxor IN, STATE3
+ movdqu STATE3, 0x20(OUTP)
+
+ movdqu 0x30(OUTP), IN
+ pxor IN, STATE4
+ movdqu STATE4, 0x30(OUTP)
_aesni_gf128mul_x_ble()
+
+ add $64, INP
+ add $64, OUTP
+ test LEN, LEN
+ jnz .Lxts_dec_loop4
+
+.Lxts_dec_ret_iv:
movups IV, (IVP)
- CALL_NOSPEC %r11
+.Lxts_dec_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ RET
- movdqu 0x40(OUTP), INC
- pxor INC, STATE1
- movdqu STATE1, 0x40(OUTP)
+.Lxts_dec_1x:
+ add $64, LEN
+ jz .Lxts_dec_ret_iv
- movdqu 0x50(OUTP), INC
- pxor INC, STATE2
- movdqu STATE2, 0x50(OUTP)
+.Lxts_dec_loop1:
+ movdqu (INP), STATE
- movdqu 0x60(OUTP), INC
- pxor INC, STATE3
- movdqu STATE3, 0x60(OUTP)
+ add $16, INP
+ sub $16, LEN
+ jl .Lxts_dec_cts1
- movdqu 0x70(OUTP), INC
- pxor INC, STATE4
- movdqu STATE4, 0x70(OUTP)
+ pxor IV, STATE
+ call _aesni_dec1
+ pxor IV, STATE
+ _aesni_gf128mul_x_ble()
- FRAME_END
- ret
-SYM_FUNC_END(aesni_xts_crypt8)
+ test LEN, LEN
+ jz .Lxts_dec_out
+
+ movdqu STATE, (OUTP)
+ add $16, OUTP
+ jmp .Lxts_dec_loop1
+
+.Lxts_dec_out:
+ movdqu STATE, (OUTP)
+ jmp .Lxts_dec_ret_iv
+
+.Lxts_dec_cts1:
+ movdqa IV, STATE4
+ _aesni_gf128mul_x_ble()
+ pxor IV, STATE
+ call _aesni_dec1
+ pxor IV, STATE
+
+#ifndef __x86_64__
+ lea .Lcts_permute_table, T1
+#else
+ lea .Lcts_permute_table(%rip), T1
#endif
+ add LEN, INP /* rewind input pointer */
+ add $16, LEN /* # bytes in final block */
+ movups (INP), IN1
+
+ mov T1, IVP
+ add $32, IVP
+ add LEN, T1
+ sub LEN, IVP
+ add OUTP, LEN
+
+ movups (T1), %xmm4
+ movaps STATE, IN2
+ pshufb %xmm4, STATE
+ movups STATE, (LEN)
+
+ movups (IVP), %xmm0
+ pshufb %xmm0, IN1
+ pblendvb IN2, IN1
+ movaps IN1, STATE
+
+ pxor STATE4, STATE
+ call _aesni_dec1
+ pxor STATE4, STATE
+
+ movups STATE, (OUTP)
+ jmp .Lxts_dec_ret
+SYM_FUNC_END(aesni_xts_decrypt)
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index bfa1c0b3e5b4..0852ab573fd3 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -120,7 +120,6 @@
##
#include <linux/linkage.h>
-#include <asm/inst.h>
# constants in mergeable sections, linker can reorder and merge
.section .rodata.cst16.POLY, "aM", @progbits, 16
@@ -213,10 +212,6 @@ HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsu
#define arg4 %rcx
#define arg5 %r8
#define arg6 %r9
-#define arg7 STACK_OFFSET+8*1(%r14)
-#define arg8 STACK_OFFSET+8*2(%r14)
-#define arg9 STACK_OFFSET+8*3(%r14)
-#define arg10 STACK_OFFSET+8*4(%r14)
#define keysize 2*15*16(arg1)
i = 0
@@ -238,9 +233,6 @@ define_reg j %j
.noaltmacro
.endm
-# need to push 4 registers into stack to maintain
-STACK_OFFSET = 8*4
-
TMP1 = 16*0 # Temporary storage for AAD
TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
TMP3 = 16*2 # Temporary storage for AES State 3
@@ -257,25 +249,22 @@ VARIABLE_OFFSET = 16*8
################################
.macro FUNC_SAVE
- #the number of pushes must equal STACK_OFFSET
push %r12
push %r13
- push %r14
push %r15
- mov %rsp, %r14
-
-
+ push %rbp
+ mov %rsp, %rbp
sub $VARIABLE_OFFSET, %rsp
and $~63, %rsp # align rsp to 64 bytes
.endm
.macro FUNC_RESTORE
- mov %r14, %rsp
+ mov %rbp, %rsp
+ pop %rbp
pop %r15
- pop %r14
pop %r13
pop %r12
.endm
@@ -295,7 +284,7 @@ VARIABLE_OFFSET = 16*8
# combined for GCM encrypt and decrypt functions
# clobbering all xmm registers
-# clobbering r10, r11, r12, r13, r14, r15
+# clobbering r10, r11, r12, r13, r15, rax
.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
vmovdqu AadHash(arg2), %xmm8
vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
@@ -370,7 +359,7 @@ _initial_num_blocks_is_0\@:
_initial_blocks_encrypted\@:
- cmp $0, %r13
+ test %r13, %r13
je _zero_cipher_left\@
sub $128, %r13
@@ -529,7 +518,7 @@ _multiple_of_16_bytes\@:
vmovdqu HashKey(arg2), %xmm13
mov PBlockLen(arg2), %r12
- cmp $0, %r12
+ test %r12, %r12
je _partial_done\@
#GHASH computation for the last <16 Byte block
@@ -574,7 +563,7 @@ _T_8\@:
add $8, %r10
sub $8, %r11
vpsrldq $8, %xmm9, %xmm9
- cmp $0, %r11
+ test %r11, %r11
je _return_T_done\@
_T_4\@:
vmovd %xmm9, %eax
@@ -582,7 +571,7 @@ _T_4\@:
add $4, %r10
sub $4, %r11
vpsrldq $4, %xmm9, %xmm9
- cmp $0, %r11
+ test %r11, %r11
je _return_T_done\@
_T_123\@:
vmovd %xmm9, %eax
@@ -626,7 +615,7 @@ _get_AAD_blocks\@:
cmp $16, %r11
jge _get_AAD_blocks\@
vmovdqu \T8, \T7
- cmp $0, %r11
+ test %r11, %r11
je _get_AAD_done\@
vpxor \T7, \T7, \T7
@@ -645,7 +634,7 @@ _get_AAD_rest8\@:
vpxor \T1, \T7, \T7
jmp _get_AAD_rest8\@
_get_AAD_rest4\@:
- cmp $0, %r11
+ test %r11, %r11
jle _get_AAD_rest0\@
mov (%r10), %eax
movq %rax, \T1
@@ -750,7 +739,7 @@ _done_read_partial_block_\@:
.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
AAD_HASH ENC_DEC
mov PBlockLen(arg2), %r13
- cmp $0, %r13
+ test %r13, %r13
je _partial_block_done_\@ # Leave Macro if no partial blocks
# Read in input data without over reading
cmp $16, \PLAIN_CYPH_LEN
@@ -802,7 +791,7 @@ _no_extra_mask_1_\@:
vpshufb %xmm2, %xmm3, %xmm3
vpxor %xmm3, \AAD_HASH, \AAD_HASH
- cmp $0, %r10
+ test %r10, %r10
jl _partial_incomplete_1_\@
# GHASH computation for the last <16 Byte block
@@ -837,7 +826,7 @@ _no_extra_mask_2_\@:
vpshufb %xmm2, %xmm9, %xmm9
vpxor %xmm9, \AAD_HASH, \AAD_HASH
- cmp $0, %r10
+ test %r10, %r10
jl _partial_incomplete_2_\@
# GHASH computation for the last <16 Byte block
@@ -857,7 +846,7 @@ _encode_done_\@:
vpshufb %xmm2, %xmm9, %xmm9
.endif
# output encrypted Bytes
- cmp $0, %r10
+ test %r10, %r10
jl _partial_fill_\@
mov %r13, %r12
mov $16, %r13
@@ -886,7 +875,6 @@ _less_than_8_bytes_left_\@:
_partial_block_done_\@:
.endm # PARTIAL_BLOCK
-#ifdef CONFIG_AS_AVX
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
@@ -998,7 +986,7 @@ _partial_block_done_\@:
## num_initial_blocks = b mod 4#
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
-## arg1, arg3, arg4, r14 are used as a pointer only, not modified
+## arg1, arg2, arg3, arg4 are used as pointers only, not modified
.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
i = (8-\num_initial_blocks)
@@ -1233,7 +1221,7 @@ _initial_blocks_done\@:
# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg3, arg4 are used as pointers only, not modified
+# arg1, arg2, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
@@ -1779,7 +1767,7 @@ SYM_FUNC_START(aesni_gcm_init_avx_gen2)
FUNC_SAVE
INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_init_avx_gen2)
###############################################################################
@@ -1800,15 +1788,15 @@ SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
# must be 192
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
FUNC_RESTORE
- ret
+ RET
key_128_enc_update:
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
FUNC_RESTORE
- ret
+ RET
key_256_enc_update:
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
###############################################################################
@@ -1829,15 +1817,15 @@ SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
# must be 192
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
FUNC_RESTORE
- ret
+ RET
key_128_dec_update:
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
FUNC_RESTORE
- ret
+ RET
key_256_dec_update:
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
###############################################################################
@@ -1858,20 +1846,17 @@ SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
# must be 192
GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
FUNC_RESTORE
- ret
+ RET
key_128_finalize:
GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
FUNC_RESTORE
- ret
+ RET
key_256_finalize:
GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
-#endif /* CONFIG_AS_AVX */
-
-#ifdef CONFIG_AS_AVX2
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
@@ -1949,7 +1934,7 @@ SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
## num_initial_blocks = b mod 4#
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
-## arg1, arg3, arg4, r14 are used as a pointer only, not modified
+## arg1, arg2, arg3, arg4 are used as pointers only, not modified
.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
i = (8-\num_initial_blocks)
@@ -2191,7 +2176,7 @@ _initial_blocks_done\@:
# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg3, arg4 are used as pointers only, not modified
+# arg1, arg2, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
@@ -2750,7 +2735,7 @@ SYM_FUNC_START(aesni_gcm_init_avx_gen4)
FUNC_SAVE
INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_init_avx_gen4)
###############################################################################
@@ -2771,15 +2756,15 @@ SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
# must be 192
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
FUNC_RESTORE
- ret
+ RET
key_128_enc_update4:
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
FUNC_RESTORE
- ret
+ RET
key_256_enc_update4:
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
###############################################################################
@@ -2800,15 +2785,15 @@ SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
# must be 192
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
FUNC_RESTORE
- ret
+ RET
key_128_dec_update4:
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
FUNC_RESTORE
- ret
+ RET
key_256_dec_update4:
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
###############################################################################
@@ -2829,15 +2814,13 @@ SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
# must be 192
GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
FUNC_RESTORE
- ret
+ RET
key_128_finalize4:
GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
FUNC_RESTORE
- ret
+ RET
key_256_finalize4:
GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
-
-#endif /* CONFIG_AS_AVX2 */
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index bbbebbd35b5d..a5b0cb3efeba 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -31,11 +31,10 @@
#include <crypto/internal/aead.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
+#include <linux/jump_label.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
-#ifdef CONFIG_X86_64
-#include <asm/crypto/glue_helper.h>
-#endif
+#include <linux/static_call.h>
#define AESNI_ALIGN 16
@@ -93,62 +92,25 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
#define AVX_GEN2_OPTSIZE 640
#define AVX_GEN4_OPTSIZE 4096
+asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
+asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
#ifdef CONFIG_X86_64
-static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
- const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
-
-asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out,
- const u8 *in, bool enc, le128 *iv);
-
-/* asmlinkage void aesni_gcm_enc()
- * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
- * struct gcm_context_data. May be uninitialized.
- * u8 *out, Ciphertext output. Encrypt in-place is allowed.
- * const u8 *in, Plaintext input
- * unsigned long plaintext_len, Length of data in bytes for encryption.
- * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001.
- * 16-byte aligned pointer.
- * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
- * const u8 *aad, Additional Authentication Data (AAD)
- * unsigned long aad_len, Length of AAD in bytes.
- * u8 *auth_tag, Authenticated Tag output.
- * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
- * Valid values are 16 (most likely), 12 or 8.
- */
-asmlinkage void aesni_gcm_enc(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long plaintext_len, u8 *iv,
- u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len);
-
-/* asmlinkage void aesni_gcm_dec()
- * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
- * struct gcm_context_data. May be uninitialized.
- * u8 *out, Plaintext output. Decrypt in-place is allowed.
- * const u8 *in, Ciphertext input
- * unsigned long ciphertext_len, Length of data in bytes for decryption.
- * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001.
- * 16-byte aligned pointer.
- * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
- * const u8 *aad, Additional Authentication Data (AAD)
- * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
- * to be 8 or 12 bytes
- * u8 *auth_tag, Authenticated Tag output.
- * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
- * Valid values are 16 (most likely), 12 or 8.
- */
-asmlinkage void aesni_gcm_dec(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long ciphertext_len, u8 *iv,
- u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len);
+DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc);
/* Scatter / Gather routines, with args similar to above */
asmlinkage void aesni_gcm_init(void *ctx,
@@ -167,31 +129,26 @@ asmlinkage void aesni_gcm_finalize(void *ctx,
struct gcm_context_data *gdata,
u8 *auth_tag, unsigned long auth_tag_len);
-static const struct aesni_gcm_tfm_s {
- void (*init)(void *ctx, struct gcm_context_data *gdata, u8 *iv,
- u8 *hash_subkey, const u8 *aad, unsigned long aad_len);
- void (*enc_update)(void *ctx, struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long plaintext_len);
- void (*dec_update)(void *ctx, struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long ciphertext_len);
- void (*finalize)(void *ctx, struct gcm_context_data *gdata,
- u8 *auth_tag, unsigned long auth_tag_len);
-} *aesni_gcm_tfm;
-
-static const struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = {
- .init = &aesni_gcm_init,
- .enc_update = &aesni_gcm_enc_update,
- .dec_update = &aesni_gcm_dec_update,
- .finalize = &aesni_gcm_finalize,
-};
-
-#ifdef CONFIG_AS_AVX
asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
+
+
+asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv,
+ const void *keys, u8 *out, unsigned int num_bytes,
+ unsigned int byte_ctr);
+
+asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv,
+ const void *keys, u8 *out, unsigned int num_bytes,
+ unsigned int byte_ctr);
+
+asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv,
+ const void *keys, u8 *out, unsigned int num_bytes,
+ unsigned int byte_ctr);
+
/*
* asmlinkage void aesni_gcm_init_avx_gen2()
* gcm_data *my_ctx_data, context data
@@ -215,28 +172,6 @@ asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx,
struct gcm_context_data *gdata,
u8 *auth_tag, unsigned long auth_tag_len);
-asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long plaintext_len, u8 *iv,
- const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len);
-
-asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long ciphertext_len, u8 *iv,
- const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len);
-
-static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = {
- .init = &aesni_gcm_init_avx_gen2,
- .enc_update = &aesni_gcm_enc_update_avx_gen2,
- .dec_update = &aesni_gcm_dec_update_avx_gen2,
- .finalize = &aesni_gcm_finalize_avx_gen2,
-};
-
-#endif
-
-#ifdef CONFIG_AS_AVX2
/*
* asmlinkage void aesni_gcm_init_avx_gen4()
* gcm_data *my_ctx_data, context data
@@ -260,26 +195,8 @@ asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx,
struct gcm_context_data *gdata,
u8 *auth_tag, unsigned long auth_tag_len);
-asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long plaintext_len, u8 *iv,
- const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len);
-
-asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long ciphertext_len, u8 *iv,
- const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len);
-
-static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = {
- .init = &aesni_gcm_init_avx_gen4,
- .enc_update = &aesni_gcm_enc_update_avx_gen4,
- .dec_update = &aesni_gcm_dec_update_avx_gen4,
- .finalize = &aesni_gcm_finalize_avx_gen4,
-};
-
-#endif
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx2);
static inline struct
aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
@@ -380,16 +297,16 @@ static int ecb_encrypt(struct skcipher_request *req)
unsigned int nbytes;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK);
+ kernel_fpu_end();
nbytes &= AES_BLOCK_SIZE - 1;
err = skcipher_walk_done(&walk, nbytes);
}
- kernel_fpu_end();
return err;
}
@@ -402,16 +319,16 @@ static int ecb_decrypt(struct skcipher_request *req)
unsigned int nbytes;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK);
+ kernel_fpu_end();
nbytes &= AES_BLOCK_SIZE - 1;
err = skcipher_walk_done(&walk, nbytes);
}
- kernel_fpu_end();
return err;
}
@@ -424,16 +341,16 @@ static int cbc_encrypt(struct skcipher_request *req)
unsigned int nbytes;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK, walk.iv);
+ kernel_fpu_end();
nbytes &= AES_BLOCK_SIZE - 1;
err = skcipher_walk_done(&walk, nbytes);
}
- kernel_fpu_end();
return err;
}
@@ -446,37 +363,133 @@ static int cbc_decrypt(struct skcipher_request *req)
unsigned int nbytes;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK, walk.iv);
+ kernel_fpu_end();
nbytes &= AES_BLOCK_SIZE - 1;
err = skcipher_walk_done(&walk, nbytes);
}
- kernel_fpu_end();
return err;
}
-#ifdef CONFIG_X86_64
-static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
- struct skcipher_walk *walk)
+static int cts_cbc_encrypt(struct skcipher_request *req)
{
- u8 *ctrblk = walk->iv;
- u8 keystream[AES_BLOCK_SIZE];
- u8 *src = walk->src.virt.addr;
- u8 *dst = walk->dst.virt.addr;
- unsigned int nbytes = walk->nbytes;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+ struct scatterlist *src = req->src, *dst = req->dst;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct skcipher_walk walk;
+ int err;
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
+ NULL, NULL);
+
+ if (req->cryptlen <= AES_BLOCK_SIZE) {
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+ cbc_blocks = 1;
+ }
+
+ if (cbc_blocks > 0) {
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = cbc_encrypt(&subreq);
+ if (err)
+ return err;
+
+ if (req->cryptlen == AES_BLOCK_SIZE)
+ return 0;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst,
+ subreq.cryptlen);
+ }
+
+ /* handle ciphertext stealing */
+ skcipher_request_set_crypt(&subreq, src, dst,
+ req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ aesni_cts_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes, walk.iv);
+ kernel_fpu_end();
+
+ return skcipher_walk_done(&walk, 0);
+}
+
+static int cts_cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+ struct scatterlist *src = req->src, *dst = req->dst;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct skcipher_walk walk;
+ int err;
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
+ NULL, NULL);
+
+ if (req->cryptlen <= AES_BLOCK_SIZE) {
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+ cbc_blocks = 1;
+ }
+
+ if (cbc_blocks > 0) {
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = cbc_decrypt(&subreq);
+ if (err)
+ return err;
+
+ if (req->cryptlen == AES_BLOCK_SIZE)
+ return 0;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst,
+ subreq.cryptlen);
+ }
+
+ /* handle ciphertext stealing */
+ skcipher_request_set_crypt(&subreq, src, dst,
+ req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
- aesni_enc(ctx, keystream, ctrblk);
- crypto_xor_cpy(dst, keystream, src, nbytes);
+ kernel_fpu_begin();
+ aesni_cts_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes, walk.iv);
+ kernel_fpu_end();
- crypto_inc(ctrblk, AES_BLOCK_SIZE);
+ return skcipher_walk_done(&walk, 0);
}
-#ifdef CONFIG_AS_AVX
+#ifdef CONFIG_X86_64
static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv)
{
@@ -493,124 +506,92 @@ static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
else
aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len);
}
-#endif
static int ctr_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ u8 keystream[AES_BLOCK_SIZE];
struct skcipher_walk walk;
unsigned int nbytes;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_fpu_begin();
- while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
- aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes & AES_BLOCK_MASK, walk.iv);
- nbytes &= AES_BLOCK_SIZE - 1;
+ while ((nbytes = walk.nbytes) > 0) {
+ kernel_fpu_begin();
+ if (nbytes & AES_BLOCK_MASK)
+ static_call(aesni_ctr_enc_tfm)(ctx, walk.dst.virt.addr,
+ walk.src.virt.addr,
+ nbytes & AES_BLOCK_MASK,
+ walk.iv);
+ nbytes &= ~AES_BLOCK_MASK;
+
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ aesni_enc(ctx, keystream, walk.iv);
+ crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - nbytes,
+ walk.src.virt.addr + walk.nbytes - nbytes,
+ keystream, nbytes);
+ crypto_inc(walk.iv, AES_BLOCK_SIZE);
+ nbytes = 0;
+ }
+ kernel_fpu_end();
err = skcipher_walk_done(&walk, nbytes);
}
- if (walk.nbytes) {
- ctr_crypt_final(ctx, &walk);
- err = skcipher_walk_done(&walk, 0);
- }
- kernel_fpu_end();
-
return err;
}
-static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = xts_verify_key(tfm, key, keylen);
- if (err)
- return err;
-
- keylen /= 2;
-
- /* first half of xts-key is for crypt */
- err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx,
- key, keylen);
- if (err)
- return err;
-
- /* second half of xts-key is for tweak */
- return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx,
- key + keylen, keylen);
-}
-
-
-static void aesni_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_enc);
-}
-
-static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec);
-}
-
-static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+static void aesni_xctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv,
+ unsigned int byte_ctr)
{
- aesni_xts_crypt8(ctx, dst, src, true, iv);
+ if (ctx->key_length == AES_KEYSIZE_128)
+ aes_xctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len,
+ byte_ctr);
+ else if (ctx->key_length == AES_KEYSIZE_192)
+ aes_xctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len,
+ byte_ctr);
+ else
+ aes_xctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len,
+ byte_ctr);
}
-static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- aesni_xts_crypt8(ctx, dst, src, false, iv);
-}
-
-static const struct common_glue_ctx aesni_enc_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = 1,
-
- .funcs = { {
- .num_blocks = 8,
- .fn_u = { .xts = aesni_xts_enc8 }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = aesni_xts_enc }
- } }
-};
-
-static const struct common_glue_ctx aesni_dec_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = 1,
-
- .funcs = { {
- .num_blocks = 8,
- .fn_u = { .xts = aesni_xts_dec8 }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = aesni_xts_dec }
- } }
-};
-
-static int xts_encrypt(struct skcipher_request *req)
+static int xctr_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ u8 keystream[AES_BLOCK_SIZE];
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ unsigned int byte_ctr = 0;
+ int err;
+ __le32 block[AES_BLOCK_SIZE / sizeof(__le32)];
- return glue_xts_req_128bit(&aesni_enc_xts, req, aesni_enc,
- aes_ctx(ctx->raw_tweak_ctx),
- aes_ctx(ctx->raw_crypt_ctx),
- false);
-}
+ err = skcipher_walk_virt(&walk, req, false);
-static int xts_decrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&aesni_dec_xts, req, aesni_enc,
- aes_ctx(ctx->raw_tweak_ctx),
- aes_ctx(ctx->raw_crypt_ctx),
- true);
+ while ((nbytes = walk.nbytes) > 0) {
+ kernel_fpu_begin();
+ if (nbytes & AES_BLOCK_MASK)
+ aesni_xctr_enc_avx_tfm(ctx, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes & AES_BLOCK_MASK,
+ walk.iv, byte_ctr);
+ nbytes &= ~AES_BLOCK_MASK;
+ byte_ctr += walk.nbytes - nbytes;
+
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ memcpy(block, walk.iv, AES_BLOCK_SIZE);
+ block[0] ^= cpu_to_le32(1 + byte_ctr / AES_BLOCK_SIZE);
+ aesni_enc(ctx, keystream, (u8 *)block);
+ crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes -
+ nbytes, walk.src.virt.addr + walk.nbytes
+ - nbytes, keystream, nbytes);
+ byte_ctr += nbytes;
+ nbytes = 0;
+ }
+ kernel_fpu_end();
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+ return err;
}
static int
@@ -689,46 +670,35 @@ static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
unsigned int assoclen, u8 *hash_subkey,
- u8 *iv, void *aes_ctx)
+ u8 *iv, void *aes_ctx, u8 *auth_tag,
+ unsigned long auth_tag_len)
{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- unsigned long auth_tag_len = crypto_aead_authsize(tfm);
- const struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm;
- struct gcm_context_data data AESNI_ALIGN_ATTR;
- struct scatter_walk dst_sg_walk = {};
+ u8 databuf[sizeof(struct gcm_context_data) + (AESNI_ALIGN - 8)] __aligned(8);
+ struct gcm_context_data *data = PTR_ALIGN((void *)databuf, AESNI_ALIGN);
unsigned long left = req->cryptlen;
- unsigned long len, srclen, dstlen;
struct scatter_walk assoc_sg_walk;
- struct scatter_walk src_sg_walk;
- struct scatterlist src_start[2];
- struct scatterlist dst_start[2];
- struct scatterlist *src_sg;
- struct scatterlist *dst_sg;
- u8 *src, *dst, *assoc;
+ struct skcipher_walk walk;
+ bool do_avx, do_avx2;
u8 *assocmem = NULL;
- u8 authTag[16];
+ u8 *assoc;
+ int err;
if (!enc)
left -= auth_tag_len;
-#ifdef CONFIG_AS_AVX2
- if (left < AVX_GEN4_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen4)
- gcm_tfm = &aesni_gcm_tfm_avx_gen2;
-#endif
-#ifdef CONFIG_AS_AVX
- if (left < AVX_GEN2_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen2)
- gcm_tfm = &aesni_gcm_tfm_sse;
-#endif
+ do_avx = (left >= AVX_GEN2_OPTSIZE);
+ do_avx2 = (left >= AVX_GEN4_OPTSIZE);
/* Linearize assoc, if not already linear */
- if (req->src->length >= assoclen && req->src->length &&
- (!PageHighMem(sg_page(req->src)) ||
- req->src->offset + req->src->length <= PAGE_SIZE)) {
+ if (req->src->length >= assoclen && req->src->length) {
scatterwalk_start(&assoc_sg_walk, req->src);
assoc = scatterwalk_map(&assoc_sg_walk);
} else {
+ gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?
+ GFP_KERNEL : GFP_ATOMIC;
+
/* assoc can be any length, so must be on heap */
- assocmem = kmalloc(assoclen, GFP_ATOMIC);
+ assocmem = kmalloc(assoclen, flags);
if (unlikely(!assocmem))
return -ENOMEM;
assoc = assocmem;
@@ -736,62 +706,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0);
}
- if (left) {
- src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen);
- scatterwalk_start(&src_sg_walk, src_sg);
- if (req->src != req->dst) {
- dst_sg = scatterwalk_ffwd(dst_start, req->dst,
- req->assoclen);
- scatterwalk_start(&dst_sg_walk, dst_sg);
- }
- }
-
kernel_fpu_begin();
- gcm_tfm->init(aes_ctx, &data, iv,
- hash_subkey, assoc, assoclen);
- if (req->src != req->dst) {
- while (left) {
- src = scatterwalk_map(&src_sg_walk);
- dst = scatterwalk_map(&dst_sg_walk);
- srclen = scatterwalk_clamp(&src_sg_walk, left);
- dstlen = scatterwalk_clamp(&dst_sg_walk, left);
- len = min(srclen, dstlen);
- if (len) {
- if (enc)
- gcm_tfm->enc_update(aes_ctx, &data,
- dst, src, len);
- else
- gcm_tfm->dec_update(aes_ctx, &data,
- dst, src, len);
- }
- left -= len;
-
- scatterwalk_unmap(src);
- scatterwalk_unmap(dst);
- scatterwalk_advance(&src_sg_walk, len);
- scatterwalk_advance(&dst_sg_walk, len);
- scatterwalk_done(&src_sg_walk, 0, left);
- scatterwalk_done(&dst_sg_walk, 1, left);
- }
- } else {
- while (left) {
- dst = src = scatterwalk_map(&src_sg_walk);
- len = scatterwalk_clamp(&src_sg_walk, left);
- if (len) {
- if (enc)
- gcm_tfm->enc_update(aes_ctx, &data,
- src, src, len);
- else
- gcm_tfm->dec_update(aes_ctx, &data,
- src, src, len);
- }
- left -= len;
- scatterwalk_unmap(src);
- scatterwalk_advance(&src_sg_walk, len);
- scatterwalk_done(&src_sg_walk, 1, left);
- }
- }
- gcm_tfm->finalize(aes_ctx, &data, authTag, auth_tag_len);
+ if (static_branch_likely(&gcm_use_avx2) && do_avx2)
+ aesni_gcm_init_avx_gen4(aes_ctx, data, iv, hash_subkey, assoc,
+ assoclen);
+ else if (static_branch_likely(&gcm_use_avx) && do_avx)
+ aesni_gcm_init_avx_gen2(aes_ctx, data, iv, hash_subkey, assoc,
+ assoclen);
+ else
+ aesni_gcm_init(aes_ctx, data, iv, hash_subkey, assoc, assoclen);
kernel_fpu_end();
if (!assocmem)
@@ -799,24 +722,58 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
else
kfree(assocmem);
- if (!enc) {
- u8 authTagMsg[16];
+ err = enc ? skcipher_walk_aead_encrypt(&walk, req, false)
+ : skcipher_walk_aead_decrypt(&walk, req, false);
- /* Copy out original authTag */
- scatterwalk_map_and_copy(authTagMsg, req->src,
- req->assoclen + req->cryptlen -
- auth_tag_len,
- auth_tag_len, 0);
+ while (walk.nbytes > 0) {
+ kernel_fpu_begin();
+ if (static_branch_likely(&gcm_use_avx2) && do_avx2) {
+ if (enc)
+ aesni_gcm_enc_update_avx_gen4(aes_ctx, data,
+ walk.dst.virt.addr,
+ walk.src.virt.addr,
+ walk.nbytes);
+ else
+ aesni_gcm_dec_update_avx_gen4(aes_ctx, data,
+ walk.dst.virt.addr,
+ walk.src.virt.addr,
+ walk.nbytes);
+ } else if (static_branch_likely(&gcm_use_avx) && do_avx) {
+ if (enc)
+ aesni_gcm_enc_update_avx_gen2(aes_ctx, data,
+ walk.dst.virt.addr,
+ walk.src.virt.addr,
+ walk.nbytes);
+ else
+ aesni_gcm_dec_update_avx_gen2(aes_ctx, data,
+ walk.dst.virt.addr,
+ walk.src.virt.addr,
+ walk.nbytes);
+ } else if (enc) {
+ aesni_gcm_enc_update(aes_ctx, data, walk.dst.virt.addr,
+ walk.src.virt.addr, walk.nbytes);
+ } else {
+ aesni_gcm_dec_update(aes_ctx, data, walk.dst.virt.addr,
+ walk.src.virt.addr, walk.nbytes);
+ }
+ kernel_fpu_end();
- /* Compare generated tag with passed in tag. */
- return crypto_memneq(authTagMsg, authTag, auth_tag_len) ?
- -EBADMSG : 0;
+ err = skcipher_walk_done(&walk, 0);
}
- /* Copy in the authTag */
- scatterwalk_map_and_copy(authTag, req->dst,
- req->assoclen + req->cryptlen,
- auth_tag_len, 1);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ if (static_branch_likely(&gcm_use_avx2) && do_avx2)
+ aesni_gcm_finalize_avx_gen4(aes_ctx, data, auth_tag,
+ auth_tag_len);
+ else if (static_branch_likely(&gcm_use_avx) && do_avx)
+ aesni_gcm_finalize_avx_gen2(aes_ctx, data, auth_tag,
+ auth_tag_len);
+ else
+ aesni_gcm_finalize(aes_ctx, data, auth_tag, auth_tag_len);
+ kernel_fpu_end();
return 0;
}
@@ -824,15 +781,47 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
u8 *hash_subkey, u8 *iv, void *aes_ctx)
{
- return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
- aes_ctx);
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+ u8 auth_tag[16];
+ int err;
+
+ err = gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, aes_ctx,
+ auth_tag, auth_tag_len);
+ if (err)
+ return err;
+
+ scatterwalk_map_and_copy(auth_tag, req->dst,
+ req->assoclen + req->cryptlen,
+ auth_tag_len, 1);
+ return 0;
}
static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
u8 *hash_subkey, u8 *iv, void *aes_ctx)
{
- return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
- aes_ctx);
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+ u8 auth_tag_msg[16];
+ u8 auth_tag[16];
+ int err;
+
+ err = gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, aes_ctx,
+ auth_tag, auth_tag_len);
+ if (err)
+ return err;
+
+ /* Copy out original auth_tag */
+ scatterwalk_map_and_copy(auth_tag_msg, req->src,
+ req->assoclen + req->cryptlen - auth_tag_len,
+ auth_tag_len, 0);
+
+ /* Compare generated tag with passed in tag. */
+ if (crypto_memneq(auth_tag_msg, auth_tag, auth_tag_len)) {
+ memzero_explicit(auth_tag, sizeof(auth_tag));
+ return -EBADMSG;
+ }
+ return 0;
}
static int helper_rfc4106_encrypt(struct aead_request *req)
@@ -840,7 +829,8 @@ static int helper_rfc4106_encrypt(struct aead_request *req)
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
void *aes_ctx = &(ctx->aes_key_expanded);
- u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+ u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
+ u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
unsigned int i;
__be32 counter = cpu_to_be32(1);
@@ -867,7 +857,8 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
void *aes_ctx = &(ctx->aes_key_expanded);
- u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+ u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
+ u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
unsigned int i;
if (unlikely(req->assoclen != 16 && req->assoclen != 20))
@@ -889,6 +880,133 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
}
#endif
+static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int err;
+
+ err = xts_verify_key(tfm, key, keylen);
+ if (err)
+ return err;
+
+ keylen /= 2;
+
+ /* first half of xts-key is for crypt */
+ err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx,
+ key, keylen);
+ if (err)
+ return err;
+
+ /* second half of xts-key is for tweak */
+ return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx,
+ key + keylen, keylen);
+}
+
+static int xts_crypt(struct skcipher_request *req, bool encrypt)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int tail = req->cryptlen % AES_BLOCK_SIZE;
+ struct skcipher_request subreq;
+ struct skcipher_walk walk;
+ int err;
+
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+
+ err = skcipher_walk_virt(&walk, req, false);
+ if (!walk.nbytes)
+ return err;
+
+ if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
+ int blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+
+ skcipher_walk_abort(&walk);
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq,
+ skcipher_request_flags(req),
+ NULL, NULL);
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ blocks * AES_BLOCK_SIZE, req->iv);
+ req = &subreq;
+
+ err = skcipher_walk_virt(&walk, req, false);
+ if (!walk.nbytes)
+ return err;
+ } else {
+ tail = 0;
+ }
+
+ kernel_fpu_begin();
+
+ /* calculate first value of T */
+ aesni_enc(aes_ctx(ctx->raw_tweak_ctx), walk.iv, walk.iv);
+
+ while (walk.nbytes > 0) {
+ int nbytes = walk.nbytes;
+
+ if (nbytes < walk.total)
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
+
+ if (encrypt)
+ aesni_xts_encrypt(aes_ctx(ctx->raw_crypt_ctx),
+ walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes, walk.iv);
+ else
+ aesni_xts_decrypt(aes_ctx(ctx->raw_crypt_ctx),
+ walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes, walk.iv);
+ kernel_fpu_end();
+
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+
+ if (walk.nbytes > 0)
+ kernel_fpu_begin();
+ }
+
+ if (unlikely(tail > 0 && !err)) {
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct scatterlist *src, *dst;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
+
+ skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ if (encrypt)
+ aesni_xts_encrypt(aes_ctx(ctx->raw_crypt_ctx),
+ walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes, walk.iv);
+ else
+ aesni_xts_decrypt(aes_ctx(ctx->raw_crypt_ctx),
+ walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes, walk.iv);
+ kernel_fpu_end();
+
+ err = skcipher_walk_done(&walk, 0);
+ }
+ return err;
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+ return xts_crypt(req, true);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+ return xts_crypt(req, false);
+}
+
static struct crypto_alg aesni_cipher_alg = {
.cra_name = "aes",
.cra_driver_name = "aes-aesni",
@@ -940,6 +1058,23 @@ static struct skcipher_alg aesni_skciphers[] = {
.setkey = aesni_skcipher_setkey,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
+ }, {
+ .base = {
+ .cra_name = "__cts(cbc(aes))",
+ .cra_driver_name = "__cts-cbc-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .walksize = 2 * AES_BLOCK_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = cts_cbc_encrypt,
+ .decrypt = cts_cbc_decrypt,
#ifdef CONFIG_X86_64
}, {
.base = {
@@ -958,6 +1093,7 @@ static struct skcipher_alg aesni_skciphers[] = {
.setkey = aesni_skcipher_setkey,
.encrypt = ctr_crypt,
.decrypt = ctr_crypt,
+#endif
}, {
.base = {
.cra_name = "__xts(aes)",
@@ -971,10 +1107,10 @@ static struct skcipher_alg aesni_skciphers[] = {
.min_keysize = 2 * AES_MIN_KEY_SIZE,
.max_keysize = 2 * AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
+ .walksize = 2 * AES_BLOCK_SIZE,
.setkey = xts_aesni_setkey,
.encrypt = xts_encrypt,
.decrypt = xts_decrypt,
-#endif
}
};
@@ -982,6 +1118,33 @@ static
struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)];
#ifdef CONFIG_X86_64
+/*
+ * XCTR does not have a non-AVX implementation, so it must be enabled
+ * conditionally.
+ */
+static struct skcipher_alg aesni_xctr = {
+ .base = {
+ .cra_name = "__xctr(aes)",
+ .cra_driver_name = "__xctr-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = xctr_crypt,
+ .decrypt = xctr_crypt,
+};
+
+static struct simd_skcipher_alg *aesni_simd_xctr;
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_X86_64
static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
unsigned int key_len)
{
@@ -997,7 +1160,8 @@ static int generic_gcmaes_encrypt(struct aead_request *req)
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
void *aes_ctx = &(ctx->aes_key_expanded);
- u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+ u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
+ u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
__be32 counter = cpu_to_be32(1);
memcpy(iv, req->iv, 12);
@@ -1013,7 +1177,8 @@ static int generic_gcmaes_decrypt(struct aead_request *req)
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
void *aes_ctx = &(ctx->aes_key_expanded);
- u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+ u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
+ u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
memcpy(iv, req->iv, 12);
*((__be32 *)(iv+12)) = counter;
@@ -1036,7 +1201,7 @@ static struct aead_alg aesni_aeads[] = { {
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx),
- .cra_alignmask = AESNI_ALIGN - 1,
+ .cra_alignmask = 0,
.cra_module = THIS_MODULE,
},
}, {
@@ -1053,7 +1218,7 @@ static struct aead_alg aesni_aeads[] = { {
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct generic_gcmaes_ctx),
- .cra_alignmask = AESNI_ALIGN - 1,
+ .cra_alignmask = 0,
.cra_module = THIS_MODULE,
},
} };
@@ -1064,7 +1229,7 @@ static struct aead_alg aesni_aeads[0];
static struct simd_aead_alg *aesni_simd_aeads[ARRAY_SIZE(aesni_aeads)];
static const struct x86_cpu_id aesni_cpu_id[] = {
- X86_FEATURE_MATCH(X86_FEATURE_AES),
+ X86_MATCH_FEATURE(X86_FEATURE_AES, NULL),
{}
};
MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
@@ -1076,31 +1241,23 @@ static int __init aesni_init(void)
if (!x86_match_cpu(aesni_cpu_id))
return -ENODEV;
#ifdef CONFIG_X86_64
-#ifdef CONFIG_AS_AVX2
if (boot_cpu_has(X86_FEATURE_AVX2)) {
pr_info("AVX2 version of gcm_enc/dec engaged.\n");
- aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4;
+ static_branch_enable(&gcm_use_avx);
+ static_branch_enable(&gcm_use_avx2);
} else
-#endif
-#ifdef CONFIG_AS_AVX
if (boot_cpu_has(X86_FEATURE_AVX)) {
pr_info("AVX version of gcm_enc/dec engaged.\n");
- aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen2;
- } else
-#endif
- {
+ static_branch_enable(&gcm_use_avx);
+ } else {
pr_info("SSE version of gcm_enc/dec engaged.\n");
- aesni_gcm_tfm = &aesni_gcm_tfm_sse;
}
- aesni_ctr_enc_tfm = aesni_ctr_enc;
-#ifdef CONFIG_AS_AVX
if (boot_cpu_has(X86_FEATURE_AVX)) {
/* optimize performance of ctr mode encryption transform */
- aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm;
+ static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm);
pr_info("AES CTR mode by8 optimization enabled\n");
}
-#endif
-#endif
+#endif /* CONFIG_X86_64 */
err = crypto_register_alg(&aesni_cipher_alg);
if (err)
@@ -1117,8 +1274,22 @@ static int __init aesni_init(void)
if (err)
goto unregister_skciphers;
+#ifdef CONFIG_X86_64
+ if (boot_cpu_has(X86_FEATURE_AVX))
+ err = simd_register_skciphers_compat(&aesni_xctr, 1,
+ &aesni_simd_xctr);
+ if (err)
+ goto unregister_aeads;
+#endif /* CONFIG_X86_64 */
+
return 0;
+#ifdef CONFIG_X86_64
+unregister_aeads:
+ simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
+ aesni_simd_aeads);
+#endif /* CONFIG_X86_64 */
+
unregister_skciphers:
simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
aesni_simd_skciphers);
@@ -1134,6 +1305,10 @@ static void __exit aesni_exit(void)
simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
aesni_simd_skciphers);
crypto_unregister_alg(&aesni_cipher_alg);
+#ifdef CONFIG_X86_64
+ if (boot_cpu_has(X86_FEATURE_AVX))
+ simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
+#endif /* CONFIG_X86_64 */
}
late_initcall(aesni_init);
diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S
new file mode 100644
index 000000000000..c75fd7d015ed
--- /dev/null
+++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S
@@ -0,0 +1,1303 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ARIA Cipher 16-way parallel algorithm (AVX)
+ *
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+/* struct aria_ctx: */
+#define enc_key 0
+#define dec_key 272
+#define rounds 544
+
+/* register macros */
+#define CTX %rdi
+
+
+#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
+ ( (((a0) & 1) << 0) | \
+ (((a1) & 1) << 1) | \
+ (((a2) & 1) << 2) | \
+ (((a3) & 1) << 3) | \
+ (((a4) & 1) << 4) | \
+ (((a5) & 1) << 5) | \
+ (((a6) & 1) << 6) | \
+ (((a7) & 1) << 7) )
+
+#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
+ ( ((l7) << (0 * 8)) | \
+ ((l6) << (1 * 8)) | \
+ ((l5) << (2 * 8)) | \
+ ((l4) << (3 * 8)) | \
+ ((l3) << (4 * 8)) | \
+ ((l2) << (5 * 8)) | \
+ ((l1) << (6 * 8)) | \
+ ((l0) << (7 * 8)) )
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vmovdqu .Lshufb_16x16b, a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(a0, b0, c0, d0, d2, d3); \
+ transpose_4x4(a1, b1, c1, d1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(a2, b2, c2, d2, b0, b1); \
+ transpose_4x4(a3, b3, c3, d3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+#define debyteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vmovdqu .Lshufb_16x16b, a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(c0, d0, a0, b0, d2, d3); \
+ transpose_4x4(c1, d1, a1, b1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(c2, d2, a2, b2, b0, b1); \
+ transpose_4x4(c3, d3, a3, b3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ rio) \
+ vmovdqu (0 * 16)(rio), x0; \
+ vmovdqu (1 * 16)(rio), x1; \
+ vmovdqu (2 * 16)(rio), x2; \
+ vmovdqu (3 * 16)(rio), x3; \
+ vmovdqu (4 * 16)(rio), x4; \
+ vmovdqu (5 * 16)(rio), x5; \
+ vmovdqu (6 * 16)(rio), x6; \
+ vmovdqu (7 * 16)(rio), x7; \
+ vmovdqu (8 * 16)(rio), y0; \
+ vmovdqu (9 * 16)(rio), y1; \
+ vmovdqu (10 * 16)(rio), y2; \
+ vmovdqu (11 * 16)(rio), y3; \
+ vmovdqu (12 * 16)(rio), y4; \
+ vmovdqu (13 * 16)(rio), y5; \
+ vmovdqu (14 * 16)(rio), y6; \
+ vmovdqu (15 * 16)(rio), y7;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_ab, mem_cd) \
+ byteslice_16x16b(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ (mem_ab), (mem_cd)); \
+ \
+ vmovdqu x0, 0 * 16(mem_ab); \
+ vmovdqu x1, 1 * 16(mem_ab); \
+ vmovdqu x2, 2 * 16(mem_ab); \
+ vmovdqu x3, 3 * 16(mem_ab); \
+ vmovdqu x4, 4 * 16(mem_ab); \
+ vmovdqu x5, 5 * 16(mem_ab); \
+ vmovdqu x6, 6 * 16(mem_ab); \
+ vmovdqu x7, 7 * 16(mem_ab); \
+ vmovdqu y0, 0 * 16(mem_cd); \
+ vmovdqu y1, 1 * 16(mem_cd); \
+ vmovdqu y2, 2 * 16(mem_cd); \
+ vmovdqu y3, 3 * 16(mem_cd); \
+ vmovdqu y4, 4 * 16(mem_cd); \
+ vmovdqu y5, 5 * 16(mem_cd); \
+ vmovdqu y6, 6 * 16(mem_cd); \
+ vmovdqu y7, 7 * 16(mem_cd);
+
+#define write_output(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem) \
+ vmovdqu x0, 0 * 16(mem); \
+ vmovdqu x1, 1 * 16(mem); \
+ vmovdqu x2, 2 * 16(mem); \
+ vmovdqu x3, 3 * 16(mem); \
+ vmovdqu x4, 4 * 16(mem); \
+ vmovdqu x5, 5 * 16(mem); \
+ vmovdqu x6, 6 * 16(mem); \
+ vmovdqu x7, 7 * 16(mem); \
+ vmovdqu y0, 8 * 16(mem); \
+ vmovdqu y1, 9 * 16(mem); \
+ vmovdqu y2, 10 * 16(mem); \
+ vmovdqu y3, 11 * 16(mem); \
+ vmovdqu y4, 12 * 16(mem); \
+ vmovdqu y5, 13 * 16(mem); \
+ vmovdqu y6, 14 * 16(mem); \
+ vmovdqu y7, 15 * 16(mem); \
+
+#define aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
+ vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
+ vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
+ vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
+ vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
+ vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
+ vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
+ vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
+
+#define aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
+ vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
+ vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
+ vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
+ vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
+ vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
+ vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
+ vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
+
+#define aria_ark_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, rk, idx, round) \
+ /* AddRoundKey */ \
+ vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
+ vpxor t0, x0, x0; \
+ vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
+ vpxor t0, x1, x1; \
+ vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
+ vpxor t0, x2, x2; \
+ vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
+ vpxor t0, x3, x3; \
+ vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
+ vpxor t0, x4, x4; \
+ vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
+ vpxor t0, x5, x5; \
+ vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
+ vpxor t0, x6, x6; \
+ vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
+ vpxor t0, x7, x7;
+
+#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, t1, t2, t3, \
+ t4, t5, t6, t7) \
+ vpbroadcastq .Ltf_s2_bitmatrix, t0; \
+ vpbroadcastq .Ltf_inv_bitmatrix, t1; \
+ vpbroadcastq .Ltf_id_bitmatrix, t2; \
+ vpbroadcastq .Ltf_aff_bitmatrix, t3; \
+ vpbroadcastq .Ltf_x2_bitmatrix, t4; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
+ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
+ vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
+ vgf2p8affineinvqb $0, t2, x2, x2; \
+ vgf2p8affineinvqb $0, t2, x6, x6; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
+ vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
+ vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
+ vgf2p8affineinvqb $0, t2, x3, x3; \
+ vgf2p8affineinvqb $0, t2, x7, x7
+
+#define aria_sbox_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, t1, t2, t3, \
+ t4, t5, t6, t7) \
+ vpxor t7, t7, t7; \
+ vmovdqa .Linv_shift_row, t0; \
+ vmovdqa .Lshift_row, t1; \
+ vpbroadcastd .L0f0f0f0f, t6; \
+ vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \
+ vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \
+ vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \
+ vmovdqa .Ltf_hi__x2__and__fwd_aff, t5; \
+ \
+ vaesenclast t7, x0, x0; \
+ vaesenclast t7, x4, x4; \
+ vaesenclast t7, x1, x1; \
+ vaesenclast t7, x5, x5; \
+ vaesdeclast t7, x2, x2; \
+ vaesdeclast t7, x6, x6; \
+ \
+ /* AES inverse shift rows */ \
+ vpshufb t0, x0, x0; \
+ vpshufb t0, x4, x4; \
+ vpshufb t0, x1, x1; \
+ vpshufb t0, x5, x5; \
+ vpshufb t1, x3, x3; \
+ vpshufb t1, x7, x7; \
+ vpshufb t1, x2, x2; \
+ vpshufb t1, x6, x6; \
+ \
+ /* affine transformation for S2 */ \
+ filter_8bit(x1, t2, t3, t6, t0); \
+ /* affine transformation for S2 */ \
+ filter_8bit(x5, t2, t3, t6, t0); \
+ \
+ /* affine transformation for X2 */ \
+ filter_8bit(x3, t4, t5, t6, t0); \
+ /* affine transformation for X2 */ \
+ filter_8bit(x7, t4, t5, t6, t0); \
+ vaesdeclast t7, x3, x3; \
+ vaesdeclast t7, x7, x7;
+
+#define aria_diff_m(x0, x1, x2, x3, \
+ t0, t1, t2, t3) \
+ /* T = rotr32(X, 8); */ \
+ /* X ^= T */ \
+ vpxor x0, x3, t0; \
+ vpxor x1, x0, t1; \
+ vpxor x2, x1, t2; \
+ vpxor x3, x2, t3; \
+ /* X = T ^ rotr(X, 16); */ \
+ vpxor t2, x0, x0; \
+ vpxor x1, t3, t3; \
+ vpxor t0, x2, x2; \
+ vpxor t1, x3, x1; \
+ vmovdqu t3, x3;
+
+#define aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7) \
+ /* t1 ^= t2; */ \
+ vpxor y0, x4, x4; \
+ vpxor y1, x5, x5; \
+ vpxor y2, x6, x6; \
+ vpxor y3, x7, x7; \
+ \
+ /* t2 ^= t3; */ \
+ vpxor y4, y0, y0; \
+ vpxor y5, y1, y1; \
+ vpxor y6, y2, y2; \
+ vpxor y7, y3, y3; \
+ \
+ /* t0 ^= t1; */ \
+ vpxor x4, x0, x0; \
+ vpxor x5, x1, x1; \
+ vpxor x6, x2, x2; \
+ vpxor x7, x3, x3; \
+ \
+ /* t3 ^= t1; */ \
+ vpxor x4, y4, y4; \
+ vpxor x5, y5, y5; \
+ vpxor x6, y6, y6; \
+ vpxor x7, y7, y7; \
+ \
+ /* t2 ^= t0; */ \
+ vpxor x0, y0, y0; \
+ vpxor x1, y1, y1; \
+ vpxor x2, y2, y2; \
+ vpxor x3, y3, y3; \
+ \
+ /* t1 ^= t2; */ \
+ vpxor y0, x4, x4; \
+ vpxor y1, x5, x5; \
+ vpxor y2, x6, x6; \
+ vpxor y3, x7, x7;
+
+#define aria_fe(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T3 = ABCD -> BADC \
+ * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
+ * T0 = ABCD -> CDAB \
+ * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
+ * T1 = ABCD -> DCBA \
+ * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
+ */ \
+ aria_diff_word(x2, x3, x0, x1, \
+ x7, x6, x5, x4, \
+ y0, y1, y2, y3, \
+ y5, y4, y7, y6); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_fo(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T1 = ABCD -> BADC \
+ * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
+ * T2 = ABCD -> CDAB \
+ * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
+ * T3 = ABCD -> DCBA \
+ * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
+ */ \
+ aria_diff_word(x0, x1, x2, x3, \
+ x5, x4, x7, x6, \
+ y2, y3, y0, y1, \
+ y7, y6, y5, y4); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_ff(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round, last_round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, last_round); \
+ \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, last_round); \
+ \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8);
+
+#define aria_fe_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T3 = ABCD -> BADC \
+ * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
+ * T0 = ABCD -> CDAB \
+ * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
+ * T1 = ABCD -> DCBA \
+ * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
+ */ \
+ aria_diff_word(x2, x3, x0, x1, \
+ x7, x6, x5, x4, \
+ y0, y1, y2, y3, \
+ y5, y4, y7, y6); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_fo_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T1 = ABCD -> BADC \
+ * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
+ * T2 = ABCD -> CDAB \
+ * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
+ * T3 = ABCD -> DCBA \
+ * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
+ */ \
+ aria_diff_word(x0, x1, x2, x3, \
+ x5, x4, x7, x6, \
+ y2, y3, y0, y1, \
+ y7, y6, y5, y4); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_ff_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round, last_round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, last_round); \
+ \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, last_round); \
+ \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8);
+
+/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+.Lshift_row:
+ .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
+ .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+ .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+
+/* AES inverse affine and S2 combined:
+ * 1 1 0 0 0 0 0 1 x0 0
+ * 0 1 0 0 1 0 0 0 x1 0
+ * 1 1 0 0 1 1 1 1 x2 0
+ * 0 1 1 0 1 0 0 1 x3 1
+ * 0 1 0 0 1 1 0 0 * x4 + 0
+ * 0 1 0 1 1 0 0 0 x5 0
+ * 0 0 0 0 0 1 0 1 x6 0
+ * 1 1 1 0 0 1 1 1 x7 1
+ */
+.Ltf_lo__inv_aff__and__s2:
+ .octa 0x92172DA81A9FA520B2370D883ABF8500
+.Ltf_hi__inv_aff__and__s2:
+ .octa 0x2B15FFC1AF917B45E6D8320C625CB688
+
+/* X2 and AES forward affine combined:
+ * 1 0 1 1 0 0 0 1 x0 0
+ * 0 1 1 1 1 0 1 1 x1 0
+ * 0 0 0 1 1 0 1 0 x2 1
+ * 0 1 0 0 0 1 0 0 x3 0
+ * 0 0 1 1 1 0 1 1 * x4 + 0
+ * 0 1 0 0 1 0 0 0 x5 0
+ * 1 1 0 1 0 0 1 1 x6 0
+ * 0 1 0 0 1 0 1 0 x7 0
+ */
+.Ltf_lo__x2__and__fwd_aff:
+ .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
+.Ltf_hi__x2__and__fwd_aff:
+ .octa 0x3F893781E95FE1576CDA64D2BA0CB204
+
+.section .rodata.cst8, "aM", @progbits, 8
+.align 8
+/* AES affine: */
+#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
+.Ltf_aff_bitmatrix:
+ .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+ BV8(1, 1, 0, 0, 0, 1, 1, 1),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 0, 0, 1),
+ BV8(1, 1, 1, 1, 1, 0, 0, 0),
+ BV8(0, 1, 1, 1, 1, 1, 0, 0),
+ BV8(0, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 1, 1, 1))
+
+/* AES inverse affine: */
+#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
+.Ltf_inv_bitmatrix:
+ .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 0),
+ BV8(0, 1, 0, 0, 1, 0, 0, 1),
+ BV8(1, 0, 1, 0, 0, 1, 0, 0),
+ BV8(0, 1, 0, 1, 0, 0, 1, 0),
+ BV8(0, 0, 1, 0, 1, 0, 0, 1),
+ BV8(1, 0, 0, 1, 0, 1, 0, 0),
+ BV8(0, 1, 0, 0, 1, 0, 1, 0))
+
+/* S2: */
+#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
+.Ltf_s2_bitmatrix:
+ .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+ BV8(0, 0, 1, 1, 1, 1, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 1),
+ BV8(1, 1, 0, 0, 0, 0, 1, 1),
+ BV8(0, 1, 0, 0, 0, 0, 1, 1),
+ BV8(1, 1, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 1, 1, 0))
+
+/* X2: */
+#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
+.Ltf_x2_bitmatrix:
+ .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 1, 1, 0),
+ BV8(0, 0, 0, 0, 1, 0, 1, 0),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 0),
+ BV8(0, 1, 1, 0, 1, 0, 1, 1),
+ BV8(1, 0, 1, 1, 1, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 1))
+
+/* Identity matrix: */
+.Ltf_id_bitmatrix:
+ .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 1, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 1, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 1, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 1, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 1, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+/* 4-bit mask */
+.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
+.align 4
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+.text
+
+SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
+ /* input:
+ * %r9: rk
+ * %rsi: dst
+ * %rdx: src
+ * %xmm0..%xmm15: 16 byte-sliced blocks
+ */
+
+ FRAME_BEGIN
+
+ movq %rsi, %rax;
+ leaq 8 * 16(%rax), %r8;
+
+ inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r8);
+ aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 0);
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 1);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 2);
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 3);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 4);
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 5);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 6);
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 7);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 8);
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 9);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 10);
+ cmpl $12, rounds(CTX);
+ jne .Laria_192;
+ aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 11, 12);
+ jmp .Laria_end;
+.Laria_192:
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 11);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 12);
+ cmpl $14, rounds(CTX);
+ jne .Laria_256;
+ aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 13, 14);
+ jmp .Laria_end;
+.Laria_256:
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 13);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 14);
+ aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 15, 16);
+.Laria_end:
+ debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
+ %xmm9, %xmm13, %xmm0, %xmm5,
+ %xmm10, %xmm14, %xmm3, %xmm6,
+ %xmm11, %xmm15, %xmm2, %xmm7,
+ (%rax), (%r8));
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
+
+SYM_FUNC_START(aria_aesni_avx_encrypt_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq enc_key(CTX), %r9;
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx);
+
+ call __aria_aesni_avx_crypt_16way;
+
+ write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
+
+SYM_FUNC_START(aria_aesni_avx_decrypt_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq dec_key(CTX), %r9;
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx);
+
+ call __aria_aesni_avx_crypt_16way;
+
+ write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
+
+SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+
+ FRAME_BEGIN
+ /* load IV and byteswap */
+ vmovdqu (%r8), %xmm8;
+
+ vmovdqa .Lbswap128_mask (%rip), %xmm1;
+ vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
+
+ vpcmpeqd %xmm0, %xmm0, %xmm0;
+ vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
+
+ /* construct IVs */
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm9;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm10;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm11;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm12;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm13;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm14;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm15;
+ vmovdqu %xmm8, (0 * 16)(%rcx);
+ vmovdqu %xmm9, (1 * 16)(%rcx);
+ vmovdqu %xmm10, (2 * 16)(%rcx);
+ vmovdqu %xmm11, (3 * 16)(%rcx);
+ vmovdqu %xmm12, (4 * 16)(%rcx);
+ vmovdqu %xmm13, (5 * 16)(%rcx);
+ vmovdqu %xmm14, (6 * 16)(%rcx);
+ vmovdqu %xmm15, (7 * 16)(%rcx);
+
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm8;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm9;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm10;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm11;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm12;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm13;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm14;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm15;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm4;
+ vmovdqu %xmm4, (%r8);
+
+ vmovdqu (0 * 16)(%rcx), %xmm0;
+ vmovdqu (1 * 16)(%rcx), %xmm1;
+ vmovdqu (2 * 16)(%rcx), %xmm2;
+ vmovdqu (3 * 16)(%rcx), %xmm3;
+ vmovdqu (4 * 16)(%rcx), %xmm4;
+ vmovdqu (5 * 16)(%rcx), %xmm5;
+ vmovdqu (6 * 16)(%rcx), %xmm6;
+ vmovdqu (7 * 16)(%rcx), %xmm7;
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
+
+SYM_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ call __aria_aesni_avx_ctr_gen_keystream_16way;
+
+ leaq (%rsi), %r10;
+ leaq (%rdx), %r11;
+ leaq (%rcx), %rsi;
+ leaq (%rcx), %rdx;
+ leaq enc_key(CTX), %r9;
+
+ call __aria_aesni_avx_crypt_16way;
+
+ vpxor (0 * 16)(%r11), %xmm1, %xmm1;
+ vpxor (1 * 16)(%r11), %xmm0, %xmm0;
+ vpxor (2 * 16)(%r11), %xmm3, %xmm3;
+ vpxor (3 * 16)(%r11), %xmm2, %xmm2;
+ vpxor (4 * 16)(%r11), %xmm4, %xmm4;
+ vpxor (5 * 16)(%r11), %xmm5, %xmm5;
+ vpxor (6 * 16)(%r11), %xmm6, %xmm6;
+ vpxor (7 * 16)(%r11), %xmm7, %xmm7;
+ vpxor (8 * 16)(%r11), %xmm8, %xmm8;
+ vpxor (9 * 16)(%r11), %xmm9, %xmm9;
+ vpxor (10 * 16)(%r11), %xmm10, %xmm10;
+ vpxor (11 * 16)(%r11), %xmm11, %xmm11;
+ vpxor (12 * 16)(%r11), %xmm12, %xmm12;
+ vpxor (13 * 16)(%r11), %xmm13, %xmm13;
+ vpxor (14 * 16)(%r11), %xmm14, %xmm14;
+ vpxor (15 * 16)(%r11), %xmm15, %xmm15;
+ write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %r10);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
+
+SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
+ /* input:
+ * %r9: rk
+ * %rsi: dst
+ * %rdx: src
+ * %xmm0..%xmm15: 16 byte-sliced blocks
+ */
+
+ FRAME_BEGIN
+
+ movq %rsi, %rax;
+ leaq 8 * 16(%rax), %r8;
+
+ inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r8);
+ aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 0);
+ aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 1);
+ aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
+ %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 2);
+ aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 3);
+ aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
+ %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 4);
+ aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 5);
+ aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
+ %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 6);
+ aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 7);
+ aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
+ %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 8);
+ aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 9);
+ aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
+ %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 10);
+ cmpl $12, rounds(CTX);
+ jne .Laria_gfni_192;
+ aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 11, 12);
+ jmp .Laria_gfni_end;
+.Laria_gfni_192:
+ aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 11);
+ aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
+ %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 12);
+ cmpl $14, rounds(CTX);
+ jne .Laria_gfni_256;
+ aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 13, 14);
+ jmp .Laria_gfni_end;
+.Laria_gfni_256:
+ aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 13);
+ aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
+ %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 14);
+ aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 15, 16);
+.Laria_gfni_end:
+ debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
+ %xmm9, %xmm13, %xmm0, %xmm5,
+ %xmm10, %xmm14, %xmm3, %xmm6,
+ %xmm11, %xmm15, %xmm2, %xmm7,
+ (%rax), (%r8));
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
+
+SYM_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq enc_key(CTX), %r9;
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx);
+
+ call __aria_aesni_avx_gfni_crypt_16way;
+
+ write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
+
+SYM_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq dec_key(CTX), %r9;
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx);
+
+ call __aria_aesni_avx_gfni_crypt_16way;
+
+ write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
+
+SYM_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ call __aria_aesni_avx_ctr_gen_keystream_16way
+
+ leaq (%rsi), %r10;
+ leaq (%rdx), %r11;
+ leaq (%rcx), %rsi;
+ leaq (%rcx), %rdx;
+ leaq enc_key(CTX), %r9;
+
+ call __aria_aesni_avx_gfni_crypt_16way;
+
+ vpxor (0 * 16)(%r11), %xmm1, %xmm1;
+ vpxor (1 * 16)(%r11), %xmm0, %xmm0;
+ vpxor (2 * 16)(%r11), %xmm3, %xmm3;
+ vpxor (3 * 16)(%r11), %xmm2, %xmm2;
+ vpxor (4 * 16)(%r11), %xmm4, %xmm4;
+ vpxor (5 * 16)(%r11), %xmm5, %xmm5;
+ vpxor (6 * 16)(%r11), %xmm6, %xmm6;
+ vpxor (7 * 16)(%r11), %xmm7, %xmm7;
+ vpxor (8 * 16)(%r11), %xmm8, %xmm8;
+ vpxor (9 * 16)(%r11), %xmm9, %xmm9;
+ vpxor (10 * 16)(%r11), %xmm10, %xmm10;
+ vpxor (11 * 16)(%r11), %xmm11, %xmm11;
+ vpxor (12 * 16)(%r11), %xmm12, %xmm12;
+ vpxor (13 * 16)(%r11), %xmm13, %xmm13;
+ vpxor (14 * 16)(%r11), %xmm14, %xmm14;
+ vpxor (15 * 16)(%r11), %xmm15, %xmm15;
+ write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %r10);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
diff --git a/arch/x86/crypto/aria-avx.h b/arch/x86/crypto/aria-avx.h
new file mode 100644
index 000000000000..01e9a01dc157
--- /dev/null
+++ b/arch/x86/crypto/aria-avx.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef ASM_X86_ARIA_AVX_H
+#define ASM_X86_ARIA_AVX_H
+
+#include <linux/types.h>
+
+#define ARIA_AESNI_PARALLEL_BLOCKS 16
+#define ARIA_AESNI_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * 16)
+
+struct aria_avx_ops {
+ void (*aria_encrypt_16way)(const void *ctx, u8 *dst, const u8 *src);
+ void (*aria_decrypt_16way)(const void *ctx, u8 *dst, const u8 *src);
+ void (*aria_ctr_crypt_16way)(const void *ctx, u8 *dst, const u8 *src,
+ u8 *keystream, u8 *iv);
+};
+#endif
diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c
new file mode 100644
index 000000000000..c561ea4fefa5
--- /dev/null
+++ b/arch/x86/crypto/aria_aesni_avx_glue.c
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Glue Code for the AVX/AES-NI/GFNI assembler implementation of the ARIA Cipher
+ *
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/aria.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "ecb_cbc_helpers.h"
+#include "aria-avx.h"
+
+asmlinkage void aria_aesni_avx_encrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx_decrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx_ctr_crypt_16way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+asmlinkage void aria_aesni_avx_gfni_encrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx_gfni_decrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx_gfni_ctr_crypt_16way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+
+static struct aria_avx_ops aria_ops;
+
+static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_encrypt_16way);
+ ECB_BLOCK(1, aria_encrypt);
+ ECB_WALK_END();
+}
+
+static int ecb_do_decrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_decrypt_16way);
+ ECB_BLOCK(1, aria_decrypt);
+ ECB_WALK_END();
+}
+
+static int aria_avx_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_encrypt(req, ctx->enc_key[0]);
+}
+
+static int aria_avx_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_decrypt(req, ctx->dec_key[0]);
+}
+
+static int aria_avx_set_key(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return aria_set_key(&tfm->base, key, keylen);
+}
+
+static int aria_avx_ctr_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= ARIA_AESNI_PARALLEL_BLOCK_SIZE) {
+ u8 keystream[ARIA_AESNI_PARALLEL_BLOCK_SIZE];
+
+ kernel_fpu_begin();
+ aria_ops.aria_ctr_crypt_16way(ctx, dst, src, keystream,
+ walk.iv);
+ kernel_fpu_end();
+ dst += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ src += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ nbytes -= ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ }
+
+ while (nbytes >= ARIA_BLOCK_SIZE) {
+ u8 keystream[ARIA_BLOCK_SIZE];
+
+ memcpy(keystream, walk.iv, ARIA_BLOCK_SIZE);
+ crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
+
+ aria_encrypt(ctx, keystream, keystream);
+
+ crypto_xor_cpy(dst, src, keystream, ARIA_BLOCK_SIZE);
+ dst += ARIA_BLOCK_SIZE;
+ src += ARIA_BLOCK_SIZE;
+ nbytes -= ARIA_BLOCK_SIZE;
+ }
+
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[ARIA_BLOCK_SIZE];
+
+ memcpy(keystream, walk.iv, ARIA_BLOCK_SIZE);
+ crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
+
+ aria_encrypt(ctx, keystream, keystream);
+
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ dst += nbytes;
+ src += nbytes;
+ nbytes = 0;
+ }
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static struct skcipher_alg aria_algs[] = {
+ {
+ .base.cra_name = "__ecb(aria)",
+ .base.cra_driver_name = "__ecb-aria-avx",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = ARIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct aria_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = ARIA_MIN_KEY_SIZE,
+ .max_keysize = ARIA_MAX_KEY_SIZE,
+ .setkey = aria_avx_set_key,
+ .encrypt = aria_avx_ecb_encrypt,
+ .decrypt = aria_avx_ecb_decrypt,
+ }, {
+ .base.cra_name = "__ctr(aria)",
+ .base.cra_driver_name = "__ctr-aria-avx",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct aria_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = ARIA_MIN_KEY_SIZE,
+ .max_keysize = ARIA_MAX_KEY_SIZE,
+ .ivsize = ARIA_BLOCK_SIZE,
+ .chunksize = ARIA_BLOCK_SIZE,
+ .walksize = 16 * ARIA_BLOCK_SIZE,
+ .setkey = aria_avx_set_key,
+ .encrypt = aria_avx_ctr_encrypt,
+ .decrypt = aria_avx_ctr_encrypt,
+ }
+};
+
+static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)];
+
+static int __init aria_avx_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_GFNI)) {
+ aria_ops.aria_encrypt_16way = aria_aesni_avx_gfni_encrypt_16way;
+ aria_ops.aria_decrypt_16way = aria_aesni_avx_gfni_decrypt_16way;
+ aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_gfni_ctr_crypt_16way;
+ } else {
+ aria_ops.aria_encrypt_16way = aria_aesni_avx_encrypt_16way;
+ aria_ops.aria_decrypt_16way = aria_aesni_avx_decrypt_16way;
+ aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_ctr_crypt_16way;
+ }
+
+ return simd_register_skciphers_compat(aria_algs,
+ ARRAY_SIZE(aria_algs),
+ aria_simd_algs);
+}
+
+static void __exit aria_avx_exit(void)
+{
+ simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs),
+ aria_simd_algs);
+}
+
+module_init(aria_avx_init);
+module_exit(aria_avx_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Taehee Yoo <ap420073@gmail.com>");
+MODULE_DESCRIPTION("ARIA Cipher Algorithm, AVX/AES-NI/GFNI optimized");
+MODULE_ALIAS_CRYPTO("aria");
+MODULE_ALIAS_CRYPTO("aria-aesni-avx");
diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S
index 24910b766bdd..b50b35ff1fdb 100644
--- a/arch/x86/crypto/blake2s-core.S
+++ b/arch/x86/crypto/blake2s-core.S
@@ -46,7 +46,6 @@ SIGMA2:
#endif /* CONFIG_AS_AVX512 */
.text
-#ifdef CONFIG_AS_SSSE3
SYM_FUNC_START(blake2s_compress_ssse3)
testq %rdx,%rdx
je .Lendofloop
@@ -172,9 +171,8 @@ SYM_FUNC_START(blake2s_compress_ssse3)
movdqu %xmm1,0x10(%rdi)
movdqu %xmm14,0x20(%rdi)
.Lendofloop:
- ret
+ RET
SYM_FUNC_END(blake2s_compress_ssse3)
-#endif /* CONFIG_AS_SSSE3 */
#ifdef CONFIG_AS_AVX512
SYM_FUNC_START(blake2s_compress_avx512)
@@ -253,6 +251,6 @@ SYM_FUNC_START(blake2s_compress_avx512)
vmovdqu %xmm1,0x10(%rdi)
vmovdqu %xmm4,0x20(%rdi)
vzeroupper
- retq
+ RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index 06ef2d4a4701..aaba21230528 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -4,13 +4,12 @@
*/
#include <crypto/internal/blake2s.h>
-#include <crypto/internal/simd.h>
-#include <crypto/internal/hash.h>
#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/sizes.h>
#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
@@ -27,21 +26,20 @@ asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
-void blake2s_compress_arch(struct blake2s_state *state,
- const u8 *block, size_t nblocks,
- const u32 inc)
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+ size_t nblocks, const u32 inc)
{
/* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8);
+ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
- if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
+ if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
blake2s_compress_generic(state, block, nblocks, inc);
return;
}
- for (;;) {
+ do {
const size_t blocks = min_t(size_t, nblocks,
- PAGE_SIZE / BLAKE2S_BLOCK_SIZE);
+ SZ_4K / BLAKE2S_BLOCK_SIZE);
kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) &&
@@ -52,152 +50,15 @@ void blake2s_compress_arch(struct blake2s_state *state,
kernel_fpu_end();
nblocks -= blocks;
- if (!nblocks)
- break;
block += blocks * BLAKE2S_BLOCK_SIZE;
- }
-}
-EXPORT_SYMBOL(blake2s_compress_arch);
-
-static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
-
- if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE)
- return -EINVAL;
-
- memcpy(tctx->key, key, keylen);
- tctx->keylen = keylen;
-
- return 0;
-}
-
-static int crypto_blake2s_init(struct shash_desc *desc)
-{
- struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
- struct blake2s_state *state = shash_desc_ctx(desc);
- const int outlen = crypto_shash_digestsize(desc->tfm);
-
- if (tctx->keylen)
- blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
- else
- blake2s_init(state, outlen);
-
- return 0;
-}
-
-static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
- unsigned int inlen)
-{
- struct blake2s_state *state = shash_desc_ctx(desc);
- const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
-
- if (unlikely(!inlen))
- return 0;
- if (inlen > fill) {
- memcpy(state->buf + state->buflen, in, fill);
- blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
- state->buflen = 0;
- in += fill;
- inlen -= fill;
- }
- if (inlen > BLAKE2S_BLOCK_SIZE) {
- const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
- /* Hash one less (full) block than strictly possible */
- blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
- in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
- inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
- }
- memcpy(state->buf + state->buflen, in, inlen);
- state->buflen += inlen;
-
- return 0;
-}
-
-static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
-{
- struct blake2s_state *state = shash_desc_ctx(desc);
-
- blake2s_set_lastblock(state);
- memset(state->buf + state->buflen, 0,
- BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
- blake2s_compress_arch(state, state->buf, 1, state->buflen);
- cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
- memcpy(out, state->h, state->outlen);
- memzero_explicit(state, sizeof(*state));
-
- return 0;
+ } while (nblocks);
}
-
-static struct shash_alg blake2s_algs[] = {{
- .base.cra_name = "blake2s-128",
- .base.cra_driver_name = "blake2s-128-x86",
- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
- .base.cra_priority = 200,
- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-
- .digestsize = BLAKE2S_128_HASH_SIZE,
- .setkey = crypto_blake2s_setkey,
- .init = crypto_blake2s_init,
- .update = crypto_blake2s_update,
- .final = crypto_blake2s_final,
- .descsize = sizeof(struct blake2s_state),
-}, {
- .base.cra_name = "blake2s-160",
- .base.cra_driver_name = "blake2s-160-x86",
- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
- .base.cra_priority = 200,
- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-
- .digestsize = BLAKE2S_160_HASH_SIZE,
- .setkey = crypto_blake2s_setkey,
- .init = crypto_blake2s_init,
- .update = crypto_blake2s_update,
- .final = crypto_blake2s_final,
- .descsize = sizeof(struct blake2s_state),
-}, {
- .base.cra_name = "blake2s-224",
- .base.cra_driver_name = "blake2s-224-x86",
- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
- .base.cra_priority = 200,
- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-
- .digestsize = BLAKE2S_224_HASH_SIZE,
- .setkey = crypto_blake2s_setkey,
- .init = crypto_blake2s_init,
- .update = crypto_blake2s_update,
- .final = crypto_blake2s_final,
- .descsize = sizeof(struct blake2s_state),
-}, {
- .base.cra_name = "blake2s-256",
- .base.cra_driver_name = "blake2s-256-x86",
- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
- .base.cra_priority = 200,
- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-
- .digestsize = BLAKE2S_256_HASH_SIZE,
- .setkey = crypto_blake2s_setkey,
- .init = crypto_blake2s_init,
- .update = crypto_blake2s_update,
- .final = crypto_blake2s_final,
- .descsize = sizeof(struct blake2s_state),
-}};
+EXPORT_SYMBOL(blake2s_compress);
static int __init blake2s_mod_init(void)
{
- if (!boot_cpu_has(X86_FEATURE_SSSE3))
- return 0;
-
- static_branch_enable(&blake2s_use_ssse3);
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ static_branch_enable(&blake2s_use_ssse3);
if (IS_ENABLED(CONFIG_AS_AVX512) &&
boot_cpu_has(X86_FEATURE_AVX) &&
@@ -208,26 +69,9 @@ static int __init blake2s_mod_init(void)
XFEATURE_MASK_AVX512, NULL))
static_branch_enable(&blake2s_use_avx512);
- return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
- crypto_register_shashes(blake2s_algs,
- ARRAY_SIZE(blake2s_algs)) : 0;
-}
-
-static void __exit blake2s_mod_exit(void)
-{
- if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
- crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+ return 0;
}
module_init(blake2s_mod_init);
-module_exit(blake2s_mod_exit);
-MODULE_ALIAS_CRYPTO("blake2s-128");
-MODULE_ALIAS_CRYPTO("blake2s-128-x86");
-MODULE_ALIAS_CRYPTO("blake2s-160");
-MODULE_ALIAS_CRYPTO("blake2s-160-x86");
-MODULE_ALIAS_CRYPTO("blake2s-224");
-MODULE_ALIAS_CRYPTO("blake2s-224-x86");
-MODULE_ALIAS_CRYPTO("blake2s-256");
-MODULE_ALIAS_CRYPTO("blake2s-256-x86");
MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 4222ac6d6584..4a43e072d2d1 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -6,6 +6,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
.file "blowfish-x86_64-asm.S"
.text
@@ -135,13 +136,13 @@ SYM_FUNC_START(__blowfish_enc_blk)
jnz .L__enc_xor;
write_block();
- ret;
+ RET;
.L__enc_xor:
xor_block();
- ret;
+ RET;
SYM_FUNC_END(__blowfish_enc_blk)
-SYM_FUNC_START(blowfish_dec_blk)
+SYM_TYPED_FUNC_START(blowfish_dec_blk)
/* input:
* %rdi: ctx
* %rsi: dst
@@ -170,7 +171,7 @@ SYM_FUNC_START(blowfish_dec_blk)
movq %r11, %r12;
- ret;
+ RET;
SYM_FUNC_END(blowfish_dec_blk)
/**********************************************************************
@@ -322,17 +323,17 @@ SYM_FUNC_START(__blowfish_enc_blk_4way)
popq %rbx;
popq %r12;
- ret;
+ RET;
.L__enc_xor4:
xor_block4();
popq %rbx;
popq %r12;
- ret;
+ RET;
SYM_FUNC_END(__blowfish_enc_blk_4way)
-SYM_FUNC_START(blowfish_dec_blk_4way)
+SYM_TYPED_FUNC_START(blowfish_dec_blk_4way)
/* input:
* %rdi: ctx
* %rsi: dst
@@ -364,5 +365,5 @@ SYM_FUNC_START(blowfish_dec_blk_4way)
popq %rbx;
popq %r12;
- ret;
+ RET;
SYM_FUNC_END(blowfish_dec_blk_4way)
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index cedfdba69ce3..019c64c1340a 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -6,8 +6,6 @@
*
* CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
* Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- * CTR part based on code (crypto/ctr.c) by:
- * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
*/
#include <crypto/algapi.h>
@@ -34,24 +32,12 @@ static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
__blowfish_enc_blk(ctx, dst, src, false);
}
-static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- __blowfish_enc_blk(ctx, dst, src, true);
-}
-
static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
const u8 *src)
{
__blowfish_enc_blk_4way(ctx, dst, src, false);
}
-static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- __blowfish_enc_blk_4way(ctx, dst, src, true);
-}
-
static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
blowfish_enc_blk(crypto_tfm_ctx(tfm), dst, src);
@@ -158,7 +144,7 @@ static int cbc_encrypt(struct skcipher_request *req)
err = skcipher_walk_virt(&walk, req, false);
- while ((nbytes = walk.nbytes)) {
+ while (walk.nbytes) {
nbytes = __cbc_encrypt(ctx, &walk);
err = skcipher_walk_done(&walk, nbytes);
}
@@ -239,7 +225,7 @@ static int cbc_decrypt(struct skcipher_request *req)
err = skcipher_walk_virt(&walk, req, false);
- while ((nbytes = walk.nbytes)) {
+ while (walk.nbytes) {
nbytes = __cbc_decrypt(ctx, &walk);
err = skcipher_walk_done(&walk, nbytes);
}
@@ -247,97 +233,6 @@ static int cbc_decrypt(struct skcipher_request *req)
return err;
}
-static void ctr_crypt_final(struct bf_ctx *ctx, struct skcipher_walk *walk)
-{
- u8 *ctrblk = walk->iv;
- u8 keystream[BF_BLOCK_SIZE];
- u8 *src = walk->src.virt.addr;
- u8 *dst = walk->dst.virt.addr;
- unsigned int nbytes = walk->nbytes;
-
- blowfish_enc_blk(ctx, keystream, ctrblk);
- crypto_xor_cpy(dst, keystream, src, nbytes);
-
- crypto_inc(ctrblk, BF_BLOCK_SIZE);
-}
-
-static unsigned int __ctr_crypt(struct bf_ctx *ctx, struct skcipher_walk *walk)
-{
- unsigned int bsize = BF_BLOCK_SIZE;
- unsigned int nbytes = walk->nbytes;
- u64 *src = (u64 *)walk->src.virt.addr;
- u64 *dst = (u64 *)walk->dst.virt.addr;
- u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
- __be64 ctrblocks[4];
-
- /* Process four block batch */
- if (nbytes >= bsize * 4) {
- do {
- if (dst != src) {
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- }
-
- /* create ctrblks for parallel encrypt */
- ctrblocks[0] = cpu_to_be64(ctrblk++);
- ctrblocks[1] = cpu_to_be64(ctrblk++);
- ctrblocks[2] = cpu_to_be64(ctrblk++);
- ctrblocks[3] = cpu_to_be64(ctrblk++);
-
- blowfish_enc_blk_xor_4way(ctx, (u8 *)dst,
- (u8 *)ctrblocks);
-
- src += 4;
- dst += 4;
- } while ((nbytes -= bsize * 4) >= bsize * 4);
-
- if (nbytes < bsize)
- goto done;
- }
-
- /* Handle leftovers */
- do {
- if (dst != src)
- *dst = *src;
-
- ctrblocks[0] = cpu_to_be64(ctrblk++);
-
- blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks);
-
- src += 1;
- dst += 1;
- } while ((nbytes -= bsize) >= bsize);
-
-done:
- *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
- return nbytes;
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) {
- nbytes = __ctr_crypt(ctx, &walk);
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- if (nbytes) {
- ctr_crypt_final(ctx, &walk);
- err = skcipher_walk_done(&walk, 0);
- }
-
- return err;
-}
-
static struct crypto_alg bf_cipher_alg = {
.cra_name = "blowfish",
.cra_driver_name = "blowfish-asm",
@@ -384,20 +279,6 @@ static struct skcipher_alg bf_skcipher_algs[] = {
.setkey = blowfish_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "ctr(blowfish)",
- .base.cra_driver_name = "ctr-blowfish-asm",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct bf_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = BF_MIN_KEY_SIZE,
- .max_keysize = BF_MAX_KEY_SIZE,
- .ivsize = BF_BLOCK_SIZE,
- .chunksize = BF_BLOCK_SIZE,
- .setkey = blowfish_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
},
};
@@ -422,7 +303,7 @@ static int force;
module_param(force, int, 0);
MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
-static int __init init(void)
+static int __init blowfish_init(void)
{
int err;
@@ -446,15 +327,15 @@ static int __init init(void)
return err;
}
-static void __exit fini(void)
+static void __exit blowfish_fini(void)
{
crypto_unregister_alg(&bf_cipher_alg);
crypto_unregister_skciphers(bf_skcipher_algs,
ARRAY_SIZE(bf_skcipher_algs));
}
-module_init(init);
-module_exit(fini);
+module_init(blowfish_init);
+module_exit(blowfish_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Blowfish Cipher Algorithm, asm optimized");
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index d01ddd73de65..2e1658ddbe1a 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -17,7 +17,6 @@
#include <linux/linkage.h>
#include <asm/frame.h>
-#include <asm/nospec-branch.h>
#define CAMELLIA_TABLE_BYTE_LEN 272
@@ -193,7 +192,7 @@ SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_c
roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
%rcx, (%r9));
- ret;
+ RET;
SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
.align 8
@@ -201,7 +200,7 @@ SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_a
roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
%xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
%rax, (%r9));
- ret;
+ RET;
SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
/*
@@ -589,14 +588,6 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
.long 0x80808080
.long 0x80808080
-/* For CTR-mode IV byteswap */
-.Lbswap128_mask:
- .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-
-/* For XTS mode IV generation */
-.Lxts_gf128mul_and_shl1_mask:
- .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
-
/*
* pre-SubByte transform
*
@@ -787,7 +778,7 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk16)
%xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
FRAME_END
- ret;
+ RET;
.align 8
.Lenc_max32:
@@ -874,7 +865,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk16)
%xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
FRAME_END
- ret;
+ RET;
.align 8
.Ldec_max32:
@@ -915,7 +906,7 @@ SYM_FUNC_START(camellia_ecb_enc_16way)
%xmm8, %rsi);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_ecb_enc_16way)
SYM_FUNC_START(camellia_ecb_dec_16way)
@@ -945,7 +936,7 @@ SYM_FUNC_START(camellia_ecb_dec_16way)
%xmm8, %rsi);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_ecb_dec_16way)
SYM_FUNC_START(camellia_cbc_dec_16way)
@@ -996,294 +987,5 @@ SYM_FUNC_START(camellia_cbc_dec_16way)
%xmm8, %rsi);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_cbc_dec_16way)
-
-#define inc_le128(x, minus_one, tmp) \
- vpcmpeqq minus_one, x, tmp; \
- vpsubq minus_one, x, x; \
- vpslldq $8, tmp, tmp; \
- vpsubq tmp, x, x;
-
-SYM_FUNC_START(camellia_ctr_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv (little endian, 128bit)
- */
- FRAME_BEGIN
-
- subq $(16 * 16), %rsp;
- movq %rsp, %rax;
-
- vmovdqa .Lbswap128_mask, %xmm14;
-
- /* load IV and byteswap */
- vmovdqu (%rcx), %xmm0;
- vpshufb %xmm14, %xmm0, %xmm15;
- vmovdqu %xmm15, 15 * 16(%rax);
-
- vpcmpeqd %xmm15, %xmm15, %xmm15;
- vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */
-
- /* construct IVs */
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm13;
- vmovdqu %xmm13, 14 * 16(%rax);
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm13;
- vmovdqu %xmm13, 13 * 16(%rax);
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm12;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm11;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm10;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm9;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm8;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm7;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm6;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm5;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm4;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm3;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm2;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm1;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vmovdqa %xmm0, %xmm13;
- vpshufb %xmm14, %xmm0, %xmm0;
- inc_le128(%xmm13, %xmm15, %xmm14);
- vmovdqu %xmm13, (%rcx);
-
- /* inpack16_pre: */
- vmovq (key_table)(CTX), %xmm15;
- vpshufb .Lpack_bswap, %xmm15, %xmm15;
- vpxor %xmm0, %xmm15, %xmm0;
- vpxor %xmm1, %xmm15, %xmm1;
- vpxor %xmm2, %xmm15, %xmm2;
- vpxor %xmm3, %xmm15, %xmm3;
- vpxor %xmm4, %xmm15, %xmm4;
- vpxor %xmm5, %xmm15, %xmm5;
- vpxor %xmm6, %xmm15, %xmm6;
- vpxor %xmm7, %xmm15, %xmm7;
- vpxor %xmm8, %xmm15, %xmm8;
- vpxor %xmm9, %xmm15, %xmm9;
- vpxor %xmm10, %xmm15, %xmm10;
- vpxor %xmm11, %xmm15, %xmm11;
- vpxor %xmm12, %xmm15, %xmm12;
- vpxor 13 * 16(%rax), %xmm15, %xmm13;
- vpxor 14 * 16(%rax), %xmm15, %xmm14;
- vpxor 15 * 16(%rax), %xmm15, %xmm15;
-
- call __camellia_enc_blk16;
-
- addq $(16 * 16), %rsp;
-
- vpxor 0 * 16(%rdx), %xmm7, %xmm7;
- vpxor 1 * 16(%rdx), %xmm6, %xmm6;
- vpxor 2 * 16(%rdx), %xmm5, %xmm5;
- vpxor 3 * 16(%rdx), %xmm4, %xmm4;
- vpxor 4 * 16(%rdx), %xmm3, %xmm3;
- vpxor 5 * 16(%rdx), %xmm2, %xmm2;
- vpxor 6 * 16(%rdx), %xmm1, %xmm1;
- vpxor 7 * 16(%rdx), %xmm0, %xmm0;
- vpxor 8 * 16(%rdx), %xmm15, %xmm15;
- vpxor 9 * 16(%rdx), %xmm14, %xmm14;
- vpxor 10 * 16(%rdx), %xmm13, %xmm13;
- vpxor 11 * 16(%rdx), %xmm12, %xmm12;
- vpxor 12 * 16(%rdx), %xmm11, %xmm11;
- vpxor 13 * 16(%rdx), %xmm10, %xmm10;
- vpxor 14 * 16(%rdx), %xmm9, %xmm9;
- vpxor 15 * 16(%rdx), %xmm8, %xmm8;
- write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
- %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
- %xmm8, %rsi);
-
- FRAME_END
- ret;
-SYM_FUNC_END(camellia_ctr_16way)
-
-#define gf128mul_x_ble(iv, mask, tmp) \
- vpsrad $31, iv, tmp; \
- vpaddq iv, iv, iv; \
- vpshufd $0x13, tmp, tmp; \
- vpand mask, tmp, tmp; \
- vpxor tmp, iv, iv;
-
-.align 8
-SYM_FUNC_START_LOCAL(camellia_xts_crypt_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- * %r8: index for input whitening key
- * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
- */
- FRAME_BEGIN
-
- subq $(16 * 16), %rsp;
- movq %rsp, %rax;
-
- vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;
-
- /* load IV */
- vmovdqu (%rcx), %xmm0;
- vpxor 0 * 16(%rdx), %xmm0, %xmm15;
- vmovdqu %xmm15, 15 * 16(%rax);
- vmovdqu %xmm0, 0 * 16(%rsi);
-
- /* construct IVs */
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 1 * 16(%rdx), %xmm0, %xmm15;
- vmovdqu %xmm15, 14 * 16(%rax);
- vmovdqu %xmm0, 1 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 2 * 16(%rdx), %xmm0, %xmm13;
- vmovdqu %xmm0, 2 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 3 * 16(%rdx), %xmm0, %xmm12;
- vmovdqu %xmm0, 3 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 4 * 16(%rdx), %xmm0, %xmm11;
- vmovdqu %xmm0, 4 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 5 * 16(%rdx), %xmm0, %xmm10;
- vmovdqu %xmm0, 5 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 6 * 16(%rdx), %xmm0, %xmm9;
- vmovdqu %xmm0, 6 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 7 * 16(%rdx), %xmm0, %xmm8;
- vmovdqu %xmm0, 7 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 8 * 16(%rdx), %xmm0, %xmm7;
- vmovdqu %xmm0, 8 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 9 * 16(%rdx), %xmm0, %xmm6;
- vmovdqu %xmm0, 9 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 10 * 16(%rdx), %xmm0, %xmm5;
- vmovdqu %xmm0, 10 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 11 * 16(%rdx), %xmm0, %xmm4;
- vmovdqu %xmm0, 11 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 12 * 16(%rdx), %xmm0, %xmm3;
- vmovdqu %xmm0, 12 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 13 * 16(%rdx), %xmm0, %xmm2;
- vmovdqu %xmm0, 13 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 14 * 16(%rdx), %xmm0, %xmm1;
- vmovdqu %xmm0, 14 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 15 * 16(%rdx), %xmm0, %xmm15;
- vmovdqu %xmm15, 0 * 16(%rax);
- vmovdqu %xmm0, 15 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vmovdqu %xmm0, (%rcx);
-
- /* inpack16_pre: */
- vmovq (key_table)(CTX, %r8, 8), %xmm15;
- vpshufb .Lpack_bswap, %xmm15, %xmm15;
- vpxor 0 * 16(%rax), %xmm15, %xmm0;
- vpxor %xmm1, %xmm15, %xmm1;
- vpxor %xmm2, %xmm15, %xmm2;
- vpxor %xmm3, %xmm15, %xmm3;
- vpxor %xmm4, %xmm15, %xmm4;
- vpxor %xmm5, %xmm15, %xmm5;
- vpxor %xmm6, %xmm15, %xmm6;
- vpxor %xmm7, %xmm15, %xmm7;
- vpxor %xmm8, %xmm15, %xmm8;
- vpxor %xmm9, %xmm15, %xmm9;
- vpxor %xmm10, %xmm15, %xmm10;
- vpxor %xmm11, %xmm15, %xmm11;
- vpxor %xmm12, %xmm15, %xmm12;
- vpxor %xmm13, %xmm15, %xmm13;
- vpxor 14 * 16(%rax), %xmm15, %xmm14;
- vpxor 15 * 16(%rax), %xmm15, %xmm15;
-
- CALL_NOSPEC %r9;
-
- addq $(16 * 16), %rsp;
-
- vpxor 0 * 16(%rsi), %xmm7, %xmm7;
- vpxor 1 * 16(%rsi), %xmm6, %xmm6;
- vpxor 2 * 16(%rsi), %xmm5, %xmm5;
- vpxor 3 * 16(%rsi), %xmm4, %xmm4;
- vpxor 4 * 16(%rsi), %xmm3, %xmm3;
- vpxor 5 * 16(%rsi), %xmm2, %xmm2;
- vpxor 6 * 16(%rsi), %xmm1, %xmm1;
- vpxor 7 * 16(%rsi), %xmm0, %xmm0;
- vpxor 8 * 16(%rsi), %xmm15, %xmm15;
- vpxor 9 * 16(%rsi), %xmm14, %xmm14;
- vpxor 10 * 16(%rsi), %xmm13, %xmm13;
- vpxor 11 * 16(%rsi), %xmm12, %xmm12;
- vpxor 12 * 16(%rsi), %xmm11, %xmm11;
- vpxor 13 * 16(%rsi), %xmm10, %xmm10;
- vpxor 14 * 16(%rsi), %xmm9, %xmm9;
- vpxor 15 * 16(%rsi), %xmm8, %xmm8;
- write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
- %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
- %xmm8, %rsi);
-
- FRAME_END
- ret;
-SYM_FUNC_END(camellia_xts_crypt_16way)
-
-SYM_FUNC_START(camellia_xts_enc_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- xorl %r8d, %r8d; /* input whitening key, 0 for enc */
-
- leaq __camellia_enc_blk16, %r9;
-
- jmp camellia_xts_crypt_16way;
-SYM_FUNC_END(camellia_xts_enc_16way)
-
-SYM_FUNC_START(camellia_xts_dec_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
-
- cmpl $16, key_length(CTX);
- movl $32, %r8d;
- movl $24, %eax;
- cmovel %eax, %r8d; /* input whitening key, last for dec */
-
- leaq __camellia_dec_blk16, %r9;
-
- jmp camellia_xts_crypt_16way;
-SYM_FUNC_END(camellia_xts_dec_16way)
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index 563ef6e83cdd..0e4e9abbf4de 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -7,7 +7,6 @@
#include <linux/linkage.h>
#include <asm/frame.h>
-#include <asm/nospec-branch.h>
#define CAMELLIA_TABLE_BYTE_LEN 272
@@ -227,7 +226,7 @@ SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_c
roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
%rcx, (%r9));
- ret;
+ RET;
SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
.align 8
@@ -235,7 +234,7 @@ SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_a
roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
%ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
%rax, (%r9));
- ret;
+ RET;
SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
/*
@@ -625,16 +624,6 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
.section .rodata.cst16, "aM", @progbits, 16
.align 16
-/* For CTR-mode IV byteswap */
-.Lbswap128_mask:
- .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-
-/* For XTS mode */
-.Lxts_gf128mul_and_shl1_mask_0:
- .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
-.Lxts_gf128mul_and_shl1_mask_1:
- .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
-
/*
* pre-SubByte transform
*
@@ -825,7 +814,7 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
%ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
FRAME_END
- ret;
+ RET;
.align 8
.Lenc_max32:
@@ -912,7 +901,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
%ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
FRAME_END
- ret;
+ RET;
.align 8
.Ldec_max32:
@@ -957,7 +946,7 @@ SYM_FUNC_START(camellia_ecb_enc_32way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_ecb_enc_32way)
SYM_FUNC_START(camellia_ecb_dec_32way)
@@ -991,7 +980,7 @@ SYM_FUNC_START(camellia_ecb_dec_32way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_ecb_dec_32way)
SYM_FUNC_START(camellia_cbc_dec_32way)
@@ -1001,6 +990,7 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
* %rdx: src (32 blocks)
*/
FRAME_BEGIN
+ subq $(16 * 32), %rsp;
vzeroupper;
@@ -1013,7 +1003,6 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, %rdx, (key_table)(CTX, %r8, 8));
- movq %rsp, %r10;
cmpq %rsi, %rdx;
je .Lcbc_dec_use_stack;
@@ -1026,7 +1015,6 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
* dst still in-use (because dst == src), so use stack for temporary
* storage.
*/
- subq $(16 * 32), %rsp;
movq %rsp, %rax;
.Lcbc_dec_continue:
@@ -1036,7 +1024,6 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
vpxor %ymm7, %ymm7, %ymm7;
vinserti128 $1, (%rdx), %ymm7, %ymm7;
vpxor (%rax), %ymm7, %ymm7;
- movq %r10, %rsp;
vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
@@ -1058,346 +1045,7 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
vzeroupper;
- FRAME_END
- ret;
-SYM_FUNC_END(camellia_cbc_dec_32way)
-
-#define inc_le128(x, minus_one, tmp) \
- vpcmpeqq minus_one, x, tmp; \
- vpsubq minus_one, x, x; \
- vpslldq $8, tmp, tmp; \
- vpsubq tmp, x, x;
-
-#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
- vpcmpeqq minus_one, x, tmp1; \
- vpcmpeqq minus_two, x, tmp2; \
- vpsubq minus_two, x, x; \
- vpor tmp2, tmp1, tmp1; \
- vpslldq $8, tmp1, tmp1; \
- vpsubq tmp1, x, x;
-
-SYM_FUNC_START(camellia_ctr_32way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (32 blocks)
- * %rdx: src (32 blocks)
- * %rcx: iv (little endian, 128bit)
- */
- FRAME_BEGIN
-
- vzeroupper;
-
- movq %rsp, %r10;
- cmpq %rsi, %rdx;
- je .Lctr_use_stack;
-
- /* dst can be used as temporary storage, src is not overwritten. */
- movq %rsi, %rax;
- jmp .Lctr_continue;
-
-.Lctr_use_stack:
- subq $(16 * 32), %rsp;
- movq %rsp, %rax;
-
-.Lctr_continue:
- vpcmpeqd %ymm15, %ymm15, %ymm15;
- vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
- vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
-
- /* load IV and byteswap */
- vmovdqu (%rcx), %xmm0;
- vmovdqa %xmm0, %xmm1;
- inc_le128(%xmm0, %xmm15, %xmm14);
- vbroadcasti128 .Lbswap128_mask, %ymm14;
- vinserti128 $1, %xmm0, %ymm1, %ymm0;
- vpshufb %ymm14, %ymm0, %ymm13;
- vmovdqu %ymm13, 15 * 32(%rax);
-
- /* construct IVs */
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
- vpshufb %ymm14, %ymm0, %ymm13;
- vmovdqu %ymm13, 14 * 32(%rax);
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm13;
- vmovdqu %ymm13, 13 * 32(%rax);
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm13;
- vmovdqu %ymm13, 12 * 32(%rax);
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm13;
- vmovdqu %ymm13, 11 * 32(%rax);
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm10;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm9;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm8;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm7;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm6;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm5;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm4;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm3;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm2;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm1;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vextracti128 $1, %ymm0, %xmm13;
- vpshufb %ymm14, %ymm0, %ymm0;
- inc_le128(%xmm13, %xmm15, %xmm14);
- vmovdqu %xmm13, (%rcx);
-
- /* inpack32_pre: */
- vpbroadcastq (key_table)(CTX), %ymm15;
- vpshufb .Lpack_bswap, %ymm15, %ymm15;
- vpxor %ymm0, %ymm15, %ymm0;
- vpxor %ymm1, %ymm15, %ymm1;
- vpxor %ymm2, %ymm15, %ymm2;
- vpxor %ymm3, %ymm15, %ymm3;
- vpxor %ymm4, %ymm15, %ymm4;
- vpxor %ymm5, %ymm15, %ymm5;
- vpxor %ymm6, %ymm15, %ymm6;
- vpxor %ymm7, %ymm15, %ymm7;
- vpxor %ymm8, %ymm15, %ymm8;
- vpxor %ymm9, %ymm15, %ymm9;
- vpxor %ymm10, %ymm15, %ymm10;
- vpxor 11 * 32(%rax), %ymm15, %ymm11;
- vpxor 12 * 32(%rax), %ymm15, %ymm12;
- vpxor 13 * 32(%rax), %ymm15, %ymm13;
- vpxor 14 * 32(%rax), %ymm15, %ymm14;
- vpxor 15 * 32(%rax), %ymm15, %ymm15;
-
- call __camellia_enc_blk32;
-
- movq %r10, %rsp;
-
- vpxor 0 * 32(%rdx), %ymm7, %ymm7;
- vpxor 1 * 32(%rdx), %ymm6, %ymm6;
- vpxor 2 * 32(%rdx), %ymm5, %ymm5;
- vpxor 3 * 32(%rdx), %ymm4, %ymm4;
- vpxor 4 * 32(%rdx), %ymm3, %ymm3;
- vpxor 5 * 32(%rdx), %ymm2, %ymm2;
- vpxor 6 * 32(%rdx), %ymm1, %ymm1;
- vpxor 7 * 32(%rdx), %ymm0, %ymm0;
- vpxor 8 * 32(%rdx), %ymm15, %ymm15;
- vpxor 9 * 32(%rdx), %ymm14, %ymm14;
- vpxor 10 * 32(%rdx), %ymm13, %ymm13;
- vpxor 11 * 32(%rdx), %ymm12, %ymm12;
- vpxor 12 * 32(%rdx), %ymm11, %ymm11;
- vpxor 13 * 32(%rdx), %ymm10, %ymm10;
- vpxor 14 * 32(%rdx), %ymm9, %ymm9;
- vpxor 15 * 32(%rdx), %ymm8, %ymm8;
- write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
- %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
- %ymm8, %rsi);
-
- vzeroupper;
-
- FRAME_END
- ret;
-SYM_FUNC_END(camellia_ctr_32way)
-
-#define gf128mul_x_ble(iv, mask, tmp) \
- vpsrad $31, iv, tmp; \
- vpaddq iv, iv, iv; \
- vpshufd $0x13, tmp, tmp; \
- vpand mask, tmp, tmp; \
- vpxor tmp, iv, iv;
-
-#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
- vpsrad $31, iv, tmp0; \
- vpaddq iv, iv, tmp1; \
- vpsllq $2, iv, iv; \
- vpshufd $0x13, tmp0, tmp0; \
- vpsrad $31, tmp1, tmp1; \
- vpand mask2, tmp0, tmp0; \
- vpshufd $0x13, tmp1, tmp1; \
- vpxor tmp0, iv, iv; \
- vpand mask1, tmp1, tmp1; \
- vpxor tmp1, iv, iv;
-
-.align 8
-SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (32 blocks)
- * %rdx: src (32 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- * %r8: index for input whitening key
- * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
- */
- FRAME_BEGIN
-
- vzeroupper;
-
- subq $(16 * 32), %rsp;
- movq %rsp, %rax;
-
- vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
-
- /* load IV and construct second IV */
- vmovdqu (%rcx), %xmm0;
- vmovdqa %xmm0, %xmm15;
- gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
- vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
- vinserti128 $1, %xmm0, %ymm15, %ymm0;
- vpxor 0 * 32(%rdx), %ymm0, %ymm15;
- vmovdqu %ymm15, 15 * 32(%rax);
- vmovdqu %ymm0, 0 * 32(%rsi);
-
- /* construct IVs */
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 1 * 32(%rdx), %ymm0, %ymm15;
- vmovdqu %ymm15, 14 * 32(%rax);
- vmovdqu %ymm0, 1 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 2 * 32(%rdx), %ymm0, %ymm15;
- vmovdqu %ymm15, 13 * 32(%rax);
- vmovdqu %ymm0, 2 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 3 * 32(%rdx), %ymm0, %ymm15;
- vmovdqu %ymm15, 12 * 32(%rax);
- vmovdqu %ymm0, 3 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 4 * 32(%rdx), %ymm0, %ymm11;
- vmovdqu %ymm0, 4 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 5 * 32(%rdx), %ymm0, %ymm10;
- vmovdqu %ymm0, 5 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 6 * 32(%rdx), %ymm0, %ymm9;
- vmovdqu %ymm0, 6 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 7 * 32(%rdx), %ymm0, %ymm8;
- vmovdqu %ymm0, 7 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 8 * 32(%rdx), %ymm0, %ymm7;
- vmovdqu %ymm0, 8 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 9 * 32(%rdx), %ymm0, %ymm6;
- vmovdqu %ymm0, 9 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 10 * 32(%rdx), %ymm0, %ymm5;
- vmovdqu %ymm0, 10 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 11 * 32(%rdx), %ymm0, %ymm4;
- vmovdqu %ymm0, 11 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 12 * 32(%rdx), %ymm0, %ymm3;
- vmovdqu %ymm0, 12 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 13 * 32(%rdx), %ymm0, %ymm2;
- vmovdqu %ymm0, 13 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 14 * 32(%rdx), %ymm0, %ymm1;
- vmovdqu %ymm0, 14 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 15 * 32(%rdx), %ymm0, %ymm15;
- vmovdqu %ymm15, 0 * 32(%rax);
- vmovdqu %ymm0, 15 * 32(%rsi);
-
- vextracti128 $1, %ymm0, %xmm0;
- gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
- vmovdqu %xmm0, (%rcx);
-
- /* inpack32_pre: */
- vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
- vpshufb .Lpack_bswap, %ymm15, %ymm15;
- vpxor 0 * 32(%rax), %ymm15, %ymm0;
- vpxor %ymm1, %ymm15, %ymm1;
- vpxor %ymm2, %ymm15, %ymm2;
- vpxor %ymm3, %ymm15, %ymm3;
- vpxor %ymm4, %ymm15, %ymm4;
- vpxor %ymm5, %ymm15, %ymm5;
- vpxor %ymm6, %ymm15, %ymm6;
- vpxor %ymm7, %ymm15, %ymm7;
- vpxor %ymm8, %ymm15, %ymm8;
- vpxor %ymm9, %ymm15, %ymm9;
- vpxor %ymm10, %ymm15, %ymm10;
- vpxor %ymm11, %ymm15, %ymm11;
- vpxor 12 * 32(%rax), %ymm15, %ymm12;
- vpxor 13 * 32(%rax), %ymm15, %ymm13;
- vpxor 14 * 32(%rax), %ymm15, %ymm14;
- vpxor 15 * 32(%rax), %ymm15, %ymm15;
-
- CALL_NOSPEC %r9;
-
addq $(16 * 32), %rsp;
-
- vpxor 0 * 32(%rsi), %ymm7, %ymm7;
- vpxor 1 * 32(%rsi), %ymm6, %ymm6;
- vpxor 2 * 32(%rsi), %ymm5, %ymm5;
- vpxor 3 * 32(%rsi), %ymm4, %ymm4;
- vpxor 4 * 32(%rsi), %ymm3, %ymm3;
- vpxor 5 * 32(%rsi), %ymm2, %ymm2;
- vpxor 6 * 32(%rsi), %ymm1, %ymm1;
- vpxor 7 * 32(%rsi), %ymm0, %ymm0;
- vpxor 8 * 32(%rsi), %ymm15, %ymm15;
- vpxor 9 * 32(%rsi), %ymm14, %ymm14;
- vpxor 10 * 32(%rsi), %ymm13, %ymm13;
- vpxor 11 * 32(%rsi), %ymm12, %ymm12;
- vpxor 12 * 32(%rsi), %ymm11, %ymm11;
- vpxor 13 * 32(%rsi), %ymm10, %ymm10;
- vpxor 14 * 32(%rsi), %ymm9, %ymm9;
- vpxor 15 * 32(%rsi), %ymm8, %ymm8;
- write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
- %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
- %ymm8, %rsi);
-
- vzeroupper;
-
FRAME_END
- ret;
-SYM_FUNC_END(camellia_xts_crypt_32way)
-
-SYM_FUNC_START(camellia_xts_enc_32way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (32 blocks)
- * %rdx: src (32 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
-
- xorl %r8d, %r8d; /* input whitening key, 0 for enc */
-
- leaq __camellia_enc_blk32, %r9;
-
- jmp camellia_xts_crypt_32way;
-SYM_FUNC_END(camellia_xts_enc_32way)
-
-SYM_FUNC_START(camellia_xts_dec_32way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (32 blocks)
- * %rdx: src (32 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
-
- cmpl $16, key_length(CTX);
- movl $32, %r8d;
- movl $24, %eax;
- cmovel %eax, %r8d; /* input whitening key, last for dec */
-
- leaq __camellia_dec_blk32, %r9;
-
- jmp camellia_xts_crypt_32way;
-SYM_FUNC_END(camellia_xts_dec_32way)
+ RET;
+SYM_FUNC_END(camellia_cbc_dec_32way)
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
index 1372e6408850..347c059f5940 100644
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -213,13 +213,13 @@ SYM_FUNC_START(__camellia_enc_blk)
enc_outunpack(mov, RT1);
movq RR12, %r12;
- ret;
+ RET;
.L__enc_xor:
enc_outunpack(xor, RT1);
movq RR12, %r12;
- ret;
+ RET;
SYM_FUNC_END(__camellia_enc_blk)
SYM_FUNC_START(camellia_dec_blk)
@@ -257,7 +257,7 @@ SYM_FUNC_START(camellia_dec_blk)
dec_outunpack();
movq RR12, %r12;
- ret;
+ RET;
SYM_FUNC_END(camellia_dec_blk)
/**********************************************************************
@@ -448,14 +448,14 @@ SYM_FUNC_START(__camellia_enc_blk_2way)
movq RR12, %r12;
popq %rbx;
- ret;
+ RET;
.L__enc2_xor:
enc_outunpack2(xor, RT2);
movq RR12, %r12;
popq %rbx;
- ret;
+ RET;
SYM_FUNC_END(__camellia_enc_blk_2way)
SYM_FUNC_START(camellia_dec_blk_2way)
@@ -495,5 +495,5 @@ SYM_FUNC_START(camellia_dec_blk_2way)
movq RR12, %r12;
movq RXOR, %rbx;
- ret;
+ RET;
SYM_FUNC_END(camellia_dec_blk_2way)
diff --git a/arch/x86/crypto/camellia.h b/arch/x86/crypto/camellia.h
new file mode 100644
index 000000000000..1dcea79e8f8e
--- /dev/null
+++ b/arch/x86/crypto/camellia.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_CAMELLIA_H
+#define ASM_X86_CAMELLIA_H
+
+#include <crypto/b128ops.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+
+#define CAMELLIA_MIN_KEY_SIZE 16
+#define CAMELLIA_MAX_KEY_SIZE 32
+#define CAMELLIA_BLOCK_SIZE 16
+#define CAMELLIA_TABLE_BYTE_LEN 272
+#define CAMELLIA_PARALLEL_BLOCKS 2
+
+struct crypto_skcipher;
+
+struct camellia_ctx {
+ u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
+ u32 key_length;
+};
+
+extern int __camellia_setkey(struct camellia_ctx *cctx,
+ const unsigned char *key,
+ unsigned int key_len);
+
+/* regular block cipher functions */
+asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src,
+ bool xor);
+asmlinkage void camellia_dec_blk(const void *ctx, u8 *dst, const u8 *src);
+
+/* 2-way parallel cipher functions */
+asmlinkage void __camellia_enc_blk_2way(const void *ctx, u8 *dst, const u8 *src,
+ bool xor);
+asmlinkage void camellia_dec_blk_2way(const void *ctx, u8 *dst, const u8 *src);
+
+/* 16-way parallel cipher functions (avx/aes-ni) */
+asmlinkage void camellia_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void camellia_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src);
+
+static inline void camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src)
+{
+ __camellia_enc_blk(ctx, dst, src, false);
+}
+
+static inline void camellia_enc_blk_xor(const void *ctx, u8 *dst, const u8 *src)
+{
+ __camellia_enc_blk(ctx, dst, src, true);
+}
+
+static inline void camellia_enc_blk_2way(const void *ctx, u8 *dst,
+ const u8 *src)
+{
+ __camellia_enc_blk_2way(ctx, dst, src, false);
+}
+
+static inline void camellia_enc_blk_xor_2way(const void *ctx, u8 *dst,
+ const u8 *src)
+{
+ __camellia_enc_blk_2way(ctx, dst, src, true);
+}
+
+/* glue helpers */
+extern void camellia_decrypt_cbc_2way(const void *ctx, u8 *dst, const u8 *src);
+
+#endif /* ASM_X86_CAMELLIA_H */
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index ccda647422d6..e7e4d64e9577 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -5,16 +5,16 @@
* Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
*/
-#include <asm/crypto/camellia.h>
-#include <asm/crypto/glue_helper.h>
#include <crypto/algapi.h>
#include <crypto/internal/simd.h>
-#include <crypto/xts.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/types.h>
+#include "camellia.h"
+#include "ecb_cbc_helpers.h"
+
#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
#define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32
@@ -23,121 +23,6 @@ asmlinkage void camellia_ecb_enc_32way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void camellia_ecb_dec_32way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void camellia_cbc_dec_32way(const void *ctx, u8 *dst, const u8 *src);
-asmlinkage void camellia_ctr_32way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-
-asmlinkage void camellia_xts_enc_32way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-asmlinkage void camellia_xts_dec_32way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-
-static const struct common_glue_ctx camellia_enc = {
- .num_funcs = 4,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .ecb = camellia_ecb_enc_32way }
- }, {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ecb = camellia_ecb_enc_16way }
- }, {
- .num_blocks = 2,
- .fn_u = { .ecb = camellia_enc_blk_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = camellia_enc_blk }
- } }
-};
-
-static const struct common_glue_ctx camellia_ctr = {
- .num_funcs = 4,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .ctr = camellia_ctr_32way }
- }, {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ctr = camellia_ctr_16way }
- }, {
- .num_blocks = 2,
- .fn_u = { .ctr = camellia_crypt_ctr_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = camellia_crypt_ctr }
- } }
-};
-
-static const struct common_glue_ctx camellia_enc_xts = {
- .num_funcs = 3,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .xts = camellia_xts_enc_32way }
- }, {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .xts = camellia_xts_enc_16way }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = camellia_xts_enc }
- } }
-};
-
-static const struct common_glue_ctx camellia_dec = {
- .num_funcs = 4,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .ecb = camellia_ecb_dec_32way }
- }, {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ecb = camellia_ecb_dec_16way }
- }, {
- .num_blocks = 2,
- .fn_u = { .ecb = camellia_dec_blk_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = camellia_dec_blk }
- } }
-};
-
-static const struct common_glue_ctx camellia_dec_cbc = {
- .num_funcs = 4,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .cbc = camellia_cbc_dec_32way }
- }, {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .cbc = camellia_cbc_dec_16way }
- }, {
- .num_blocks = 2,
- .fn_u = { .cbc = camellia_decrypt_cbc_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = camellia_dec_blk }
- } }
-};
-
-static const struct common_glue_ctx camellia_dec_xts = {
- .num_funcs = 3,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .xts = camellia_xts_dec_32way }
- }, {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .xts = camellia_xts_dec_16way }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = camellia_xts_dec }
- } }
-};
static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
@@ -147,45 +32,39 @@ static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&camellia_enc, req);
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_ecb_enc_32way);
+ ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_enc_16way);
+ ECB_BLOCK(2, camellia_enc_blk_2way);
+ ECB_BLOCK(1, camellia_enc_blk);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&camellia_dec, req);
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_ecb_dec_32way);
+ ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_dec_16way);
+ ECB_BLOCK(2, camellia_dec_blk_2way);
+ ECB_BLOCK(1, camellia_dec_blk);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req);
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(camellia_enc_blk);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&camellia_ctr, req);
-}
-
-static int xts_encrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk,
- &ctx->tweak_ctx, &ctx->crypt_ctx, false);
-}
-
-static int xts_decrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk,
- &ctx->tweak_ctx, &ctx->crypt_ctx, true);
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_cbc_dec_32way);
+ CBC_DEC_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_cbc_dec_16way);
+ CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way);
+ CBC_DEC_BLOCK(1, camellia_dec_blk);
+ CBC_WALK_END();
}
static struct skcipher_alg camellia_algs[] = {
@@ -216,35 +95,6 @@ static struct skcipher_alg camellia_algs[] = {
.setkey = camellia_setkey,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "__ctr(camellia)",
- .base.cra_driver_name = "__ctr-camellia-aesni-avx2",
- .base.cra_priority = 500,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct camellia_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = CAMELLIA_MIN_KEY_SIZE,
- .max_keysize = CAMELLIA_MAX_KEY_SIZE,
- .ivsize = CAMELLIA_BLOCK_SIZE,
- .chunksize = CAMELLIA_BLOCK_SIZE,
- .setkey = camellia_setkey,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
- }, {
- .base.cra_name = "__xts(camellia)",
- .base.cra_driver_name = "__xts-camellia-aesni-avx2",
- .base.cra_priority = 500,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct camellia_xts_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * CAMELLIA_MIN_KEY_SIZE,
- .max_keysize = 2 * CAMELLIA_MAX_KEY_SIZE,
- .ivsize = CAMELLIA_BLOCK_SIZE,
- .setkey = xts_camellia_setkey,
- .encrypt = xts_encrypt,
- .decrypt = xts_decrypt,
},
};
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index 4e5de6ef206e..c7ccf63e741e 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -5,16 +5,16 @@
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*/
-#include <asm/crypto/camellia.h>
-#include <asm/crypto/glue_helper.h>
#include <crypto/algapi.h>
#include <crypto/internal/simd.h>
-#include <crypto/xts.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/types.h>
+#include "camellia.h"
+#include "ecb_cbc_helpers.h"
+
#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
/* 16-way parallel cipher functions (avx/aes-ni) */
@@ -27,120 +27,6 @@ EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way);
asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src);
EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way);
-asmlinkage void camellia_ctr_16way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-EXPORT_SYMBOL_GPL(camellia_ctr_16way);
-
-asmlinkage void camellia_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-EXPORT_SYMBOL_GPL(camellia_xts_enc_16way);
-
-asmlinkage void camellia_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-EXPORT_SYMBOL_GPL(camellia_xts_dec_16way);
-
-void camellia_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_enc_blk);
-}
-EXPORT_SYMBOL_GPL(camellia_xts_enc);
-
-void camellia_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_dec_blk);
-}
-EXPORT_SYMBOL_GPL(camellia_xts_dec);
-
-static const struct common_glue_ctx camellia_enc = {
- .num_funcs = 3,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ecb = camellia_ecb_enc_16way }
- }, {
- .num_blocks = 2,
- .fn_u = { .ecb = camellia_enc_blk_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = camellia_enc_blk }
- } }
-};
-
-static const struct common_glue_ctx camellia_ctr = {
- .num_funcs = 3,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ctr = camellia_ctr_16way }
- }, {
- .num_blocks = 2,
- .fn_u = { .ctr = camellia_crypt_ctr_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = camellia_crypt_ctr }
- } }
-};
-
-static const struct common_glue_ctx camellia_enc_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .xts = camellia_xts_enc_16way }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = camellia_xts_enc }
- } }
-};
-
-static const struct common_glue_ctx camellia_dec = {
- .num_funcs = 3,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ecb = camellia_ecb_dec_16way }
- }, {
- .num_blocks = 2,
- .fn_u = { .ecb = camellia_dec_blk_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = camellia_dec_blk }
- } }
-};
-
-static const struct common_glue_ctx camellia_dec_cbc = {
- .num_funcs = 3,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .cbc = camellia_cbc_dec_16way }
- }, {
- .num_blocks = 2,
- .fn_u = { .cbc = camellia_decrypt_cbc_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = camellia_dec_blk }
- } }
-};
-
-static const struct common_glue_ctx camellia_dec_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .xts = camellia_xts_dec_16way }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = camellia_xts_dec }
- } }
-};
-
static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
@@ -149,65 +35,36 @@ static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&camellia_enc, req);
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_enc_16way);
+ ECB_BLOCK(2, camellia_enc_blk_2way);
+ ECB_BLOCK(1, camellia_enc_blk);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&camellia_dec, req);
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_dec_16way);
+ ECB_BLOCK(2, camellia_dec_blk_2way);
+ ECB_BLOCK(1, camellia_dec_blk);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req);
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(camellia_enc_blk);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&camellia_ctr, req);
-}
-
-int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = xts_verify_key(tfm, key, keylen);
- if (err)
- return err;
-
- /* first half of xts-key is for crypt */
- err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2);
- if (err)
- return err;
-
- /* second half of xts-key is for tweak */
- return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
-}
-EXPORT_SYMBOL_GPL(xts_camellia_setkey);
-
-static int xts_encrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk,
- &ctx->tweak_ctx, &ctx->crypt_ctx, false);
-}
-
-static int xts_decrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk,
- &ctx->tweak_ctx, &ctx->crypt_ctx, true);
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_cbc_dec_16way);
+ CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way);
+ CBC_DEC_BLOCK(1, camellia_dec_blk);
+ CBC_WALK_END();
}
static struct skcipher_alg camellia_algs[] = {
@@ -238,36 +95,7 @@ static struct skcipher_alg camellia_algs[] = {
.setkey = camellia_setkey,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "__ctr(camellia)",
- .base.cra_driver_name = "__ctr-camellia-aesni",
- .base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct camellia_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = CAMELLIA_MIN_KEY_SIZE,
- .max_keysize = CAMELLIA_MAX_KEY_SIZE,
- .ivsize = CAMELLIA_BLOCK_SIZE,
- .chunksize = CAMELLIA_BLOCK_SIZE,
- .setkey = camellia_setkey,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
- }, {
- .base.cra_name = "__xts(camellia)",
- .base.cra_driver_name = "__xts-camellia-aesni",
- .base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct camellia_xts_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * CAMELLIA_MIN_KEY_SIZE,
- .max_keysize = 2 * CAMELLIA_MAX_KEY_SIZE,
- .ivsize = CAMELLIA_BLOCK_SIZE,
- .setkey = xts_camellia_setkey,
- .encrypt = xts_encrypt,
- .decrypt = xts_decrypt,
- },
+ }
};
static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)];
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 242c056e5fa8..d45e9c0c42ac 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -14,8 +14,9 @@
#include <linux/module.h>
#include <linux/types.h>
#include <crypto/algapi.h>
-#include <asm/crypto/camellia.h>
-#include <asm/crypto/glue_helper.h>
+
+#include "camellia.h"
+#include "ecb_cbc_helpers.h"
/* regular block cipher functions */
asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src,
@@ -1262,129 +1263,47 @@ static int camellia_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key,
return camellia_setkey(&tfm->base, key, key_len);
}
-void camellia_decrypt_cbc_2way(const void *ctx, u8 *d, const u8 *s)
+void camellia_decrypt_cbc_2way(const void *ctx, u8 *dst, const u8 *src)
{
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
- u128 iv = *src;
-
- camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src);
+ u8 buf[CAMELLIA_BLOCK_SIZE];
+ const u8 *iv = src;
- u128_xor(&dst[1], &dst[1], &iv);
+ if (dst == src)
+ iv = memcpy(buf, iv, sizeof(buf));
+ camellia_dec_blk_2way(ctx, dst, src);
+ crypto_xor(dst + CAMELLIA_BLOCK_SIZE, iv, CAMELLIA_BLOCK_SIZE);
}
EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way);
-void camellia_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
-{
- be128 ctrblk;
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
-
- if (dst != src)
- *dst = *src;
-
- le128_to_be128(&ctrblk, iv);
- le128_inc(iv);
-
- camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
-}
-EXPORT_SYMBOL_GPL(camellia_crypt_ctr);
-
-void camellia_crypt_ctr_2way(const void *ctx, u8 *d, const u8 *s, le128 *iv)
-{
- be128 ctrblks[2];
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
-
- if (dst != src) {
- dst[0] = src[0];
- dst[1] = src[1];
- }
-
- le128_to_be128(&ctrblks[0], iv);
- le128_inc(iv);
- le128_to_be128(&ctrblks[1], iv);
- le128_inc(iv);
-
- camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
-}
-EXPORT_SYMBOL_GPL(camellia_crypt_ctr_2way);
-
-static const struct common_glue_ctx camellia_enc = {
- .num_funcs = 2,
- .fpu_blocks_limit = -1,
-
- .funcs = { {
- .num_blocks = 2,
- .fn_u = { .ecb = camellia_enc_blk_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = camellia_enc_blk }
- } }
-};
-
-static const struct common_glue_ctx camellia_ctr = {
- .num_funcs = 2,
- .fpu_blocks_limit = -1,
-
- .funcs = { {
- .num_blocks = 2,
- .fn_u = { .ctr = camellia_crypt_ctr_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = camellia_crypt_ctr }
- } }
-};
-
-static const struct common_glue_ctx camellia_dec = {
- .num_funcs = 2,
- .fpu_blocks_limit = -1,
-
- .funcs = { {
- .num_blocks = 2,
- .fn_u = { .ecb = camellia_dec_blk_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = camellia_dec_blk }
- } }
-};
-
-static const struct common_glue_ctx camellia_dec_cbc = {
- .num_funcs = 2,
- .fpu_blocks_limit = -1,
-
- .funcs = { {
- .num_blocks = 2,
- .fn_u = { .cbc = camellia_decrypt_cbc_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = camellia_dec_blk }
- } }
-};
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&camellia_enc, req);
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ ECB_BLOCK(2, camellia_enc_blk_2way);
+ ECB_BLOCK(1, camellia_enc_blk);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&camellia_dec, req);
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ ECB_BLOCK(2, camellia_dec_blk_2way);
+ ECB_BLOCK(1, camellia_dec_blk);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req);
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(camellia_enc_blk);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&camellia_ctr, req);
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way);
+ CBC_DEC_BLOCK(1, camellia_dec_blk);
+ CBC_WALK_END();
}
static struct crypto_alg camellia_cipher_alg = {
@@ -1433,20 +1352,6 @@ static struct skcipher_alg camellia_skcipher_algs[] = {
.setkey = camellia_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "ctr(camellia)",
- .base.cra_driver_name = "ctr-camellia-asm",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct camellia_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = CAMELLIA_MIN_KEY_SIZE,
- .max_keysize = CAMELLIA_MAX_KEY_SIZE,
- .ivsize = CAMELLIA_BLOCK_SIZE,
- .chunksize = CAMELLIA_BLOCK_SIZE,
- .setkey = camellia_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
}
};
@@ -1472,7 +1377,7 @@ static int force;
module_param(force, int, 0);
MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
-static int __init init(void)
+static int __init camellia_init(void)
{
int err;
@@ -1496,15 +1401,15 @@ static int __init init(void)
return err;
}
-static void __exit fini(void)
+static void __exit camellia_fini(void)
{
crypto_unregister_alg(&camellia_cipher_alg);
crypto_unregister_skciphers(camellia_skcipher_algs,
ARRAY_SIZE(camellia_skcipher_algs));
}
-module_init(init);
-module_exit(fini);
+module_init(camellia_init);
+module_exit(camellia_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Camellia Cipher Algorithm, asm optimized");
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index 8a6181b08b59..b258af420c92 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -279,7 +279,7 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
- ret;
+ RET;
SYM_FUNC_END(__cast5_enc_blk16)
.align 16
@@ -352,7 +352,7 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
- ret;
+ RET;
.L__skip_dec:
vpsrldq $4, RKR, RKR;
@@ -393,7 +393,7 @@ SYM_FUNC_START(cast5_ecb_enc_16way)
popq %r15;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast5_ecb_enc_16way)
SYM_FUNC_START(cast5_ecb_dec_16way)
@@ -431,7 +431,7 @@ SYM_FUNC_START(cast5_ecb_dec_16way)
popq %r15;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast5_ecb_dec_16way)
SYM_FUNC_START(cast5_cbc_dec_16way)
@@ -483,7 +483,7 @@ SYM_FUNC_START(cast5_cbc_dec_16way)
popq %r15;
popq %r12;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast5_cbc_dec_16way)
SYM_FUNC_START(cast5_ctr_16way)
@@ -559,5 +559,5 @@ SYM_FUNC_START(cast5_ctr_16way)
popq %r15;
popq %r12;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast5_ctr_16way)
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index 384ccb00f9e1..3976a87f92ad 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -6,7 +6,6 @@
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
*/
-#include <asm/crypto/glue_helper.h>
#include <crypto/algapi.h>
#include <crypto/cast5.h>
#include <crypto/internal/simd.h>
@@ -15,6 +14,8 @@
#include <linux/module.h>
#include <linux/types.h>
+#include "ecb_cbc_helpers.h"
+
#define CAST5_PARALLEL_BLOCKS 16
asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst,
@@ -23,8 +24,6 @@ asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst,
const u8 *src);
-asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
- __be64 *iv);
static int cast5_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
@@ -32,272 +31,35 @@ static int cast5_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key,
return cast5_setkey(&tfm->base, key, keylen);
}
-static inline bool cast5_fpu_begin(bool fpu_enabled, struct skcipher_walk *walk,
- unsigned int nbytes)
-{
- return glue_fpu_begin(CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS,
- walk, fpu_enabled, nbytes);
-}
-
-static inline void cast5_fpu_end(bool fpu_enabled)
-{
- return glue_fpu_end(fpu_enabled);
-}
-
-static int ecb_crypt(struct skcipher_request *req, bool enc)
-{
- bool fpu_enabled = false;
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- const unsigned int bsize = CAST5_BLOCK_SIZE;
- unsigned int nbytes;
- void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes)) {
- u8 *wsrc = walk.src.virt.addr;
- u8 *wdst = walk.dst.virt.addr;
-
- fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes);
-
- /* Process multi-block batch */
- if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
- fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way;
- do {
- fn(ctx, wdst, wsrc);
-
- wsrc += bsize * CAST5_PARALLEL_BLOCKS;
- wdst += bsize * CAST5_PARALLEL_BLOCKS;
- nbytes -= bsize * CAST5_PARALLEL_BLOCKS;
- } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
-
- if (nbytes < bsize)
- goto done;
- }
-
- fn = (enc) ? __cast5_encrypt : __cast5_decrypt;
-
- /* Handle leftovers */
- do {
- fn(ctx, wdst, wsrc);
-
- wsrc += bsize;
- wdst += bsize;
- nbytes -= bsize;
- } while (nbytes >= bsize);
-
-done:
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- cast5_fpu_end(fpu_enabled);
- return err;
-}
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return ecb_crypt(req, true);
+ ECB_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_ecb_enc_16way);
+ ECB_BLOCK(1, __cast5_encrypt);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return ecb_crypt(req, false);
+ ECB_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_ecb_dec_16way);
+ ECB_BLOCK(1, __cast5_decrypt);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- const unsigned int bsize = CAST5_BLOCK_SIZE;
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes)) {
- u64 *src = (u64 *)walk.src.virt.addr;
- u64 *dst = (u64 *)walk.dst.virt.addr;
- u64 *iv = (u64 *)walk.iv;
-
- do {
- *dst = *src ^ *iv;
- __cast5_encrypt(ctx, (u8 *)dst, (u8 *)dst);
- iv = dst;
- src++;
- dst++;
- nbytes -= bsize;
- } while (nbytes >= bsize);
-
- *(u64 *)walk.iv = *iv;
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
-}
-
-static unsigned int __cbc_decrypt(struct cast5_ctx *ctx,
- struct skcipher_walk *walk)
-{
- const unsigned int bsize = CAST5_BLOCK_SIZE;
- unsigned int nbytes = walk->nbytes;
- u64 *src = (u64 *)walk->src.virt.addr;
- u64 *dst = (u64 *)walk->dst.virt.addr;
- u64 last_iv;
-
- /* Start of the last block. */
- src += nbytes / bsize - 1;
- dst += nbytes / bsize - 1;
-
- last_iv = *src;
-
- /* Process multi-block batch */
- if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
- do {
- nbytes -= bsize * (CAST5_PARALLEL_BLOCKS - 1);
- src -= CAST5_PARALLEL_BLOCKS - 1;
- dst -= CAST5_PARALLEL_BLOCKS - 1;
-
- cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src);
-
- nbytes -= bsize;
- if (nbytes < bsize)
- goto done;
-
- *dst ^= *(src - 1);
- src -= 1;
- dst -= 1;
- } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
- }
-
- /* Handle leftovers */
- for (;;) {
- __cast5_decrypt(ctx, (u8 *)dst, (u8 *)src);
-
- nbytes -= bsize;
- if (nbytes < bsize)
- break;
-
- *dst ^= *(src - 1);
- src -= 1;
- dst -= 1;
- }
-
-done:
- *dst ^= *(u64 *)walk->iv;
- *(u64 *)walk->iv = last_iv;
-
- return nbytes;
+ CBC_WALK_START(req, CAST5_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(__cast5_encrypt);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
- bool fpu_enabled = false;
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes)) {
- fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes);
- nbytes = __cbc_decrypt(ctx, &walk);
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- cast5_fpu_end(fpu_enabled);
- return err;
-}
-
-static void ctr_crypt_final(struct skcipher_walk *walk, struct cast5_ctx *ctx)
-{
- u8 *ctrblk = walk->iv;
- u8 keystream[CAST5_BLOCK_SIZE];
- u8 *src = walk->src.virt.addr;
- u8 *dst = walk->dst.virt.addr;
- unsigned int nbytes = walk->nbytes;
-
- __cast5_encrypt(ctx, keystream, ctrblk);
- crypto_xor_cpy(dst, keystream, src, nbytes);
-
- crypto_inc(ctrblk, CAST5_BLOCK_SIZE);
-}
-
-static unsigned int __ctr_crypt(struct skcipher_walk *walk,
- struct cast5_ctx *ctx)
-{
- const unsigned int bsize = CAST5_BLOCK_SIZE;
- unsigned int nbytes = walk->nbytes;
- u64 *src = (u64 *)walk->src.virt.addr;
- u64 *dst = (u64 *)walk->dst.virt.addr;
-
- /* Process multi-block batch */
- if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
- do {
- cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src,
- (__be64 *)walk->iv);
-
- src += CAST5_PARALLEL_BLOCKS;
- dst += CAST5_PARALLEL_BLOCKS;
- nbytes -= bsize * CAST5_PARALLEL_BLOCKS;
- } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
-
- if (nbytes < bsize)
- goto done;
- }
-
- /* Handle leftovers */
- do {
- u64 ctrblk;
-
- if (dst != src)
- *dst = *src;
-
- ctrblk = *(u64 *)walk->iv;
- be64_add_cpu((__be64 *)walk->iv, 1);
-
- __cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
- *dst ^= ctrblk;
-
- src += 1;
- dst += 1;
- nbytes -= bsize;
- } while (nbytes >= bsize);
-
-done:
- return nbytes;
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
- bool fpu_enabled = false;
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
- fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes);
- nbytes = __ctr_crypt(&walk, ctx);
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- cast5_fpu_end(fpu_enabled);
-
- if (walk.nbytes) {
- ctr_crypt_final(&walk, ctx);
- err = skcipher_walk_done(&walk, 0);
- }
-
- return err;
+ CBC_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_cbc_dec_16way);
+ CBC_DEC_BLOCK(1, __cast5_decrypt);
+ CBC_WALK_END();
}
static struct skcipher_alg cast5_algs[] = {
@@ -328,21 +90,6 @@ static struct skcipher_alg cast5_algs[] = {
.setkey = cast5_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "__ctr(cast5)",
- .base.cra_driver_name = "__ctr-cast5-avx",
- .base.cra_priority = 200,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct cast5_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = CAST5_MIN_KEY_SIZE,
- .max_keysize = CAST5_MAX_KEY_SIZE,
- .ivsize = CAST5_BLOCK_SIZE,
- .chunksize = CAST5_BLOCK_SIZE,
- .setkey = cast5_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
}
};
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 932a3ce32a88..82b716fd5dba 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -212,8 +212,6 @@
.section .rodata.cst16, "aM", @progbits, 16
.align 16
-.Lxts_gf128mul_and_shl1_mask:
- .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lbswap_mask:
.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
@@ -291,7 +289,7 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
- ret;
+ RET;
SYM_FUNC_END(__cast6_enc_blk8)
.align 8
@@ -338,7 +336,7 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
- ret;
+ RET;
SYM_FUNC_END(__cast6_dec_blk8)
SYM_FUNC_START(cast6_ecb_enc_8way)
@@ -361,7 +359,7 @@ SYM_FUNC_START(cast6_ecb_enc_8way)
popq %r15;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast6_ecb_enc_8way)
SYM_FUNC_START(cast6_ecb_dec_8way)
@@ -384,7 +382,7 @@ SYM_FUNC_START(cast6_ecb_dec_8way)
popq %r15;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast6_ecb_dec_8way)
SYM_FUNC_START(cast6_cbc_dec_8way)
@@ -410,87 +408,5 @@ SYM_FUNC_START(cast6_cbc_dec_8way)
popq %r15;
popq %r12;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast6_cbc_dec_8way)
-
-SYM_FUNC_START(cast6_ctr_8way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (little endian, 128bit)
- */
- FRAME_BEGIN
- pushq %r12;
- pushq %r15
-
- movq %rdi, CTX;
- movq %rsi, %r11;
- movq %rdx, %r12;
-
- load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
- RD2, RX, RKR, RKM);
-
- call __cast6_enc_blk8;
-
- store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
-
- popq %r15;
- popq %r12;
- FRAME_END
- ret;
-SYM_FUNC_END(cast6_ctr_8way)
-
-SYM_FUNC_START(cast6_xts_enc_8way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- FRAME_BEGIN
- pushq %r15;
-
- movq %rdi, CTX
- movq %rsi, %r11;
-
- /* regs <= src, dst <= IVs, regs <= regs xor IVs */
- load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
- RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
-
- call __cast6_enc_blk8;
-
- /* dst <= regs xor IVs(in dst) */
- store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
-
- popq %r15;
- FRAME_END
- ret;
-SYM_FUNC_END(cast6_xts_enc_8way)
-
-SYM_FUNC_START(cast6_xts_dec_8way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- FRAME_BEGIN
- pushq %r15;
-
- movq %rdi, CTX
- movq %rsi, %r11;
-
- /* regs <= src, dst <= IVs, regs <= regs xor IVs */
- load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
- RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
-
- call __cast6_dec_blk8;
-
- /* dst <= regs xor IVs(in dst) */
- store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
-
- popq %r15;
- FRAME_END
- ret;
-SYM_FUNC_END(cast6_xts_dec_8way)
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 48e0f37796fa..7e2aea372349 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -15,8 +15,8 @@
#include <crypto/algapi.h>
#include <crypto/cast6.h>
#include <crypto/internal/simd.h>
-#include <crypto/xts.h>
-#include <asm/crypto/glue_helper.h>
+
+#include "ecb_cbc_helpers.h"
#define CAST6_PARALLEL_BLOCKS 8
@@ -24,13 +24,6 @@ asmlinkage void cast6_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void cast6_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void cast6_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src);
-asmlinkage void cast6_ctr_8way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-
-asmlinkage void cast6_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-asmlinkage void cast6_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
static int cast6_setkey_skcipher(struct crypto_skcipher *tfm,
const u8 *key, unsigned int keylen)
@@ -38,172 +31,35 @@ static int cast6_setkey_skcipher(struct crypto_skcipher *tfm,
return cast6_setkey(&tfm->base, key, keylen);
}
-static void cast6_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_encrypt);
-}
-
-static void cast6_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_decrypt);
-}
-
-static void cast6_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
-{
- be128 ctrblk;
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
-
- le128_to_be128(&ctrblk, iv);
- le128_inc(iv);
-
- __cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
- u128_xor(dst, src, (u128 *)&ctrblk);
-}
-
-static const struct common_glue_ctx cast6_enc = {
- .num_funcs = 2,
- .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .ecb = cast6_ecb_enc_8way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = __cast6_encrypt }
- } }
-};
-
-static const struct common_glue_ctx cast6_ctr = {
- .num_funcs = 2,
- .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .ctr = cast6_ctr_8way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = cast6_crypt_ctr }
- } }
-};
-
-static const struct common_glue_ctx cast6_enc_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .xts = cast6_xts_enc_8way }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = cast6_xts_enc }
- } }
-};
-
-static const struct common_glue_ctx cast6_dec = {
- .num_funcs = 2,
- .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .ecb = cast6_ecb_dec_8way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = __cast6_decrypt }
- } }
-};
-
-static const struct common_glue_ctx cast6_dec_cbc = {
- .num_funcs = 2,
- .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .cbc = cast6_cbc_dec_8way }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = __cast6_decrypt }
- } }
-};
-
-static const struct common_glue_ctx cast6_dec_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .xts = cast6_xts_dec_8way }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = cast6_xts_dec }
- } }
-};
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&cast6_enc, req);
+ ECB_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_ecb_enc_8way);
+ ECB_BLOCK(1, __cast6_encrypt);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&cast6_dec, req);
+ ECB_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_ecb_dec_8way);
+ ECB_BLOCK(1, __cast6_decrypt);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(__cast6_encrypt, req);
+ CBC_WALK_START(req, CAST6_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(__cast6_encrypt);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&cast6_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&cast6_ctr, req);
-}
-
-struct cast6_xts_ctx {
- struct cast6_ctx tweak_ctx;
- struct cast6_ctx crypt_ctx;
-};
-
-static int xts_cast6_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = xts_verify_key(tfm, key, keylen);
- if (err)
- return err;
-
- /* first half of xts-key is for crypt */
- err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2);
- if (err)
- return err;
-
- /* second half of xts-key is for tweak */
- return __cast6_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
-}
-
-static int xts_encrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&cast6_enc_xts, req, __cast6_encrypt,
- &ctx->tweak_ctx, &ctx->crypt_ctx, false);
-}
-
-static int xts_decrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&cast6_dec_xts, req, __cast6_encrypt,
- &ctx->tweak_ctx, &ctx->crypt_ctx, true);
+ CBC_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_cbc_dec_8way);
+ CBC_DEC_BLOCK(1, __cast6_decrypt);
+ CBC_WALK_END();
}
static struct skcipher_alg cast6_algs[] = {
@@ -234,35 +90,6 @@ static struct skcipher_alg cast6_algs[] = {
.setkey = cast6_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "__ctr(cast6)",
- .base.cra_driver_name = "__ctr-cast6-avx",
- .base.cra_priority = 200,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct cast6_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = CAST6_MIN_KEY_SIZE,
- .max_keysize = CAST6_MAX_KEY_SIZE,
- .ivsize = CAST6_BLOCK_SIZE,
- .chunksize = CAST6_BLOCK_SIZE,
- .setkey = cast6_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
- }, {
- .base.cra_name = "__xts(cast6)",
- .base.cra_driver_name = "__xts-cast6-avx",
- .base.cra_priority = 200,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = CAST6_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct cast6_xts_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * CAST6_MIN_KEY_SIZE,
- .max_keysize = 2 * CAST6_MAX_KEY_SIZE,
- .ivsize = CAST6_BLOCK_SIZE,
- .setkey = xts_cast6_setkey,
- .encrypt = xts_encrypt,
- .decrypt = xts_decrypt,
},
};
diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S
index ee9a40ab4109..f3d8fc018249 100644
--- a/arch/x86/crypto/chacha-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha-avx2-x86_64.S
@@ -193,7 +193,7 @@ SYM_FUNC_START(chacha_2block_xor_avx2)
.Ldone2:
vzeroupper
- ret
+ RET
.Lxorpart2:
# xor remaining bytes from partial register into output
@@ -498,7 +498,7 @@ SYM_FUNC_START(chacha_4block_xor_avx2)
.Ldone4:
vzeroupper
- ret
+ RET
.Lxorpart4:
# xor remaining bytes from partial register into output
@@ -992,7 +992,7 @@ SYM_FUNC_START(chacha_8block_xor_avx2)
.Ldone8:
vzeroupper
lea -8(%r10),%rsp
- ret
+ RET
.Lxorpart8:
# xor remaining bytes from partial register into output
diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S
index bb193fde123a..259383e1ad44 100644
--- a/arch/x86/crypto/chacha-avx512vl-x86_64.S
+++ b/arch/x86/crypto/chacha-avx512vl-x86_64.S
@@ -166,13 +166,13 @@ SYM_FUNC_START(chacha_2block_xor_avx512vl)
.Ldone2:
vzeroupper
- ret
+ RET
.Lxorpart2:
# xor remaining bytes from partial register into output
mov %rcx,%rax
and $0xf,%rcx
- jz .Ldone8
+ jz .Ldone2
mov %rax,%r9
and $~0xf,%r9
@@ -432,13 +432,13 @@ SYM_FUNC_START(chacha_4block_xor_avx512vl)
.Ldone4:
vzeroupper
- ret
+ RET
.Lxorpart4:
# xor remaining bytes from partial register into output
mov %rcx,%rax
and $0xf,%rcx
- jz .Ldone8
+ jz .Ldone4
mov %rax,%r9
and $~0xf,%r9
@@ -812,7 +812,7 @@ SYM_FUNC_START(chacha_8block_xor_avx512vl)
.Ldone8:
vzeroupper
- ret
+ RET
.Lxorpart8:
# xor remaining bytes from partial register into output
diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S
index a38ab2512a6f..7111949cd5b9 100644
--- a/arch/x86/crypto/chacha-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha-ssse3-x86_64.S
@@ -108,7 +108,7 @@ SYM_FUNC_START_LOCAL(chacha_permute)
sub $2,%r8d
jnz .Ldoubleround
- ret
+ RET
SYM_FUNC_END(chacha_permute)
SYM_FUNC_START(chacha_block_xor_ssse3)
@@ -120,10 +120,10 @@ SYM_FUNC_START(chacha_block_xor_ssse3)
FRAME_BEGIN
# x0..3 = s0..3
- movdqa 0x00(%rdi),%xmm0
- movdqa 0x10(%rdi),%xmm1
- movdqa 0x20(%rdi),%xmm2
- movdqa 0x30(%rdi),%xmm3
+ movdqu 0x00(%rdi),%xmm0
+ movdqu 0x10(%rdi),%xmm1
+ movdqu 0x20(%rdi),%xmm2
+ movdqu 0x30(%rdi),%xmm3
movdqa %xmm0,%xmm8
movdqa %xmm1,%xmm9
movdqa %xmm2,%xmm10
@@ -166,7 +166,7 @@ SYM_FUNC_START(chacha_block_xor_ssse3)
.Ldone:
FRAME_END
- ret
+ RET
.Lxorpart:
# xor remaining bytes from partial register into output
@@ -205,10 +205,10 @@ SYM_FUNC_START(hchacha_block_ssse3)
# %edx: nrounds
FRAME_BEGIN
- movdqa 0x00(%rdi),%xmm0
- movdqa 0x10(%rdi),%xmm1
- movdqa 0x20(%rdi),%xmm2
- movdqa 0x30(%rdi),%xmm3
+ movdqu 0x00(%rdi),%xmm0
+ movdqu 0x10(%rdi),%xmm1
+ movdqu 0x20(%rdi),%xmm2
+ movdqu 0x30(%rdi),%xmm3
mov %edx,%r8d
call chacha_permute
@@ -217,7 +217,7 @@ SYM_FUNC_START(hchacha_block_ssse3)
movdqu %xmm3,0x10(%rsi)
FRAME_END
- ret
+ RET
SYM_FUNC_END(hchacha_block_ssse3)
SYM_FUNC_START(chacha_4block_xor_ssse3)
@@ -762,7 +762,7 @@ SYM_FUNC_START(chacha_4block_xor_ssse3)
.Ldone4:
lea -8(%r10),%rsp
- ret
+ RET
.Lxorpart4:
# xor remaining bytes from partial register into output
diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
index 68a74953efaf..7b3a1cf0984b 100644
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -12,10 +12,9 @@
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/sizes.h>
#include <asm/simd.h>
-#define CHACHA_STATE_ALIGN 16
-
asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
@@ -79,8 +78,7 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
}
}
- if (IS_ENABLED(CONFIG_AS_AVX2) &&
- static_branch_likely(&chacha_use_avx2)) {
+ if (static_branch_likely(&chacha_use_avx2)) {
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
bytes -= CHACHA_BLOCK_SIZE * 8;
@@ -125,8 +123,6 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
- state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
-
if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
hchacha_block_generic(state, stream, nrounds);
} else {
@@ -139,8 +135,6 @@ EXPORT_SYMBOL(hchacha_block_arch);
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
- state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
-
chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);
@@ -148,30 +142,33 @@ EXPORT_SYMBOL(chacha_init_arch);
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
int nrounds)
{
- state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
-
if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
bytes <= CHACHA_BLOCK_SIZE)
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
- kernel_fpu_begin();
- chacha_dosimd(state, dst, src, bytes, nrounds);
- kernel_fpu_end();
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ kernel_fpu_begin();
+ chacha_dosimd(state, dst, src, todo, nrounds);
+ kernel_fpu_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);
static int chacha_simd_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
{
- u32 *state, state_buf[16 + 2] __aligned(8);
+ u32 state[CHACHA_STATE_WORDS] __aligned(8);
struct skcipher_walk walk;
int err;
err = skcipher_walk_virt(&walk, req, false);
- BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
- state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
-
chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
@@ -210,12 +207,10 @@ static int xchacha_simd(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- u32 *state, state_buf[16 + 2] __aligned(8);
+ u32 state[CHACHA_STATE_WORDS] __aligned(8);
struct chacha_ctx subctx;
u8 real_iv[16];
- BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
- state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
chacha_init_generic(state, ctx->key, req->iv);
if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
@@ -288,8 +283,7 @@ static int __init chacha_simd_mod_init(void)
static_branch_enable(&chacha_use_simd);
- if (IS_ENABLED(CONFIG_AS_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX) &&
+ if (boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
static_branch_enable(&chacha_use_avx2);
diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S
index 9fd28ff65bc2..ca53e96996ac 100644
--- a/arch/x86/crypto/crc32-pclmul_asm.S
+++ b/arch/x86/crypto/crc32-pclmul_asm.S
@@ -1,26 +1,4 @@
-/* GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see http://www.gnu.org/licenses
- *
- * Please visit http://www.xyratex.com/contact if you need additional
- * information or have any questions.
- *
- * GPL HEADER END
- */
-
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2012 Xyratex Technology Limited
*
@@ -38,7 +16,6 @@
*/
#include <linux/linkage.h>
-#include <asm/inst.h>
.section .rodata
@@ -129,17 +106,17 @@ loop_64:/* 64 bytes Full cache line folding */
#ifdef __x86_64__
movdqa %xmm4, %xmm8
#endif
- PCLMULQDQ 00, CONSTANT, %xmm1
- PCLMULQDQ 00, CONSTANT, %xmm2
- PCLMULQDQ 00, CONSTANT, %xmm3
+ pclmulqdq $0x00, CONSTANT, %xmm1
+ pclmulqdq $0x00, CONSTANT, %xmm2
+ pclmulqdq $0x00, CONSTANT, %xmm3
#ifdef __x86_64__
- PCLMULQDQ 00, CONSTANT, %xmm4
+ pclmulqdq $0x00, CONSTANT, %xmm4
#endif
- PCLMULQDQ 0x11, CONSTANT, %xmm5
- PCLMULQDQ 0x11, CONSTANT, %xmm6
- PCLMULQDQ 0x11, CONSTANT, %xmm7
+ pclmulqdq $0x11, CONSTANT, %xmm5
+ pclmulqdq $0x11, CONSTANT, %xmm6
+ pclmulqdq $0x11, CONSTANT, %xmm7
#ifdef __x86_64__
- PCLMULQDQ 0x11, CONSTANT, %xmm8
+ pclmulqdq $0x11, CONSTANT, %xmm8
#endif
pxor %xmm5, %xmm1
pxor %xmm6, %xmm2
@@ -149,8 +126,8 @@ loop_64:/* 64 bytes Full cache line folding */
#else
/* xmm8 unsupported for x32 */
movdqa %xmm4, %xmm5
- PCLMULQDQ 00, CONSTANT, %xmm4
- PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pclmulqdq $0x00, CONSTANT, %xmm4
+ pclmulqdq $0x11, CONSTANT, %xmm5
pxor %xmm5, %xmm4
#endif
@@ -172,20 +149,20 @@ less_64:/* Folding cache line into 128bit */
prefetchnta (BUF)
movdqa %xmm1, %xmm5
- PCLMULQDQ 0x00, CONSTANT, %xmm1
- PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pclmulqdq $0x00, CONSTANT, %xmm1
+ pclmulqdq $0x11, CONSTANT, %xmm5
pxor %xmm5, %xmm1
pxor %xmm2, %xmm1
movdqa %xmm1, %xmm5
- PCLMULQDQ 0x00, CONSTANT, %xmm1
- PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pclmulqdq $0x00, CONSTANT, %xmm1
+ pclmulqdq $0x11, CONSTANT, %xmm5
pxor %xmm5, %xmm1
pxor %xmm3, %xmm1
movdqa %xmm1, %xmm5
- PCLMULQDQ 0x00, CONSTANT, %xmm1
- PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pclmulqdq $0x00, CONSTANT, %xmm1
+ pclmulqdq $0x11, CONSTANT, %xmm5
pxor %xmm5, %xmm1
pxor %xmm4, %xmm1
@@ -193,8 +170,8 @@ less_64:/* Folding cache line into 128bit */
jb fold_64
loop_16:/* Folding rest buffer into 128bit */
movdqa %xmm1, %xmm5
- PCLMULQDQ 0x00, CONSTANT, %xmm1
- PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pclmulqdq $0x00, CONSTANT, %xmm1
+ pclmulqdq $0x11, CONSTANT, %xmm5
pxor %xmm5, %xmm1
pxor (BUF), %xmm1
sub $0x10, LEN
@@ -205,7 +182,7 @@ loop_16:/* Folding rest buffer into 128bit */
fold_64:
/* perform the last 64 bit fold, also adds 32 zeroes
* to the input stream */
- PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
+ pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
psrldq $0x08, %xmm1
pxor CONSTANT, %xmm1
@@ -220,7 +197,7 @@ fold_64:
#endif
psrldq $0x04, %xmm2
pand %xmm3, %xmm1
- PCLMULQDQ 0x00, CONSTANT, %xmm1
+ pclmulqdq $0x00, CONSTANT, %xmm1
pxor %xmm2, %xmm1
/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
@@ -231,11 +208,11 @@ fold_64:
#endif
movdqa %xmm1, %xmm2
pand %xmm3, %xmm1
- PCLMULQDQ 0x10, CONSTANT, %xmm1
+ pclmulqdq $0x10, CONSTANT, %xmm1
pand %xmm3, %xmm1
- PCLMULQDQ 0x00, CONSTANT, %xmm1
+ pclmulqdq $0x00, CONSTANT, %xmm1
pxor %xmm2, %xmm1
- PEXTRD 0x01, %xmm1, %eax
+ pextrd $0x01, %xmm1, %eax
- ret
+ RET
SYM_FUNC_END(crc32_pclmul_le_16)
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index 418bd88acac8..98cf3b4e4c9f 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -24,7 +24,7 @@
/*
* Copyright 2012 Xyratex Technology Limited
*
- * Wrappers for kernel crypto shash api to pclmulqdq crc32 imlementation.
+ * Wrappers for kernel crypto shash api to pclmulqdq crc32 implementation.
*/
#include <linux/init.h>
#include <linux/module.h>
@@ -170,7 +170,7 @@ static struct shash_alg alg = {
};
static const struct x86_cpu_id crc32pclmul_cpu_id[] = {
- X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ),
+ X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL),
{}
};
MODULE_DEVICE_TABLE(x86cpu, crc32pclmul_cpu_id);
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index c20d1b8a82c3..feccb5254c7e 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -28,9 +28,9 @@
#define SCALE_F sizeof(unsigned long)
#ifdef CONFIG_X86_64
-#define REX_PRE "0x48, "
+#define CRC32_INST "crc32q %1, %q0"
#else
-#define REX_PRE
+#define CRC32_INST "crc32l %1, %0"
#endif
#ifdef CONFIG_X86_64
@@ -48,11 +48,8 @@ asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
{
while (length--) {
- __asm__ __volatile__(
- ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
- :"=S"(crc)
- :"0"(crc), "c"(*data)
- );
+ asm("crc32b %1, %0"
+ : "+r" (crc) : "rm" (*data));
data++;
}
@@ -66,11 +63,8 @@ static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len
unsigned long *ptmp = (unsigned long *)p;
while (iquotient--) {
- __asm__ __volatile__(
- ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
- :"=S"(crc)
- :"0"(crc), "c"(*ptmp)
- );
+ asm(CRC32_INST
+ : "+r" (crc) : "rm" (*ptmp));
ptmp++;
}
@@ -221,7 +215,7 @@ static struct shash_alg alg = {
};
static const struct x86_cpu_id crc32c_cpu_id[] = {
- X86_FEATURE_MATCH(X86_FEATURE_XMM4_2),
+ X86_MATCH_FEATURE(X86_FEATURE_XMM4_2, NULL),
{}
};
MODULE_DEVICE_TABLE(x86cpu, crc32c_cpu_id);
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 0e6690e3618c..ec35915f0901 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -43,7 +43,6 @@
* SOFTWARE.
*/
-#include <asm/inst.h>
#include <linux/linkage.h>
#include <asm/nospec-branch.h>
@@ -54,7 +53,7 @@
.endm
.macro JMPTBL_ENTRY i
-.word crc_\i - crc_array
+.quad crc_\i
.endm
.macro JNC_LESS_THAN j
@@ -75,7 +74,7 @@
.text
SYM_FUNC_START(crc_pcl)
-#define bufp %rdi
+#define bufp rdi
#define bufp_dw %edi
#define bufp_w %di
#define bufp_b %dil
@@ -105,9 +104,9 @@ SYM_FUNC_START(crc_pcl)
## 1) ALIGN:
################################################################
- mov bufp, bufptmp # rdi = *buf
- neg bufp
- and $7, bufp # calculate the unalignment amount of
+ mov %bufp, bufptmp # rdi = *buf
+ neg %bufp
+ and $7, %bufp # calculate the unalignment amount of
# the address
je proc_block # Skip if aligned
@@ -123,13 +122,13 @@ SYM_FUNC_START(crc_pcl)
do_align:
#### Calculate CRC of unaligned bytes of the buffer (if any)
movq (bufptmp), tmp # load a quadward from the buffer
- add bufp, bufptmp # align buffer pointer for quadword
+ add %bufp, bufptmp # align buffer pointer for quadword
# processing
- sub bufp, len # update buffer length
+ sub %bufp, len # update buffer length
align_loop:
crc32b %bl, crc_init_dw # compute crc32 of 1-byte
shr $8, tmp # get next byte
- dec bufp
+ dec %bufp
jne align_loop
proc_block:
@@ -169,10 +168,7 @@ continue_block:
xor crc2, crc2
## branch into array
- lea jump_table(%rip), bufp
- movzxw (bufp, %rax, 2), len
- lea crc_array(%rip), bufp
- lea (bufp, len, 1), bufp
+ mov jump_table(,%rax,8), %bufp
JMP_NOSPEC bufp
################################################################
@@ -199,6 +195,7 @@ crc_array:
.altmacro
LABEL crc_ %i
.noaltmacro
+ ENDBR
crc32q -i*8(block_0), crc_init
crc32q -i*8(block_1), crc1
crc32q -i*8(block_2), crc2
@@ -208,6 +205,7 @@ LABEL crc_ %i
.altmacro
LABEL crc_ %i
.noaltmacro
+ ENDBR
crc32q -i*8(block_0), crc_init
crc32q -i*8(block_1), crc1
# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
@@ -218,17 +216,17 @@ LABEL crc_ %i
## 4) Combine three results:
################################################################
- lea (K_table-8)(%rip), bufp # first entry is for idx 1
+ lea (K_table-8)(%rip), %bufp # first entry is for idx 1
shlq $3, %rax # rax *= 8
- pmovzxdq (bufp,%rax), %xmm0 # 2 consts: K1:K2
+ pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2
leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
subq %rax, tmp # tmp -= rax*24
movq crc_init, %xmm1 # CRC for block 1
- PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2
+ pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
movq crc1, %xmm2 # CRC for block 2
- PCLMULQDQ 0x10, %xmm0, %xmm2 # Multiply by K1
+ pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
pxor %xmm2,%xmm1
movq %xmm1, %rax
@@ -241,6 +239,7 @@ LABEL crc_ %i
################################################################
LABEL crc_ 0
+ ENDBR
mov tmp, len
cmp $128*24, tmp
jae full_block
@@ -310,7 +309,7 @@ do_return:
popq %rsi
popq %rdi
popq %rbx
- ret
+ RET
SYM_FUNC_END(crc_pcl)
.section .rodata, "a", @progbits
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S
index b2533d63030e..721474abfb71 100644
--- a/arch/x86/crypto/crct10dif-pcl-asm_64.S
+++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -257,7 +257,7 @@ SYM_FUNC_START(crc_t10dif_pcl)
# Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
pextrw $0, %xmm0, %eax
- ret
+ RET
.align 16
.Lless_than_256_bytes:
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
index 3c81e15b0873..71291d5af9f4 100644
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -114,7 +114,7 @@ static struct shash_alg alg = {
};
static const struct x86_cpu_id crct10dif_cpu_id[] = {
- X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ),
+ X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL),
{}
};
MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id);
diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
index eec7d2d24239..d55fa9e9b9e6 100644
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -1,8 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
- * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved.
- * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
*/
#include <crypto/curve25519.h>
@@ -12,2341 +11,1592 @@
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/scatterlist.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_adx);
-
-enum { NUM_WORDS_ELTFP25519 = 4 };
-typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519];
-typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519];
-
-#define mul_eltfp25519_1w_adx(c, a, b) do { \
- mul_256x256_integer_adx(m.buffer, a, b); \
- red_eltfp25519_1w_adx(c, m.buffer); \
-} while (0)
-
-#define mul_eltfp25519_1w_bmi2(c, a, b) do { \
- mul_256x256_integer_bmi2(m.buffer, a, b); \
- red_eltfp25519_1w_bmi2(c, m.buffer); \
-} while (0)
-
-#define sqr_eltfp25519_1w_adx(a) do { \
- sqr_256x256_integer_adx(m.buffer, a); \
- red_eltfp25519_1w_adx(a, m.buffer); \
-} while (0)
-
-#define sqr_eltfp25519_1w_bmi2(a) do { \
- sqr_256x256_integer_bmi2(m.buffer, a); \
- red_eltfp25519_1w_bmi2(a, m.buffer); \
-} while (0)
-
-#define mul_eltfp25519_2w_adx(c, a, b) do { \
- mul2_256x256_integer_adx(m.buffer, a, b); \
- red_eltfp25519_2w_adx(c, m.buffer); \
-} while (0)
-
-#define mul_eltfp25519_2w_bmi2(c, a, b) do { \
- mul2_256x256_integer_bmi2(m.buffer, a, b); \
- red_eltfp25519_2w_bmi2(c, m.buffer); \
-} while (0)
-
-#define sqr_eltfp25519_2w_adx(a) do { \
- sqr2_256x256_integer_adx(m.buffer, a); \
- red_eltfp25519_2w_adx(a, m.buffer); \
-} while (0)
-
-#define sqr_eltfp25519_2w_bmi2(a) do { \
- sqr2_256x256_integer_bmi2(m.buffer, a); \
- red_eltfp25519_2w_bmi2(a, m.buffer); \
-} while (0)
-
-#define sqrn_eltfp25519_1w_adx(a, times) do { \
- int ____counter = (times); \
- while (____counter-- > 0) \
- sqr_eltfp25519_1w_adx(a); \
-} while (0)
-
-#define sqrn_eltfp25519_1w_bmi2(a, times) do { \
- int ____counter = (times); \
- while (____counter-- > 0) \
- sqr_eltfp25519_1w_bmi2(a); \
-} while (0)
-
-#define copy_eltfp25519_1w(C, A) do { \
- (C)[0] = (A)[0]; \
- (C)[1] = (A)[1]; \
- (C)[2] = (A)[2]; \
- (C)[3] = (A)[3]; \
-} while (0)
-
-#define setzero_eltfp25519_1w(C) do { \
- (C)[0] = 0; \
- (C)[1] = 0; \
- (C)[2] = 0; \
- (C)[3] = 0; \
-} while (0)
-
-__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = {
- /* 1 */ 0xfffffffffffffff3UL, 0xffffffffffffffffUL,
- 0xffffffffffffffffUL, 0x5fffffffffffffffUL,
- /* 2 */ 0x6b8220f416aafe96UL, 0x82ebeb2b4f566a34UL,
- 0xd5a9a5b075a5950fUL, 0x5142b2cf4b2488f4UL,
- /* 3 */ 0x6aaebc750069680cUL, 0x89cf7820a0f99c41UL,
- 0x2a58d9183b56d0f4UL, 0x4b5aca80e36011a4UL,
- /* 4 */ 0x329132348c29745dUL, 0xf4a2e616e1642fd7UL,
- 0x1e45bb03ff67bc34UL, 0x306912d0f42a9b4aUL,
- /* 5 */ 0xff886507e6af7154UL, 0x04f50e13dfeec82fUL,
- 0xaa512fe82abab5ceUL, 0x174e251a68d5f222UL,
- /* 6 */ 0xcf96700d82028898UL, 0x1743e3370a2c02c5UL,
- 0x379eec98b4e86eaaUL, 0x0c59888a51e0482eUL,
- /* 7 */ 0xfbcbf1d699b5d189UL, 0xacaef0d58e9fdc84UL,
- 0xc1c20d06231f7614UL, 0x2938218da274f972UL,
- /* 8 */ 0xf6af49beff1d7f18UL, 0xcc541c22387ac9c2UL,
- 0x96fcc9ef4015c56bUL, 0x69c1627c690913a9UL,
- /* 9 */ 0x7a86fd2f4733db0eUL, 0xfdb8c4f29e087de9UL,
- 0x095e4b1a8ea2a229UL, 0x1ad7a7c829b37a79UL,
- /* 10 */ 0x342d89cad17ea0c0UL, 0x67bedda6cced2051UL,
- 0x19ca31bf2bb42f74UL, 0x3df7b4c84980acbbUL,
- /* 11 */ 0xa8c6444dc80ad883UL, 0xb91e440366e3ab85UL,
- 0xc215cda00164f6d8UL, 0x3d867c6ef247e668UL,
- /* 12 */ 0xc7dd582bcc3e658cUL, 0xfd2c4748ee0e5528UL,
- 0xa0fd9b95cc9f4f71UL, 0x7529d871b0675ddfUL,
- /* 13 */ 0xb8f568b42d3cbd78UL, 0x1233011b91f3da82UL,
- 0x2dce6ccd4a7c3b62UL, 0x75e7fc8e9e498603UL,
- /* 14 */ 0x2f4f13f1fcd0b6ecUL, 0xf1a8ca1f29ff7a45UL,
- 0xc249c1a72981e29bUL, 0x6ebe0dbb8c83b56aUL,
- /* 15 */ 0x7114fa8d170bb222UL, 0x65a2dcd5bf93935fUL,
- 0xbdc41f68b59c979aUL, 0x2f0eef79a2ce9289UL,
- /* 16 */ 0x42ecbf0c083c37ceUL, 0x2930bc09ec496322UL,
- 0xf294b0c19cfeac0dUL, 0x3780aa4bedfabb80UL,
- /* 17 */ 0x56c17d3e7cead929UL, 0xe7cb4beb2e5722c5UL,
- 0x0ce931732dbfe15aUL, 0x41b883c7621052f8UL,
- /* 18 */ 0xdbf75ca0c3d25350UL, 0x2936be086eb1e351UL,
- 0xc936e03cb4a9b212UL, 0x1d45bf82322225aaUL,
- /* 19 */ 0xe81ab1036a024cc5UL, 0xe212201c304c9a72UL,
- 0xc5d73fba6832b1fcUL, 0x20ffdb5a4d839581UL,
- /* 20 */ 0xa283d367be5d0fadUL, 0x6c2b25ca8b164475UL,
- 0x9d4935467caaf22eUL, 0x5166408eee85ff49UL,
- /* 21 */ 0x3c67baa2fab4e361UL, 0xb3e433c67ef35cefUL,
- 0x5259729241159b1cUL, 0x6a621892d5b0ab33UL,
- /* 22 */ 0x20b74a387555cdcbUL, 0x532aa10e1208923fUL,
- 0xeaa17b7762281dd1UL, 0x61ab3443f05c44bfUL,
- /* 23 */ 0x257a6c422324def8UL, 0x131c6c1017e3cf7fUL,
- 0x23758739f630a257UL, 0x295a407a01a78580UL,
- /* 24 */ 0xf8c443246d5da8d9UL, 0x19d775450c52fa5dUL,
- 0x2afcfc92731bf83dUL, 0x7d10c8e81b2b4700UL,
- /* 25 */ 0xc8e0271f70baa20bUL, 0x993748867ca63957UL,
- 0x5412efb3cb7ed4bbUL, 0x3196d36173e62975UL,
- /* 26 */ 0xde5bcad141c7dffcUL, 0x47cc8cd2b395c848UL,
- 0xa34cd942e11af3cbUL, 0x0256dbf2d04ecec2UL,
- /* 27 */ 0x875ab7e94b0e667fUL, 0xcad4dd83c0850d10UL,
- 0x47f12e8f4e72c79fUL, 0x5f1a87bb8c85b19bUL,
- /* 28 */ 0x7ae9d0b6437f51b8UL, 0x12c7ce5518879065UL,
- 0x2ade09fe5cf77aeeUL, 0x23a05a2f7d2c5627UL,
- /* 29 */ 0x5908e128f17c169aUL, 0xf77498dd8ad0852dUL,
- 0x74b4c4ceab102f64UL, 0x183abadd10139845UL,
- /* 30 */ 0xb165ba8daa92aaacUL, 0xd5c5ef9599386705UL,
- 0xbe2f8f0cf8fc40d1UL, 0x2701e635ee204514UL,
- /* 31 */ 0x629fa80020156514UL, 0xf223868764a8c1ceUL,
- 0x5b894fff0b3f060eUL, 0x60d9944cf708a3faUL,
- /* 32 */ 0xaeea001a1c7a201fUL, 0xebf16a633ee2ce63UL,
- 0x6f7709594c7a07e1UL, 0x79b958150d0208cbUL,
- /* 33 */ 0x24b55e5301d410e7UL, 0xe3a34edff3fdc84dUL,
- 0xd88768e4904032d8UL, 0x131384427b3aaeecUL,
- /* 34 */ 0x8405e51286234f14UL, 0x14dc4739adb4c529UL,
- 0xb8a2b5b250634ffdUL, 0x2fe2a94ad8a7ff93UL,
- /* 35 */ 0xec5c57efe843faddUL, 0x2843ce40f0bb9918UL,
- 0xa4b561d6cf3d6305UL, 0x743629bde8fb777eUL,
- /* 36 */ 0x343edd46bbaf738fUL, 0xed981828b101a651UL,
- 0xa401760b882c797aUL, 0x1fc223e28dc88730UL,
- /* 37 */ 0x48604e91fc0fba0eUL, 0xb637f78f052c6fa4UL,
- 0x91ccac3d09e9239cUL, 0x23f7eed4437a687cUL,
- /* 38 */ 0x5173b1118d9bd800UL, 0x29d641b63189d4a7UL,
- 0xfdbf177988bbc586UL, 0x2959894fcad81df5UL,
- /* 39 */ 0xaebc8ef3b4bbc899UL, 0x4148995ab26992b9UL,
- 0x24e20b0134f92cfbUL, 0x40d158894a05dee8UL,
- /* 40 */ 0x46b00b1185af76f6UL, 0x26bac77873187a79UL,
- 0x3dc0bf95ab8fff5fUL, 0x2a608bd8945524d7UL,
- /* 41 */ 0x26449588bd446302UL, 0x7c4bc21c0388439cUL,
- 0x8e98a4f383bd11b2UL, 0x26218d7bc9d876b9UL,
- /* 42 */ 0xe3081542997c178aUL, 0x3c2d29a86fb6606fUL,
- 0x5c217736fa279374UL, 0x7dde05734afeb1faUL,
- /* 43 */ 0x3bf10e3906d42babUL, 0xe4f7803e1980649cUL,
- 0xe6053bf89595bf7aUL, 0x394faf38da245530UL,
- /* 44 */ 0x7a8efb58896928f4UL, 0xfbc778e9cc6a113cUL,
- 0x72670ce330af596fUL, 0x48f222a81d3d6cf7UL,
- /* 45 */ 0xf01fce410d72caa7UL, 0x5a20ecc7213b5595UL,
- 0x7bc21165c1fa1483UL, 0x07f89ae31da8a741UL,
- /* 46 */ 0x05d2c2b4c6830ff9UL, 0xd43e330fc6316293UL,
- 0xa5a5590a96d3a904UL, 0x705edb91a65333b6UL,
- /* 47 */ 0x048ee15e0bb9a5f7UL, 0x3240cfca9e0aaf5dUL,
- 0x8f4b71ceedc4a40bUL, 0x621c0da3de544a6dUL,
- /* 48 */ 0x92872836a08c4091UL, 0xce8375b010c91445UL,
- 0x8a72eb524f276394UL, 0x2667fcfa7ec83635UL,
- /* 49 */ 0x7f4c173345e8752aUL, 0x061b47feee7079a5UL,
- 0x25dd9afa9f86ff34UL, 0x3780cef5425dc89cUL,
- /* 50 */ 0x1a46035a513bb4e9UL, 0x3e1ef379ac575adaUL,
- 0xc78c5f1c5fa24b50UL, 0x321a967634fd9f22UL,
- /* 51 */ 0x946707b8826e27faUL, 0x3dca84d64c506fd0UL,
- 0xc189218075e91436UL, 0x6d9284169b3b8484UL,
- /* 52 */ 0x3a67e840383f2ddfUL, 0x33eec9a30c4f9b75UL,
- 0x3ec7c86fa783ef47UL, 0x26ec449fbac9fbc4UL,
- /* 53 */ 0x5c0f38cba09b9e7dUL, 0x81168cc762a3478cUL,
- 0x3e23b0d306fc121cUL, 0x5a238aa0a5efdcddUL,
- /* 54 */ 0x1ba26121c4ea43ffUL, 0x36f8c77f7c8832b5UL,
- 0x88fbea0b0adcf99aUL, 0x5ca9938ec25bebf9UL,
- /* 55 */ 0xd5436a5e51fccda0UL, 0x1dbc4797c2cd893bUL,
- 0x19346a65d3224a08UL, 0x0f5034e49b9af466UL,
- /* 56 */ 0xf23c3967a1e0b96eUL, 0xe58b08fa867a4d88UL,
- 0xfb2fabc6a7341679UL, 0x2a75381eb6026946UL,
- /* 57 */ 0xc80a3be4c19420acUL, 0x66b1f6c681f2b6dcUL,
- 0x7cf7036761e93388UL, 0x25abbbd8a660a4c4UL,
- /* 58 */ 0x91ea12ba14fd5198UL, 0x684950fc4a3cffa9UL,
- 0xf826842130f5ad28UL, 0x3ea988f75301a441UL,
- /* 59 */ 0xc978109a695f8c6fUL, 0x1746eb4a0530c3f3UL,
- 0x444d6d77b4459995UL, 0x75952b8c054e5cc7UL,
- /* 60 */ 0xa3703f7915f4d6aaUL, 0x66c346202f2647d8UL,
- 0xd01469df811d644bUL, 0x77fea47d81a5d71fUL,
- /* 61 */ 0xc5e9529ef57ca381UL, 0x6eeeb4b9ce2f881aUL,
- 0xb6e91a28e8009bd6UL, 0x4b80be3e9afc3fecUL,
- /* 62 */ 0x7e3773c526aed2c5UL, 0x1b4afcb453c9a49dUL,
- 0xa920bdd7baffb24dUL, 0x7c54699f122d400eUL,
- /* 63 */ 0xef46c8e14fa94bc8UL, 0xe0b074ce2952ed5eUL,
- 0xbea450e1dbd885d5UL, 0x61b68649320f712cUL,
- /* 64 */ 0x8a485f7309ccbdd1UL, 0xbd06320d7d4d1a2dUL,
- 0x25232973322dbef4UL, 0x445dc4758c17f770UL,
- /* 65 */ 0xdb0434177cc8933cUL, 0xed6fe82175ea059fUL,
- 0x1efebefdc053db34UL, 0x4adbe867c65daf99UL,
- /* 66 */ 0x3acd71a2a90609dfUL, 0xe5e991856dd04050UL,
- 0x1ec69b688157c23cUL, 0x697427f6885cfe4dUL,
- /* 67 */ 0xd7be7b9b65e1a851UL, 0xa03d28d522c536ddUL,
- 0x28399d658fd2b645UL, 0x49e5b7e17c2641e1UL,
- /* 68 */ 0x6f8c3a98700457a4UL, 0x5078f0a25ebb6778UL,
- 0xd13c3ccbc382960fUL, 0x2e003258a7df84b1UL,
- /* 69 */ 0x8ad1f39be6296a1cUL, 0xc1eeaa652a5fbfb2UL,
- 0x33ee0673fd26f3cbUL, 0x59256173a69d2cccUL,
- /* 70 */ 0x41ea07aa4e18fc41UL, 0xd9fc19527c87a51eUL,
- 0xbdaacb805831ca6fUL, 0x445b652dc916694fUL,
- /* 71 */ 0xce92a3a7f2172315UL, 0x1edc282de11b9964UL,
- 0xa1823aafe04c314aUL, 0x790a2d94437cf586UL,
- /* 72 */ 0x71c447fb93f6e009UL, 0x8922a56722845276UL,
- 0xbf70903b204f5169UL, 0x2f7a89891ba319feUL,
- /* 73 */ 0x02a08eb577e2140cUL, 0xed9a4ed4427bdcf4UL,
- 0x5253ec44e4323cd1UL, 0x3e88363c14e9355bUL,
- /* 74 */ 0xaa66c14277110b8cUL, 0x1ae0391610a23390UL,
- 0x2030bd12c93fc2a2UL, 0x3ee141579555c7abUL,
- /* 75 */ 0x9214de3a6d6e7d41UL, 0x3ccdd88607f17efeUL,
- 0x674f1288f8e11217UL, 0x5682250f329f93d0UL,
- /* 76 */ 0x6cf00b136d2e396eUL, 0x6e4cf86f1014debfUL,
- 0x5930b1b5bfcc4e83UL, 0x047069b48aba16b6UL,
- /* 77 */ 0x0d4ce4ab69b20793UL, 0xb24db91a97d0fb9eUL,
- 0xcdfa50f54e00d01dUL, 0x221b1085368bddb5UL,
- /* 78 */ 0xe7e59468b1e3d8d2UL, 0x53c56563bd122f93UL,
- 0xeee8a903e0663f09UL, 0x61efa662cbbe3d42UL,
- /* 79 */ 0x2cf8ddddde6eab2aUL, 0x9bf80ad51435f231UL,
- 0x5deadacec9f04973UL, 0x29275b5d41d29b27UL,
- /* 80 */ 0xcfde0f0895ebf14fUL, 0xb9aab96b054905a7UL,
- 0xcae80dd9a1c420fdUL, 0x0a63bf2f1673bbc7UL,
- /* 81 */ 0x092f6e11958fbc8cUL, 0x672a81e804822fadUL,
- 0xcac8351560d52517UL, 0x6f3f7722c8f192f8UL,
- /* 82 */ 0xf8ba90ccc2e894b7UL, 0x2c7557a438ff9f0dUL,
- 0x894d1d855ae52359UL, 0x68e122157b743d69UL,
- /* 83 */ 0xd87e5570cfb919f3UL, 0x3f2cdecd95798db9UL,
- 0x2121154710c0a2ceUL, 0x3c66a115246dc5b2UL,
- /* 84 */ 0xcbedc562294ecb72UL, 0xba7143c36a280b16UL,
- 0x9610c2efd4078b67UL, 0x6144735d946a4b1eUL,
- /* 85 */ 0x536f111ed75b3350UL, 0x0211db8c2041d81bUL,
- 0xf93cb1000e10413cUL, 0x149dfd3c039e8876UL,
- /* 86 */ 0xd479dde46b63155bUL, 0xb66e15e93c837976UL,
- 0xdafde43b1f13e038UL, 0x5fafda1a2e4b0b35UL,
- /* 87 */ 0x3600bbdf17197581UL, 0x3972050bbe3cd2c2UL,
- 0x5938906dbdd5be86UL, 0x34fce5e43f9b860fUL,
- /* 88 */ 0x75a8a4cd42d14d02UL, 0x828dabc53441df65UL,
- 0x33dcabedd2e131d3UL, 0x3ebad76fb814d25fUL,
- /* 89 */ 0xd4906f566f70e10fUL, 0x5d12f7aa51690f5aUL,
- 0x45adb16e76cefcf2UL, 0x01f768aead232999UL,
- /* 90 */ 0x2b6cc77b6248febdUL, 0x3cd30628ec3aaffdUL,
- 0xce1c0b80d4ef486aUL, 0x4c3bff2ea6f66c23UL,
- /* 91 */ 0x3f2ec4094aeaeb5fUL, 0x61b19b286e372ca7UL,
- 0x5eefa966de2a701dUL, 0x23b20565de55e3efUL,
- /* 92 */ 0xe301ca5279d58557UL, 0x07b2d4ce27c2874fUL,
- 0xa532cd8a9dcf1d67UL, 0x2a52fee23f2bff56UL,
- /* 93 */ 0x8624efb37cd8663dUL, 0xbbc7ac20ffbd7594UL,
- 0x57b85e9c82d37445UL, 0x7b3052cb86a6ec66UL,
- /* 94 */ 0x3482f0ad2525e91eUL, 0x2cb68043d28edca0UL,
- 0xaf4f6d052e1b003aUL, 0x185f8c2529781b0aUL,
- /* 95 */ 0xaa41de5bd80ce0d6UL, 0x9407b2416853e9d6UL,
- 0x563ec36e357f4c3aUL, 0x4cc4b8dd0e297bceUL,
- /* 96 */ 0xa2fc1a52ffb8730eUL, 0x1811f16e67058e37UL,
- 0x10f9a366cddf4ee1UL, 0x72f4a0c4a0b9f099UL,
- /* 97 */ 0x8c16c06f663f4ea7UL, 0x693b3af74e970fbaUL,
- 0x2102e7f1d69ec345UL, 0x0ba53cbc968a8089UL,
- /* 98 */ 0xca3d9dc7fea15537UL, 0x4c6824bb51536493UL,
- 0xb9886314844006b1UL, 0x40d2a72ab454cc60UL,
- /* 99 */ 0x5936a1b712570975UL, 0x91b9d648debda657UL,
- 0x3344094bb64330eaUL, 0x006ba10d12ee51d0UL,
- /* 100 */ 0x19228468f5de5d58UL, 0x0eb12f4c38cc05b0UL,
- 0xa1039f9dd5601990UL, 0x4502d4ce4fff0e0bUL,
- /* 101 */ 0xeb2054106837c189UL, 0xd0f6544c6dd3b93cUL,
- 0x40727064c416d74fUL, 0x6e15c6114b502ef0UL,
- /* 102 */ 0x4df2a398cfb1a76bUL, 0x11256c7419f2f6b1UL,
- 0x4a497962066e6043UL, 0x705b3aab41355b44UL,
- /* 103 */ 0x365ef536d797b1d8UL, 0x00076bd622ddf0dbUL,
- 0x3bbf33b0e0575a88UL, 0x3777aa05c8e4ca4dUL,
- /* 104 */ 0x392745c85578db5fUL, 0x6fda4149dbae5ae2UL,
- 0xb1f0b00b8adc9867UL, 0x09963437d36f1da3UL,
- /* 105 */ 0x7e824e90a5dc3853UL, 0xccb5f6641f135cbdUL,
- 0x6736d86c87ce8fccUL, 0x625f3ce26604249fUL,
- /* 106 */ 0xaf8ac8059502f63fUL, 0x0c05e70a2e351469UL,
- 0x35292e9c764b6305UL, 0x1a394360c7e23ac3UL,
- /* 107 */ 0xd5c6d53251183264UL, 0x62065abd43c2b74fUL,
- 0xb5fbf5d03b973f9bUL, 0x13a3da3661206e5eUL,
- /* 108 */ 0xc6bd5837725d94e5UL, 0x18e30912205016c5UL,
- 0x2088ce1570033c68UL, 0x7fba1f495c837987UL,
- /* 109 */ 0x5a8c7423f2f9079dUL, 0x1735157b34023fc5UL,
- 0xe4f9b49ad2fab351UL, 0x6691ff72c878e33cUL,
- /* 110 */ 0x122c2adedc5eff3eUL, 0xf8dd4bf1d8956cf4UL,
- 0xeb86205d9e9e5bdaUL, 0x049b92b9d975c743UL,
- /* 111 */ 0xa5379730b0f6c05aUL, 0x72a0ffacc6f3a553UL,
- 0xb0032c34b20dcd6dUL, 0x470e9dbc88d5164aUL,
- /* 112 */ 0xb19cf10ca237c047UL, 0xb65466711f6c81a2UL,
- 0xb3321bd16dd80b43UL, 0x48c14f600c5fbe8eUL,
- /* 113 */ 0x66451c264aa6c803UL, 0xb66e3904a4fa7da6UL,
- 0xd45f19b0b3128395UL, 0x31602627c3c9bc10UL,
- /* 114 */ 0x3120dc4832e4e10dUL, 0xeb20c46756c717f7UL,
- 0x00f52e3f67280294UL, 0x566d4fc14730c509UL,
- /* 115 */ 0x7e3a5d40fd837206UL, 0xc1e926dc7159547aUL,
- 0x216730fba68d6095UL, 0x22e8c3843f69cea7UL,
- /* 116 */ 0x33d074e8930e4b2bUL, 0xb6e4350e84d15816UL,
- 0x5534c26ad6ba2365UL, 0x7773c12f89f1f3f3UL,
- /* 117 */ 0x8cba404da57962aaUL, 0x5b9897a81999ce56UL,
- 0x508e862f121692fcUL, 0x3a81907fa093c291UL,
- /* 118 */ 0x0dded0ff4725a510UL, 0x10d8cc10673fc503UL,
- 0x5b9d151c9f1f4e89UL, 0x32a5c1d5cb09a44cUL,
- /* 119 */ 0x1e0aa442b90541fbUL, 0x5f85eb7cc1b485dbUL,
- 0xbee595ce8a9df2e5UL, 0x25e496c722422236UL,
- /* 120 */ 0x5edf3c46cd0fe5b9UL, 0x34e75a7ed2a43388UL,
- 0xe488de11d761e352UL, 0x0e878a01a085545cUL,
- /* 121 */ 0xba493c77e021bb04UL, 0x2b4d1843c7df899aUL,
- 0x9ea37a487ae80d67UL, 0x67a9958011e41794UL,
- /* 122 */ 0x4b58051a6697b065UL, 0x47e33f7d8d6ba6d4UL,
- 0xbb4da8d483ca46c1UL, 0x68becaa181c2db0dUL,
- /* 123 */ 0x8d8980e90b989aa5UL, 0xf95eb14a2c93c99bUL,
- 0x51c6c7c4796e73a2UL, 0x6e228363b5efb569UL,
- /* 124 */ 0xc6bbc0b02dd624c8UL, 0x777eb47dec8170eeUL,
- 0x3cde15a004cfafa9UL, 0x1dc6bc087160bf9bUL,
- /* 125 */ 0x2e07e043eec34002UL, 0x18e9fc677a68dc7fUL,
- 0xd8da03188bd15b9aUL, 0x48fbc3bb00568253UL,
- /* 126 */ 0x57547d4cfb654ce1UL, 0xd3565b82a058e2adUL,
- 0xf63eaf0bbf154478UL, 0x47531ef114dfbb18UL,
- /* 127 */ 0xe1ec630a4278c587UL, 0x5507d546ca8e83f3UL,
- 0x85e135c63adc0c2bUL, 0x0aa7efa85682844eUL,
- /* 128 */ 0x72691ba8b3e1f615UL, 0x32b4e9701fbe3ffaUL,
- 0x97b6d92e39bb7868UL, 0x2cfe53dea02e39e8UL,
- /* 129 */ 0x687392cd85cd52b0UL, 0x27ff66c910e29831UL,
- 0x97134556a9832d06UL, 0x269bb0360a84f8a0UL,
- /* 130 */ 0x706e55457643f85cUL, 0x3734a48c9b597d1bUL,
- 0x7aee91e8c6efa472UL, 0x5cd6abc198a9d9e0UL,
- /* 131 */ 0x0e04de06cb3ce41aUL, 0xd8c6eb893402e138UL,
- 0x904659bb686e3772UL, 0x7215c371746ba8c8UL,
- /* 132 */ 0xfd12a97eeae4a2d9UL, 0x9514b7516394f2c5UL,
- 0x266fd5809208f294UL, 0x5c847085619a26b9UL,
- /* 133 */ 0x52985410fed694eaUL, 0x3c905b934a2ed254UL,
- 0x10bb47692d3be467UL, 0x063b3d2d69e5e9e1UL,
- /* 134 */ 0x472726eedda57debUL, 0xefb6c4ae10f41891UL,
- 0x2b1641917b307614UL, 0x117c554fc4f45b7cUL,
- /* 135 */ 0xc07cf3118f9d8812UL, 0x01dbd82050017939UL,
- 0xd7e803f4171b2827UL, 0x1015e87487d225eaUL,
- /* 136 */ 0xc58de3fed23acc4dUL, 0x50db91c294a7be2dUL,
- 0x0b94d43d1c9cf457UL, 0x6b1640fa6e37524aUL,
- /* 137 */ 0x692f346c5fda0d09UL, 0x200b1c59fa4d3151UL,
- 0xb8c46f760777a296UL, 0x4b38395f3ffdfbcfUL,
- /* 138 */ 0x18d25e00be54d671UL, 0x60d50582bec8aba6UL,
- 0x87ad8f263b78b982UL, 0x50fdf64e9cda0432UL,
- /* 139 */ 0x90f567aac578dcf0UL, 0xef1e9b0ef2a3133bUL,
- 0x0eebba9242d9de71UL, 0x15473c9bf03101c7UL,
- /* 140 */ 0x7c77e8ae56b78095UL, 0xb678e7666e6f078eUL,
- 0x2da0b9615348ba1fUL, 0x7cf931c1ff733f0bUL,
- /* 141 */ 0x26b357f50a0a366cUL, 0xe9708cf42b87d732UL,
- 0xc13aeea5f91cb2c0UL, 0x35d90c991143bb4cUL,
- /* 142 */ 0x47c1c404a9a0d9dcUL, 0x659e58451972d251UL,
- 0x3875a8c473b38c31UL, 0x1fbd9ed379561f24UL,
- /* 143 */ 0x11fabc6fd41ec28dUL, 0x7ef8dfe3cd2a2dcaUL,
- 0x72e73b5d8c404595UL, 0x6135fa4954b72f27UL,
- /* 144 */ 0xccfc32a2de24b69cUL, 0x3f55698c1f095d88UL,
- 0xbe3350ed5ac3f929UL, 0x5e9bf806ca477eebUL,
- /* 145 */ 0xe9ce8fb63c309f68UL, 0x5376f63565e1f9f4UL,
- 0xd1afcfb35a6393f1UL, 0x6632a1ede5623506UL,
- /* 146 */ 0x0b7d6c390c2ded4cUL, 0x56cb3281df04cb1fUL,
- 0x66305a1249ecc3c7UL, 0x5d588b60a38ca72aUL,
- /* 147 */ 0xa6ecbf78e8e5f42dUL, 0x86eeb44b3c8a3eecUL,
- 0xec219c48fbd21604UL, 0x1aaf1af517c36731UL,
- /* 148 */ 0xc306a2836769bde7UL, 0x208280622b1e2adbUL,
- 0x8027f51ffbff94a6UL, 0x76cfa1ce1124f26bUL,
- /* 149 */ 0x18eb00562422abb6UL, 0xf377c4d58f8c29c3UL,
- 0x4dbbc207f531561aUL, 0x0253b7f082128a27UL,
- /* 150 */ 0x3d1f091cb62c17e0UL, 0x4860e1abd64628a9UL,
- 0x52d17436309d4253UL, 0x356f97e13efae576UL,
- /* 151 */ 0xd351e11aa150535bUL, 0x3e6b45bb1dd878ccUL,
- 0x0c776128bed92c98UL, 0x1d34ae93032885b8UL,
- /* 152 */ 0x4ba0488ca85ba4c3UL, 0x985348c33c9ce6ceUL,
- 0x66124c6f97bda770UL, 0x0f81a0290654124aUL,
- /* 153 */ 0x9ed09ca6569b86fdUL, 0x811009fd18af9a2dUL,
- 0xff08d03f93d8c20aUL, 0x52a148199faef26bUL,
- /* 154 */ 0x3e03f9dc2d8d1b73UL, 0x4205801873961a70UL,
- 0xc0d987f041a35970UL, 0x07aa1f15a1c0d549UL,
- /* 155 */ 0xdfd46ce08cd27224UL, 0x6d0a024f934e4239UL,
- 0x808a7a6399897b59UL, 0x0a4556e9e13d95a2UL,
- /* 156 */ 0xd21a991fe9c13045UL, 0x9b0e8548fe7751b8UL,
- 0x5da643cb4bf30035UL, 0x77db28d63940f721UL,
- /* 157 */ 0xfc5eeb614adc9011UL, 0x5229419ae8c411ebUL,
- 0x9ec3e7787d1dcf74UL, 0x340d053e216e4cb5UL,
- /* 158 */ 0xcac7af39b48df2b4UL, 0xc0faec2871a10a94UL,
- 0x140a69245ca575edUL, 0x0cf1c37134273a4cUL,
- /* 159 */ 0xc8ee306ac224b8a5UL, 0x57eaee7ccb4930b0UL,
- 0xa1e806bdaacbe74fUL, 0x7d9a62742eeb657dUL,
- /* 160 */ 0x9eb6b6ef546c4830UL, 0x885cca1fddb36e2eUL,
- 0xe6b9f383ef0d7105UL, 0x58654fef9d2e0412UL,
- /* 161 */ 0xa905c4ffbe0e8e26UL, 0x942de5df9b31816eUL,
- 0x497d723f802e88e1UL, 0x30684dea602f408dUL,
- /* 162 */ 0x21e5a278a3e6cb34UL, 0xaefb6e6f5b151dc4UL,
- 0xb30b8e049d77ca15UL, 0x28c3c9cf53b98981UL,
- /* 163 */ 0x287fb721556cdd2aUL, 0x0d317ca897022274UL,
- 0x7468c7423a543258UL, 0x4a7f11464eb5642fUL,
- /* 164 */ 0xa237a4774d193aa6UL, 0xd865986ea92129a1UL,
- 0x24c515ecf87c1a88UL, 0x604003575f39f5ebUL,
- /* 165 */ 0x47b9f189570a9b27UL, 0x2b98cede465e4b78UL,
- 0x026df551dbb85c20UL, 0x74fcd91047e21901UL,
- /* 166 */ 0x13e2a90a23c1bfa3UL, 0x0cb0074e478519f6UL,
- 0x5ff1cbbe3af6cf44UL, 0x67fe5438be812dbeUL,
- /* 167 */ 0xd13cf64fa40f05b0UL, 0x054dfb2f32283787UL,
- 0x4173915b7f0d2aeaUL, 0x482f144f1f610d4eUL,
- /* 168 */ 0xf6210201b47f8234UL, 0x5d0ae1929e70b990UL,
- 0xdcd7f455b049567cUL, 0x7e93d0f1f0916f01UL,
- /* 169 */ 0xdd79cbf18a7db4faUL, 0xbe8391bf6f74c62fUL,
- 0x027145d14b8291bdUL, 0x585a73ea2cbf1705UL,
- /* 170 */ 0x485ca03e928a0db2UL, 0x10fc01a5742857e7UL,
- 0x2f482edbd6d551a7UL, 0x0f0433b5048fdb8aUL,
- /* 171 */ 0x60da2e8dd7dc6247UL, 0x88b4c9d38cd4819aUL,
- 0x13033ac001f66697UL, 0x273b24fe3b367d75UL,
- /* 172 */ 0xc6e8f66a31b3b9d4UL, 0x281514a494df49d5UL,
- 0xd1726fdfc8b23da7UL, 0x4b3ae7d103dee548UL,
- /* 173 */ 0xc6256e19ce4b9d7eUL, 0xff5c5cf186e3c61cUL,
- 0xacc63ca34b8ec145UL, 0x74621888fee66574UL,
- /* 174 */ 0x956f409645290a1eUL, 0xef0bf8e3263a962eUL,
- 0xed6a50eb5ec2647bUL, 0x0694283a9dca7502UL,
- /* 175 */ 0x769b963643a2dcd1UL, 0x42b7c8ea09fc5353UL,
- 0x4f002aee13397eabUL, 0x63005e2c19b7d63aUL,
- /* 176 */ 0xca6736da63023beaUL, 0x966c7f6db12a99b7UL,
- 0xace09390c537c5e1UL, 0x0b696063a1aa89eeUL,
- /* 177 */ 0xebb03e97288c56e5UL, 0x432a9f9f938c8be8UL,
- 0xa6a5a93d5b717f71UL, 0x1a5fb4c3e18f9d97UL,
- /* 178 */ 0x1c94e7ad1c60cdceUL, 0xee202a43fc02c4a0UL,
- 0x8dafe4d867c46a20UL, 0x0a10263c8ac27b58UL,
- /* 179 */ 0xd0dea9dfe4432a4aUL, 0x856af87bbe9277c5UL,
- 0xce8472acc212c71aUL, 0x6f151b6d9bbb1e91UL,
- /* 180 */ 0x26776c527ceed56aUL, 0x7d211cb7fbf8faecUL,
- 0x37ae66a6fd4609ccUL, 0x1f81b702d2770c42UL,
- /* 181 */ 0x2fb0b057eac58392UL, 0xe1dd89fe29744e9dUL,
- 0xc964f8eb17beb4f8UL, 0x29571073c9a2d41eUL,
- /* 182 */ 0xa948a18981c0e254UL, 0x2df6369b65b22830UL,
- 0xa33eb2d75fcfd3c6UL, 0x078cd6ec4199a01fUL,
- /* 183 */ 0x4a584a41ad900d2fUL, 0x32142b78e2c74c52UL,
- 0x68c4e8338431c978UL, 0x7f69ea9008689fc2UL,
- /* 184 */ 0x52f2c81e46a38265UL, 0xfd78072d04a832fdUL,
- 0x8cd7d5fa25359e94UL, 0x4de71b7454cc29d2UL,
- /* 185 */ 0x42eb60ad1eda6ac9UL, 0x0aad37dfdbc09c3aUL,
- 0x81004b71e33cc191UL, 0x44e6be345122803cUL,
- /* 186 */ 0x03fe8388ba1920dbUL, 0xf5d57c32150db008UL,
- 0x49c8c4281af60c29UL, 0x21edb518de701aeeUL,
- /* 187 */ 0x7fb63e418f06dc99UL, 0xa4460d99c166d7b8UL,
- 0x24dd5248ce520a83UL, 0x5ec3ad712b928358UL,
- /* 188 */ 0x15022a5fbd17930fUL, 0xa4f64a77d82570e3UL,
- 0x12bc8d6915783712UL, 0x498194c0fc620abbUL,
- /* 189 */ 0x38a2d9d255686c82UL, 0x785c6bd9193e21f0UL,
- 0xe4d5c81ab24a5484UL, 0x56307860b2e20989UL,
- /* 190 */ 0x429d55f78b4d74c4UL, 0x22f1834643350131UL,
- 0x1e60c24598c71fffUL, 0x59f2f014979983efUL,
- /* 191 */ 0x46a47d56eb494a44UL, 0x3e22a854d636a18eUL,
- 0xb346e15274491c3bUL, 0x2ceafd4e5390cde7UL,
- /* 192 */ 0xba8a8538be0d6675UL, 0x4b9074bb50818e23UL,
- 0xcbdab89085d304c3UL, 0x61a24fe0e56192c4UL,
- /* 193 */ 0xcb7615e6db525bcbUL, 0xdd7d8c35a567e4caUL,
- 0xe6b4153acafcdd69UL, 0x2d668e097f3c9766UL,
- /* 194 */ 0xa57e7e265ce55ef0UL, 0x5d9f4e527cd4b967UL,
- 0xfbc83606492fd1e5UL, 0x090d52beb7c3f7aeUL,
- /* 195 */ 0x09b9515a1e7b4d7cUL, 0x1f266a2599da44c0UL,
- 0xa1c49548e2c55504UL, 0x7ef04287126f15ccUL,
- /* 196 */ 0xfed1659dbd30ef15UL, 0x8b4ab9eec4e0277bUL,
- 0x884d6236a5df3291UL, 0x1fd96ea6bf5cf788UL,
- /* 197 */ 0x42a161981f190d9aUL, 0x61d849507e6052c1UL,
- 0x9fe113bf285a2cd5UL, 0x7c22d676dbad85d8UL,
- /* 198 */ 0x82e770ed2bfbd27dUL, 0x4c05b2ece996f5a5UL,
- 0xcd40a9c2b0900150UL, 0x5895319213d9bf64UL,
- /* 199 */ 0xe7cc5d703fea2e08UL, 0xb50c491258e2188cUL,
- 0xcce30baa48205bf0UL, 0x537c659ccfa32d62UL,
- /* 200 */ 0x37b6623a98cfc088UL, 0xfe9bed1fa4d6aca4UL,
- 0x04d29b8e56a8d1b0UL, 0x725f71c40b519575UL,
- /* 201 */ 0x28c7f89cd0339ce6UL, 0x8367b14469ddc18bUL,
- 0x883ada83a6a1652cUL, 0x585f1974034d6c17UL,
- /* 202 */ 0x89cfb266f1b19188UL, 0xe63b4863e7c35217UL,
- 0xd88c9da6b4c0526aUL, 0x3e035c9df0954635UL,
- /* 203 */ 0xdd9d5412fb45de9dUL, 0xdd684532e4cff40dUL,
- 0x4b5c999b151d671cUL, 0x2d8c2cc811e7f690UL,
- /* 204 */ 0x7f54be1d90055d40UL, 0xa464c5df464aaf40UL,
- 0x33979624f0e917beUL, 0x2c018dc527356b30UL,
- /* 205 */ 0xa5415024e330b3d4UL, 0x73ff3d96691652d3UL,
- 0x94ec42c4ef9b59f1UL, 0x0747201618d08e5aUL,
- /* 206 */ 0x4d6ca48aca411c53UL, 0x66415f2fcfa66119UL,
- 0x9c4dd40051e227ffUL, 0x59810bc09a02f7ebUL,
- /* 207 */ 0x2a7eb171b3dc101dUL, 0x441c5ab99ffef68eUL,
- 0x32025c9b93b359eaUL, 0x5e8ce0a71e9d112fUL,
- /* 208 */ 0xbfcccb92429503fdUL, 0xd271ba752f095d55UL,
- 0x345ead5e972d091eUL, 0x18c8df11a83103baUL,
- /* 209 */ 0x90cd949a9aed0f4cUL, 0xc5d1f4cb6660e37eUL,
- 0xb8cac52d56c52e0bUL, 0x6e42e400c5808e0dUL,
- /* 210 */ 0xa3b46966eeaefd23UL, 0x0c4f1f0be39ecdcaUL,
- 0x189dc8c9d683a51dUL, 0x51f27f054c09351bUL,
- /* 211 */ 0x4c487ccd2a320682UL, 0x587ea95bb3df1c96UL,
- 0xc8ccf79e555cb8e8UL, 0x547dc829a206d73dUL,
- /* 212 */ 0xb822a6cd80c39b06UL, 0xe96d54732000d4c6UL,
- 0x28535b6f91463b4dUL, 0x228f4660e2486e1dUL,
- /* 213 */ 0x98799538de8d3abfUL, 0x8cd8330045ebca6eUL,
- 0x79952a008221e738UL, 0x4322e1a7535cd2bbUL,
- /* 214 */ 0xb114c11819d1801cUL, 0x2016e4d84f3f5ec7UL,
- 0xdd0e2df409260f4cUL, 0x5ec362c0ae5f7266UL,
- /* 215 */ 0xc0462b18b8b2b4eeUL, 0x7cc8d950274d1afbUL,
- 0xf25f7105436b02d2UL, 0x43bbf8dcbff9ccd3UL,
- /* 216 */ 0xb6ad1767a039e9dfUL, 0xb0714da8f69d3583UL,
- 0x5e55fa18b42931f5UL, 0x4ed5558f33c60961UL,
- /* 217 */ 0x1fe37901c647a5ddUL, 0x593ddf1f8081d357UL,
- 0x0249a4fd813fd7a6UL, 0x69acca274e9caf61UL,
- /* 218 */ 0x047ba3ea330721c9UL, 0x83423fc20e7e1ea0UL,
- 0x1df4c0af01314a60UL, 0x09a62dab89289527UL,
- /* 219 */ 0xa5b325a49cc6cb00UL, 0xe94b5dc654b56cb6UL,
- 0x3be28779adc994a0UL, 0x4296e8f8ba3a4aadUL,
- /* 220 */ 0x328689761e451eabUL, 0x2e4d598bff59594aUL,
- 0x49b96853d7a7084aUL, 0x4980a319601420a8UL,
- /* 221 */ 0x9565b9e12f552c42UL, 0x8a5318db7100fe96UL,
- 0x05c90b4d43add0d7UL, 0x538b4cd66a5d4edaUL,
- /* 222 */ 0xf4e94fc3e89f039fUL, 0x592c9af26f618045UL,
- 0x08a36eb5fd4b9550UL, 0x25fffaf6c2ed1419UL,
- /* 223 */ 0x34434459cc79d354UL, 0xeeecbfb4b1d5476bUL,
- 0xddeb34a061615d99UL, 0x5129cecceb64b773UL,
- /* 224 */ 0xee43215894993520UL, 0x772f9c7cf14c0b3bUL,
- 0xd2e2fce306bedad5UL, 0x715f42b546f06a97UL,
- /* 225 */ 0x434ecdceda5b5f1aUL, 0x0da17115a49741a9UL,
- 0x680bd77c73edad2eUL, 0x487c02354edd9041UL,
- /* 226 */ 0xb8efeff3a70ed9c4UL, 0x56a32aa3e857e302UL,
- 0xdf3a68bd48a2a5a0UL, 0x07f650b73176c444UL,
- /* 227 */ 0xe38b9b1626e0ccb1UL, 0x79e053c18b09fb36UL,
- 0x56d90319c9f94964UL, 0x1ca941e7ac9ff5c4UL,
- /* 228 */ 0x49c4df29162fa0bbUL, 0x8488cf3282b33305UL,
- 0x95dfda14cabb437dUL, 0x3391f78264d5ad86UL,
- /* 229 */ 0x729ae06ae2b5095dUL, 0xd58a58d73259a946UL,
- 0xe9834262d13921edUL, 0x27fedafaa54bb592UL,
- /* 230 */ 0xa99dc5b829ad48bbUL, 0x5f025742499ee260UL,
- 0x802c8ecd5d7513fdUL, 0x78ceb3ef3f6dd938UL,
- /* 231 */ 0xc342f44f8a135d94UL, 0x7b9edb44828cdda3UL,
- 0x9436d11a0537cfe7UL, 0x5064b164ec1ab4c8UL,
- /* 232 */ 0x7020eccfd37eb2fcUL, 0x1f31ea3ed90d25fcUL,
- 0x1b930d7bdfa1bb34UL, 0x5344467a48113044UL,
- /* 233 */ 0x70073170f25e6dfbUL, 0xe385dc1a50114cc8UL,
- 0x2348698ac8fc4f00UL, 0x2a77a55284dd40d8UL,
- /* 234 */ 0xfe06afe0c98c6ce4UL, 0xc235df96dddfd6e4UL,
- 0x1428d01e33bf1ed3UL, 0x785768ec9300bdafUL,
- /* 235 */ 0x9702e57a91deb63bUL, 0x61bdb8bfe5ce8b80UL,
- 0x645b426f3d1d58acUL, 0x4804a82227a557bcUL,
- /* 236 */ 0x8e57048ab44d2601UL, 0x68d6501a4b3a6935UL,
- 0xc39c9ec3f9e1c293UL, 0x4172f257d4de63e2UL,
- /* 237 */ 0xd368b450330c6401UL, 0x040d3017418f2391UL,
- 0x2c34bb6090b7d90dUL, 0x16f649228fdfd51fUL,
- /* 238 */ 0xbea6818e2b928ef5UL, 0xe28ccf91cdc11e72UL,
- 0x594aaa68e77a36cdUL, 0x313034806c7ffd0fUL,
- /* 239 */ 0x8a9d27ac2249bd65UL, 0x19a3b464018e9512UL,
- 0xc26ccff352b37ec7UL, 0x056f68341d797b21UL,
- /* 240 */ 0x5e79d6757efd2327UL, 0xfabdbcb6553afe15UL,
- 0xd3e7222c6eaf5a60UL, 0x7046c76d4dae743bUL,
- /* 241 */ 0x660be872b18d4a55UL, 0x19992518574e1496UL,
- 0xc103053a302bdcbbUL, 0x3ed8e9800b218e8eUL,
- /* 242 */ 0x7b0b9239fa75e03eUL, 0xefe9fb684633c083UL,
- 0x98a35fbe391a7793UL, 0x6065510fe2d0fe34UL,
- /* 243 */ 0x55cb668548abad0cUL, 0xb4584548da87e527UL,
- 0x2c43ecea0107c1ddUL, 0x526028809372de35UL,
- /* 244 */ 0x3415c56af9213b1fUL, 0x5bee1a4d017e98dbUL,
- 0x13f6b105b5cf709bUL, 0x5ff20e3482b29ab6UL,
- /* 245 */ 0x0aa29c75cc2e6c90UL, 0xfc7d73ca3a70e206UL,
- 0x899fc38fc4b5c515UL, 0x250386b124ffc207UL,
- /* 246 */ 0x54ea28d5ae3d2b56UL, 0x9913149dd6de60ceUL,
- 0x16694fc58f06d6c1UL, 0x46b23975eb018fc7UL,
- /* 247 */ 0x470a6a0fb4b7b4e2UL, 0x5d92475a8f7253deUL,
- 0xabeee5b52fbd3adbUL, 0x7fa20801a0806968UL,
- /* 248 */ 0x76f3faf19f7714d2UL, 0xb3e840c12f4660c3UL,
- 0x0fb4cd8df212744eUL, 0x4b065a251d3a2dd2UL,
- /* 249 */ 0x5cebde383d77cd4aUL, 0x6adf39df882c9cb1UL,
- 0xa2dd242eb09af759UL, 0x3147c0e50e5f6422UL,
- /* 250 */ 0x164ca5101d1350dbUL, 0xf8d13479c33fc962UL,
- 0xe640ce4d13e5da08UL, 0x4bdee0c45061f8baUL,
- /* 251 */ 0xd7c46dc1a4edb1c9UL, 0x5514d7b6437fd98aUL,
- 0x58942f6bb2a1c00bUL, 0x2dffb2ab1d70710eUL,
- /* 252 */ 0xccdfcf2fc18b6d68UL, 0xa8ebcba8b7806167UL,
- 0x980697f95e2937e3UL, 0x02fbba1cd0126e8cUL
-};
-
-/* c is two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7]
- * a is two 256-bit integers: a0[0:3] and a1[4:7]
- * b is two 256-bit integers: b0[0:3] and b1[4:7]
- */
-static void mul2_256x256_integer_adx(u64 *const c, const u64 *const a,
- const u64 *const b)
+static __always_inline u64 eq_mask(u64 a, u64 b)
{
- asm volatile(
- "xorl %%r14d, %%r14d ;"
- "movq (%1), %%rdx; " /* A[0] */
- "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
- "xorl %%r10d, %%r10d ;"
- "movq %%r8, (%0) ;"
- "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
- "adox %%r10, %%r15 ;"
- "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
- "adox %%r8, %%rax ;"
- "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
- "adox %%r10, %%rbx ;"
- /******************************************/
- "adox %%r14, %%rcx ;"
-
- "movq 8(%1), %%rdx; " /* A[1] */
- "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
- "adox %%r15, %%r8 ;"
- "movq %%r8, 8(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
- "adox %%r10, %%r9 ;"
- "adcx %%r9, %%rax ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
- "adox %%r8, %%r11 ;"
- "adcx %%r11, %%rbx ;"
- "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
- "adox %%r10, %%r13 ;"
- "adcx %%r13, %%rcx ;"
- /******************************************/
- "adox %%r14, %%r15 ;"
- "adcx %%r14, %%r15 ;"
-
- "movq 16(%1), %%rdx; " /* A[2] */
- "xorl %%r10d, %%r10d ;"
- "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
- "adox %%rax, %%r8 ;"
- "movq %%r8, 16(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
- "adox %%r10, %%r9 ;"
- "adcx %%r9, %%rbx ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
- "adox %%r8, %%r11 ;"
- "adcx %%r11, %%rcx ;"
- "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
- "adox %%r10, %%r13 ;"
- "adcx %%r13, %%r15 ;"
- /******************************************/
- "adox %%r14, %%rax ;"
- "adcx %%r14, %%rax ;"
-
- "movq 24(%1), %%rdx; " /* A[3] */
- "xorl %%r10d, %%r10d ;"
- "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
- "adox %%rbx, %%r8 ;"
- "movq %%r8, 24(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
- "adox %%r10, %%r9 ;"
- "adcx %%r9, %%rcx ;"
- "movq %%rcx, 32(%0) ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
- "adox %%r8, %%r11 ;"
- "adcx %%r11, %%r15 ;"
- "movq %%r15, 40(%0) ;"
- "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
- "adox %%r10, %%r13 ;"
- "adcx %%r13, %%rax ;"
- "movq %%rax, 48(%0) ;"
- /******************************************/
- "adox %%r14, %%rbx ;"
- "adcx %%r14, %%rbx ;"
- "movq %%rbx, 56(%0) ;"
-
- "movq 32(%1), %%rdx; " /* C[0] */
- "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */
- "xorl %%r10d, %%r10d ;"
- "movq %%r8, 64(%0);"
- "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
- "adox %%r10, %%r15 ;"
- "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */
- "adox %%r8, %%rax ;"
- "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
- "adox %%r10, %%rbx ;"
- /******************************************/
- "adox %%r14, %%rcx ;"
-
- "movq 40(%1), %%rdx; " /* C[1] */
- "xorl %%r10d, %%r10d ;"
- "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */
- "adox %%r15, %%r8 ;"
- "movq %%r8, 72(%0);"
- "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
- "adox %%r10, %%r9 ;"
- "adcx %%r9, %%rax ;"
- "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */
- "adox %%r8, %%r11 ;"
- "adcx %%r11, %%rbx ;"
- "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
- "adox %%r10, %%r13 ;"
- "adcx %%r13, %%rcx ;"
- /******************************************/
- "adox %%r14, %%r15 ;"
- "adcx %%r14, %%r15 ;"
-
- "movq 48(%1), %%rdx; " /* C[2] */
- "xorl %%r10d, %%r10d ;"
- "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */
- "adox %%rax, %%r8 ;"
- "movq %%r8, 80(%0);"
- "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
- "adox %%r10, %%r9 ;"
- "adcx %%r9, %%rbx ;"
- "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */
- "adox %%r8, %%r11 ;"
- "adcx %%r11, %%rcx ;"
- "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
- "adox %%r10, %%r13 ;"
- "adcx %%r13, %%r15 ;"
- /******************************************/
- "adox %%r14, %%rax ;"
- "adcx %%r14, %%rax ;"
-
- "movq 56(%1), %%rdx; " /* C[3] */
- "xorl %%r10d, %%r10d ;"
- "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */
- "adox %%rbx, %%r8 ;"
- "movq %%r8, 88(%0);"
- "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
- "adox %%r10, %%r9 ;"
- "adcx %%r9, %%rcx ;"
- "movq %%rcx, 96(%0) ;"
- "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */
- "adox %%r8, %%r11 ;"
- "adcx %%r11, %%r15 ;"
- "movq %%r15, 104(%0) ;"
- "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
- "adox %%r10, %%r13 ;"
- "adcx %%r13, %%rax ;"
- "movq %%rax, 112(%0) ;"
- /******************************************/
- "adox %%r14, %%rbx ;"
- "adcx %%r14, %%rbx ;"
- "movq %%rbx, 120(%0) ;"
- :
- : "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
- "%r10", "%r11", "%r13", "%r14", "%r15");
+ u64 x = a ^ b;
+ u64 minus_x = ~x + (u64)1U;
+ u64 x_or_minus_x = x | minus_x;
+ u64 xnx = x_or_minus_x >> (u32)63U;
+ return xnx - (u64)1U;
}
-static void mul2_256x256_integer_bmi2(u64 *const c, const u64 *const a,
- const u64 *const b)
+static __always_inline u64 gte_mask(u64 a, u64 b)
{
- asm volatile(
- "movq (%1), %%rdx; " /* A[0] */
- "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
- "movq %%r8, (%0) ;"
- "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
- "addq %%r10, %%r15 ;"
- "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
- "adcq %%r8, %%rax ;"
- "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
- "adcq %%r10, %%rbx ;"
- /******************************************/
- "adcq $0, %%rcx ;"
-
- "movq 8(%1), %%rdx; " /* A[1] */
- "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
- "addq %%r15, %%r8 ;"
- "movq %%r8, 8(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%r15 ;"
-
- "addq %%r9, %%rax ;"
- "adcq %%r11, %%rbx ;"
- "adcq %%r13, %%rcx ;"
- "adcq $0, %%r15 ;"
-
- "movq 16(%1), %%rdx; " /* A[2] */
- "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
- "addq %%rax, %%r8 ;"
- "movq %%r8, 16(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%rax ;"
-
- "addq %%r9, %%rbx ;"
- "adcq %%r11, %%rcx ;"
- "adcq %%r13, %%r15 ;"
- "adcq $0, %%rax ;"
-
- "movq 24(%1), %%rdx; " /* A[3] */
- "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
- "addq %%rbx, %%r8 ;"
- "movq %%r8, 24(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%rbx ;"
-
- "addq %%r9, %%rcx ;"
- "movq %%rcx, 32(%0) ;"
- "adcq %%r11, %%r15 ;"
- "movq %%r15, 40(%0) ;"
- "adcq %%r13, %%rax ;"
- "movq %%rax, 48(%0) ;"
- "adcq $0, %%rbx ;"
- "movq %%rbx, 56(%0) ;"
-
- "movq 32(%1), %%rdx; " /* C[0] */
- "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */
- "movq %%r8, 64(%0) ;"
- "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
- "addq %%r10, %%r15 ;"
- "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */
- "adcq %%r8, %%rax ;"
- "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
- "adcq %%r10, %%rbx ;"
- /******************************************/
- "adcq $0, %%rcx ;"
-
- "movq 40(%1), %%rdx; " /* C[1] */
- "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */
- "addq %%r15, %%r8 ;"
- "movq %%r8, 72(%0) ;"
- "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%r15 ;"
-
- "addq %%r9, %%rax ;"
- "adcq %%r11, %%rbx ;"
- "adcq %%r13, %%rcx ;"
- "adcq $0, %%r15 ;"
-
- "movq 48(%1), %%rdx; " /* C[2] */
- "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */
- "addq %%rax, %%r8 ;"
- "movq %%r8, 80(%0) ;"
- "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%rax ;"
-
- "addq %%r9, %%rbx ;"
- "adcq %%r11, %%rcx ;"
- "adcq %%r13, %%r15 ;"
- "adcq $0, %%rax ;"
-
- "movq 56(%1), %%rdx; " /* C[3] */
- "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */
- "addq %%rbx, %%r8 ;"
- "movq %%r8, 88(%0) ;"
- "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%rbx ;"
-
- "addq %%r9, %%rcx ;"
- "movq %%rcx, 96(%0) ;"
- "adcq %%r11, %%r15 ;"
- "movq %%r15, 104(%0) ;"
- "adcq %%r13, %%rax ;"
- "movq %%rax, 112(%0) ;"
- "adcq $0, %%rbx ;"
- "movq %%rbx, 120(%0) ;"
- :
- : "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
- "%r10", "%r11", "%r13", "%r15");
+ u64 x = a;
+ u64 y = b;
+ u64 x_xor_y = x ^ y;
+ u64 x_sub_y = x - y;
+ u64 x_sub_y_xor_y = x_sub_y ^ y;
+ u64 q = x_xor_y | x_sub_y_xor_y;
+ u64 x_xor_q = x ^ q;
+ u64 x_xor_q_ = x_xor_q >> (u32)63U;
+ return x_xor_q_ - (u64)1U;
}
-static void sqr2_256x256_integer_adx(u64 *const c, const u64 *const a)
+/* Computes the addition of four-element f1 with value in f2
+ * and returns the carry (if any) */
+static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
{
- asm volatile(
- "movq (%1), %%rdx ;" /* A[0] */
- "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */
- "xorl %%r15d, %%r15d;"
- "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */
- "adcx %%r14, %%r9 ;"
- "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
- "adcx %%rax, %%r10 ;"
- "movq 24(%1), %%rdx ;" /* A[3] */
- "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
- "adcx %%rcx, %%r11 ;"
- "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
- "adcx %%rax, %%rbx ;"
- "movq 8(%1), %%rdx ;" /* A[1] */
- "adcx %%r15, %%r13 ;"
- "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
- "movq $0, %%r14 ;"
- /******************************************/
- "adcx %%r15, %%r14 ;"
-
- "xorl %%r15d, %%r15d;"
- "adox %%rax, %%r10 ;"
- "adcx %%r8, %%r8 ;"
- "adox %%rcx, %%r11 ;"
- "adcx %%r9, %%r9 ;"
- "adox %%r15, %%rbx ;"
- "adcx %%r10, %%r10 ;"
- "adox %%r15, %%r13 ;"
- "adcx %%r11, %%r11 ;"
- "adox %%r15, %%r14 ;"
- "adcx %%rbx, %%rbx ;"
- "adcx %%r13, %%r13 ;"
- "adcx %%r14, %%r14 ;"
-
- "movq (%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
- /*******************/
- "movq %%rax, 0(%0) ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, 8(%0) ;"
- "movq 8(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
- "adcq %%rax, %%r9 ;"
- "movq %%r9, 16(%0) ;"
- "adcq %%rcx, %%r10 ;"
- "movq %%r10, 24(%0) ;"
- "movq 16(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
- "adcq %%rax, %%r11 ;"
- "movq %%r11, 32(%0) ;"
- "adcq %%rcx, %%rbx ;"
- "movq %%rbx, 40(%0) ;"
- "movq 24(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
- "adcq %%rax, %%r13 ;"
- "movq %%r13, 48(%0) ;"
- "adcq %%rcx, %%r14 ;"
- "movq %%r14, 56(%0) ;"
-
-
- "movq 32(%1), %%rdx ;" /* B[0] */
- "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */
- "xorl %%r15d, %%r15d;"
- "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */
- "adcx %%r14, %%r9 ;"
- "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */
- "adcx %%rax, %%r10 ;"
- "movq 56(%1), %%rdx ;" /* B[3] */
- "mulx 40(%1), %%r11, %%rbx ;" /* B[1]*B[3] */
- "adcx %%rcx, %%r11 ;"
- "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */
- "adcx %%rax, %%rbx ;"
- "movq 40(%1), %%rdx ;" /* B[1] */
- "adcx %%r15, %%r13 ;"
- "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */
- "movq $0, %%r14 ;"
- /******************************************/
- "adcx %%r15, %%r14 ;"
-
- "xorl %%r15d, %%r15d;"
- "adox %%rax, %%r10 ;"
- "adcx %%r8, %%r8 ;"
- "adox %%rcx, %%r11 ;"
- "adcx %%r9, %%r9 ;"
- "adox %%r15, %%rbx ;"
- "adcx %%r10, %%r10 ;"
- "adox %%r15, %%r13 ;"
- "adcx %%r11, %%r11 ;"
- "adox %%r15, %%r14 ;"
- "adcx %%rbx, %%rbx ;"
- "adcx %%r13, %%r13 ;"
- "adcx %%r14, %%r14 ;"
-
- "movq 32(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
- /*******************/
- "movq %%rax, 64(%0) ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, 72(%0) ;"
- "movq 40(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
- "adcq %%rax, %%r9 ;"
- "movq %%r9, 80(%0) ;"
- "adcq %%rcx, %%r10 ;"
- "movq %%r10, 88(%0) ;"
- "movq 48(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
- "adcq %%rax, %%r11 ;"
- "movq %%r11, 96(%0) ;"
- "adcq %%rcx, %%rbx ;"
- "movq %%rbx, 104(%0) ;"
- "movq 56(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
- "adcq %%rax, %%r13 ;"
- "movq %%r13, 112(%0) ;"
- "adcq %%rcx, %%r14 ;"
- "movq %%r14, 120(%0) ;"
- :
- : "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
- "%r10", "%r11", "%r13", "%r14", "%r15");
-}
+ u64 carry_r;
-static void sqr2_256x256_integer_bmi2(u64 *const c, const u64 *const a)
-{
asm volatile(
- "movq 8(%1), %%rdx ;" /* A[1] */
- "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
- "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
- "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
-
- "movq 16(%1), %%rdx ;" /* A[2] */
- "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
- "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
-
- "addq %%rax, %%r9 ;"
- "adcq %%rdx, %%r10 ;"
- "adcq %%rcx, %%r11 ;"
- "adcq %%r14, %%r15 ;"
- "adcq $0, %%r13 ;"
- "movq $0, %%r14 ;"
- "adcq $0, %%r14 ;"
-
- "movq (%1), %%rdx ;" /* A[0] */
- "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
-
- "addq %%rax, %%r10 ;"
- "adcq %%rcx, %%r11 ;"
- "adcq $0, %%r15 ;"
- "adcq $0, %%r13 ;"
- "adcq $0, %%r14 ;"
-
- "shldq $1, %%r13, %%r14 ;"
- "shldq $1, %%r15, %%r13 ;"
- "shldq $1, %%r11, %%r15 ;"
- "shldq $1, %%r10, %%r11 ;"
- "shldq $1, %%r9, %%r10 ;"
- "shldq $1, %%r8, %%r9 ;"
- "shlq $1, %%r8 ;"
-
- /*******************/
- "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */
- /*******************/
- "movq %%rax, 0(%0) ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, 8(%0) ;"
- "movq 8(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */
- "adcq %%rax, %%r9 ;"
- "movq %%r9, 16(%0) ;"
- "adcq %%rcx, %%r10 ;"
- "movq %%r10, 24(%0) ;"
- "movq 16(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */
- "adcq %%rax, %%r11 ;"
- "movq %%r11, 32(%0) ;"
- "adcq %%rcx, %%r15 ;"
- "movq %%r15, 40(%0) ;"
- "movq 24(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */
- "adcq %%rax, %%r13 ;"
- "movq %%r13, 48(%0) ;"
- "adcq %%rcx, %%r14 ;"
- "movq %%r14, 56(%0) ;"
-
- "movq 40(%1), %%rdx ;" /* B[1] */
- "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */
- "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */
- "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */
-
- "movq 48(%1), %%rdx ;" /* B[2] */
- "mulx 56(%1), %%r15, %%r13 ;" /* B[3]*B[2] */
- "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */
-
- "addq %%rax, %%r9 ;"
- "adcq %%rdx, %%r10 ;"
- "adcq %%rcx, %%r11 ;"
- "adcq %%r14, %%r15 ;"
- "adcq $0, %%r13 ;"
- "movq $0, %%r14 ;"
- "adcq $0, %%r14 ;"
-
- "movq 32(%1), %%rdx ;" /* B[0] */
- "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */
-
- "addq %%rax, %%r10 ;"
- "adcq %%rcx, %%r11 ;"
- "adcq $0, %%r15 ;"
- "adcq $0, %%r13 ;"
- "adcq $0, %%r14 ;"
-
- "shldq $1, %%r13, %%r14 ;"
- "shldq $1, %%r15, %%r13 ;"
- "shldq $1, %%r11, %%r15 ;"
- "shldq $1, %%r10, %%r11 ;"
- "shldq $1, %%r9, %%r10 ;"
- "shldq $1, %%r8, %%r9 ;"
- "shlq $1, %%r8 ;"
-
- /*******************/
- "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */
- /*******************/
- "movq %%rax, 64(%0) ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, 72(%0) ;"
- "movq 40(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */
- "adcq %%rax, %%r9 ;"
- "movq %%r9, 80(%0) ;"
- "adcq %%rcx, %%r10 ;"
- "movq %%r10, 88(%0) ;"
- "movq 48(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */
- "adcq %%rax, %%r11 ;"
- "movq %%r11, 96(%0) ;"
- "adcq %%rcx, %%r15 ;"
- "movq %%r15, 104(%0) ;"
- "movq 56(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */
- "adcq %%rax, %%r13 ;"
- "movq %%r13, 112(%0) ;"
- "adcq %%rcx, %%r14 ;"
- "movq %%r14, 120(%0) ;"
- :
- : "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
- "%r11", "%r13", "%r14", "%r15");
+ /* Clear registers to propagate the carry bit */
+ " xor %%r8d, %%r8d;"
+ " xor %%r9d, %%r9d;"
+ " xor %%r10d, %%r10d;"
+ " xor %%r11d, %%r11d;"
+ " xor %k1, %k1;"
+
+ /* Begin addition chain */
+ " addq 0(%3), %0;"
+ " movq %0, 0(%2);"
+ " adcxq 8(%3), %%r8;"
+ " movq %%r8, 8(%2);"
+ " adcxq 16(%3), %%r9;"
+ " movq %%r9, 16(%2);"
+ " adcxq 24(%3), %%r10;"
+ " movq %%r10, 24(%2);"
+
+ /* Return the carry bit in a register */
+ " adcx %%r11, %1;"
+ : "+&r"(f2), "=&r"(carry_r)
+ : "r"(out), "r"(f1)
+ : "%r8", "%r9", "%r10", "%r11", "memory", "cc");
+
+ return carry_r;
}
-static void red_eltfp25519_2w_adx(u64 *const c, const u64 *const a)
+/* Computes the field addition of two field elements */
+static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
{
asm volatile(
- "movl $38, %%edx; " /* 2*c = 38 = 2^256 */
- "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */
- "xorl %%ebx, %%ebx ;"
- "adox (%1), %%r8 ;"
- "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */
- "adcx %%r10, %%r9 ;"
- "adox 8(%1), %%r9 ;"
- "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */
- "adcx %%r11, %%r10 ;"
- "adox 16(%1), %%r10 ;"
- "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */
- "adcx %%rax, %%r11 ;"
- "adox 24(%1), %%r11 ;"
- /***************************************/
- "adcx %%rbx, %%rcx ;"
- "adox %%rbx, %%rcx ;"
- "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
- "adcx %%rcx, %%r8 ;"
- "adcx %%rbx, %%r9 ;"
- "movq %%r9, 8(%0) ;"
- "adcx %%rbx, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "adcx %%rbx, %%r11 ;"
- "movq %%r11, 24(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%edx, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, (%0) ;"
-
- "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */
- "xorl %%ebx, %%ebx ;"
- "adox 64(%1), %%r8 ;"
- "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */
- "adcx %%r10, %%r9 ;"
- "adox 72(%1), %%r9 ;"
- "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */
- "adcx %%r11, %%r10 ;"
- "adox 80(%1), %%r10 ;"
- "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */
- "adcx %%rax, %%r11 ;"
- "adox 88(%1), %%r11 ;"
- /****************************************/
- "adcx %%rbx, %%rcx ;"
- "adox %%rbx, %%rcx ;"
- "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
- "adcx %%rcx, %%r8 ;"
- "adcx %%rbx, %%r9 ;"
- "movq %%r9, 40(%0) ;"
- "adcx %%rbx, %%r10 ;"
- "movq %%r10, 48(%0) ;"
- "adcx %%rbx, %%r11 ;"
- "movq %%r11, 56(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%edx, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, 32(%0) ;"
- :
- : "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
- "%r10", "%r11");
+ /* Compute the raw addition of f1 + f2 */
+ " movq 0(%0), %%r8;"
+ " addq 0(%2), %%r8;"
+ " movq 8(%0), %%r9;"
+ " adcxq 8(%2), %%r9;"
+ " movq 16(%0), %%r10;"
+ " adcxq 16(%2), %%r10;"
+ " movq 24(%0), %%r11;"
+ " adcxq 24(%2), %%r11;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute carry*38 */
+ " mov $0, %%rax;"
+ " mov $38, %0;"
+ " cmovc %0, %%rax;"
+
+ /* Step 2: Add carry*38 to the original sum */
+ " xor %%ecx, %%ecx;"
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%1);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%1);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %0, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%1);"
+ : "+&r"(f2)
+ : "r"(out), "r"(f1)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}
-static void red_eltfp25519_2w_bmi2(u64 *const c, const u64 *const a)
+/* Computes the field subtraction of two field elements */
+static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
{
asm volatile(
- "movl $38, %%edx ; " /* 2*c = 38 = 2^256 */
- "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
- "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
- "addq %%r10, %%r9 ;"
- "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
- "adcq %%r11, %%r10 ;"
- "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
- "adcq %%rax, %%r11 ;"
- /***************************************/
- "adcq $0, %%rcx ;"
- "addq (%1), %%r8 ;"
- "adcq 8(%1), %%r9 ;"
- "adcq 16(%1), %%r10 ;"
- "adcq 24(%1), %%r11 ;"
- "adcq $0, %%rcx ;"
- "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
- "addq %%rcx, %%r8 ;"
- "adcq $0, %%r9 ;"
- "movq %%r9, 8(%0) ;"
- "adcq $0, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "adcq $0, %%r11 ;"
- "movq %%r11, 24(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%edx, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, (%0) ;"
-
- "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */
- "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */
- "addq %%r10, %%r9 ;"
- "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */
- "adcq %%r11, %%r10 ;"
- "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */
- "adcq %%rax, %%r11 ;"
- /****************************************/
- "adcq $0, %%rcx ;"
- "addq 64(%1), %%r8 ;"
- "adcq 72(%1), %%r9 ;"
- "adcq 80(%1), %%r10 ;"
- "adcq 88(%1), %%r11 ;"
- "adcq $0, %%rcx ;"
- "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
- "addq %%rcx, %%r8 ;"
- "adcq $0, %%r9 ;"
- "movq %%r9, 40(%0) ;"
- "adcq $0, %%r10 ;"
- "movq %%r10, 48(%0) ;"
- "adcq $0, %%r11 ;"
- "movq %%r11, 56(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%edx, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, 32(%0) ;"
+ /* Compute the raw subtraction of f1-f2 */
+ " movq 0(%1), %%r8;"
+ " subq 0(%2), %%r8;"
+ " movq 8(%1), %%r9;"
+ " sbbq 8(%2), %%r9;"
+ " movq 16(%1), %%r10;"
+ " sbbq 16(%2), %%r10;"
+ " movq 24(%1), %%r11;"
+ " sbbq 24(%2), %%r11;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute carry*38 */
+ " mov $0, %%rax;"
+ " mov $38, %%rcx;"
+ " cmovc %%rcx, %%rax;"
+
+ /* Step 2: Subtract carry*38 from the original difference */
+ " sub %%rax, %%r8;"
+ " sbb $0, %%r9;"
+ " sbb $0, %%r10;"
+ " sbb $0, %%r11;"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rcx, %%rax;"
+ " sub %%rax, %%r8;"
+
+ /* Store the result */
+ " movq %%r8, 0(%0);"
+ " movq %%r9, 8(%0);"
+ " movq %%r10, 16(%0);"
+ " movq %%r11, 24(%0);"
:
- : "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
- "%r11");
+ : "r"(out), "r"(f1), "r"(f2)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}
-static void mul_256x256_integer_adx(u64 *const c, const u64 *const a,
- const u64 *const b)
+/* Computes a field multiplication: out <- f1 * f2
+ * Uses the 8-element buffer tmp for intermediate results */
+static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
asm volatile(
- "movq (%1), %%rdx; " /* A[0] */
- "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */
- "xorl %%r10d, %%r10d ;"
- "movq %%r8, (%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */
- "adox %%r9, %%r10 ;"
- "movq %%r10, 8(%0) ;"
- "mulx 16(%2), %%r15, %%r13; " /* A[0]*B[2] */
- "adox %%r11, %%r15 ;"
- "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */
- "adox %%r13, %%r14 ;"
- "movq $0, %%rax ;"
- /******************************************/
- "adox %%rdx, %%rax ;"
-
- "movq 8(%1), %%rdx; " /* A[1] */
- "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
- "xorl %%r10d, %%r10d ;"
- "adcx 8(%0), %%r8 ;"
- "movq %%r8, 8(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
- "adox %%r9, %%r10 ;"
- "adcx %%r15, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "mulx 16(%2), %%r15, %%r13; " /* A[1]*B[2] */
- "adox %%r11, %%r15 ;"
- "adcx %%r14, %%r15 ;"
- "movq $0, %%r8 ;"
- "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */
- "adox %%r13, %%r14 ;"
- "adcx %%rax, %%r14 ;"
- "movq $0, %%rax ;"
- /******************************************/
- "adox %%rdx, %%rax ;"
- "adcx %%r8, %%rax ;"
-
- "movq 16(%1), %%rdx; " /* A[2] */
- "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
- "xorl %%r10d, %%r10d ;"
- "adcx 16(%0), %%r8 ;"
- "movq %%r8, 16(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
- "adox %%r9, %%r10 ;"
- "adcx %%r15, %%r10 ;"
- "movq %%r10, 24(%0) ;"
- "mulx 16(%2), %%r15, %%r13; " /* A[2]*B[2] */
- "adox %%r11, %%r15 ;"
- "adcx %%r14, %%r15 ;"
- "movq $0, %%r8 ;"
- "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */
- "adox %%r13, %%r14 ;"
- "adcx %%rax, %%r14 ;"
- "movq $0, %%rax ;"
- /******************************************/
- "adox %%rdx, %%rax ;"
- "adcx %%r8, %%rax ;"
-
- "movq 24(%1), %%rdx; " /* A[3] */
- "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
- "xorl %%r10d, %%r10d ;"
- "adcx 24(%0), %%r8 ;"
- "movq %%r8, 24(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
- "adox %%r9, %%r10 ;"
- "adcx %%r15, %%r10 ;"
- "movq %%r10, 32(%0) ;"
- "mulx 16(%2), %%r15, %%r13; " /* A[3]*B[2] */
- "adox %%r11, %%r15 ;"
- "adcx %%r14, %%r15 ;"
- "movq %%r15, 40(%0) ;"
- "movq $0, %%r8 ;"
- "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */
- "adox %%r13, %%r14 ;"
- "adcx %%rax, %%r14 ;"
- "movq %%r14, 48(%0) ;"
- "movq $0, %%rax ;"
- /******************************************/
- "adox %%rdx, %%rax ;"
- "adcx %%r8, %%rax ;"
- "movq %%rax, 56(%0) ;"
- :
- : "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11",
- "%r13", "%r14", "%r15");
-}
-static void mul_256x256_integer_bmi2(u64 *const c, const u64 *const a,
- const u64 *const b)
-{
- asm volatile(
- "movq (%1), %%rdx; " /* A[0] */
- "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
- "movq %%r8, (%0) ;"
- "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
- "addq %%r10, %%r15 ;"
- "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
- "adcq %%r8, %%rax ;"
- "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
- "adcq %%r10, %%rbx ;"
- /******************************************/
- "adcq $0, %%rcx ;"
-
- "movq 8(%1), %%rdx; " /* A[1] */
- "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
- "addq %%r15, %%r8 ;"
- "movq %%r8, 8(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%r15 ;"
-
- "addq %%r9, %%rax ;"
- "adcq %%r11, %%rbx ;"
- "adcq %%r13, %%rcx ;"
- "adcq $0, %%r15 ;"
-
- "movq 16(%1), %%rdx; " /* A[2] */
- "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
- "addq %%rax, %%r8 ;"
- "movq %%r8, 16(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%rax ;"
-
- "addq %%r9, %%rbx ;"
- "adcq %%r11, %%rcx ;"
- "adcq %%r13, %%r15 ;"
- "adcq $0, %%rax ;"
-
- "movq 24(%1), %%rdx; " /* A[3] */
- "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
- "addq %%rbx, %%r8 ;"
- "movq %%r8, 24(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%rbx ;"
-
- "addq %%r9, %%rcx ;"
- "movq %%rcx, 32(%0) ;"
- "adcq %%r11, %%r15 ;"
- "movq %%r15, 40(%0) ;"
- "adcq %%r13, %%rax ;"
- "movq %%rax, 48(%0) ;"
- "adcq $0, %%rbx ;"
- "movq %%rbx, 56(%0) ;"
- :
- : "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
- "%r10", "%r11", "%r13", "%r15");
+ /* Compute the raw multiplication: tmp <- src1 * src2 */
+
+ /* Compute src1[0] * src2 */
+ " movq 0(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 0(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 8(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
+ /* Compute src1[1] * src2 */
+ " movq 8(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 8(%2), %%r8;"
+ " movq %%r8, 8(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 16(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[2] * src2 */
+ " movq 16(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 16(%2), %%r8;"
+ " movq %%r8, 16(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 24(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[3] * src2 */
+ " movq 24(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 24(%2), %%r8;"
+ " movq %%r8, 24(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 32(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 40(%2);"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 48(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 56(%2);"
+
+ /* Line up pointers */
+ " mov %2, %0;"
+ " mov %3, %2;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %1, %%r9;"
+ " movq %%r9, 8(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 16(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 24(%2);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%2);"
+ : "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "%r14", "memory", "cc");
}
-static void sqr_256x256_integer_adx(u64 *const c, const u64 *const a)
+/* Computes two field multiplications:
+ * out[0] <- f1[0] * f2[0]
+ * out[1] <- f1[1] * f2[1]
+ * Uses the 16-element buffer tmp for intermediate results: */
+static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
asm volatile(
- "movq (%1), %%rdx ;" /* A[0] */
- "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */
- "xorl %%r15d, %%r15d;"
- "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */
- "adcx %%r14, %%r9 ;"
- "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
- "adcx %%rax, %%r10 ;"
- "movq 24(%1), %%rdx ;" /* A[3] */
- "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
- "adcx %%rcx, %%r11 ;"
- "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
- "adcx %%rax, %%rbx ;"
- "movq 8(%1), %%rdx ;" /* A[1] */
- "adcx %%r15, %%r13 ;"
- "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
- "movq $0, %%r14 ;"
- /******************************************/
- "adcx %%r15, %%r14 ;"
-
- "xorl %%r15d, %%r15d;"
- "adox %%rax, %%r10 ;"
- "adcx %%r8, %%r8 ;"
- "adox %%rcx, %%r11 ;"
- "adcx %%r9, %%r9 ;"
- "adox %%r15, %%rbx ;"
- "adcx %%r10, %%r10 ;"
- "adox %%r15, %%r13 ;"
- "adcx %%r11, %%r11 ;"
- "adox %%r15, %%r14 ;"
- "adcx %%rbx, %%rbx ;"
- "adcx %%r13, %%r13 ;"
- "adcx %%r14, %%r14 ;"
-
- "movq (%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
- /*******************/
- "movq %%rax, 0(%0) ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, 8(%0) ;"
- "movq 8(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
- "adcq %%rax, %%r9 ;"
- "movq %%r9, 16(%0) ;"
- "adcq %%rcx, %%r10 ;"
- "movq %%r10, 24(%0) ;"
- "movq 16(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
- "adcq %%rax, %%r11 ;"
- "movq %%r11, 32(%0) ;"
- "adcq %%rcx, %%rbx ;"
- "movq %%rbx, 40(%0) ;"
- "movq 24(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
- "adcq %%rax, %%r13 ;"
- "movq %%r13, 48(%0) ;"
- "adcq %%rcx, %%r14 ;"
- "movq %%r14, 56(%0) ;"
- :
- : "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
- "%r10", "%r11", "%r13", "%r14", "%r15");
-}
-static void sqr_256x256_integer_bmi2(u64 *const c, const u64 *const a)
-{
- asm volatile(
- "movq 8(%1), %%rdx ;" /* A[1] */
- "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
- "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
- "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
-
- "movq 16(%1), %%rdx ;" /* A[2] */
- "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
- "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
-
- "addq %%rax, %%r9 ;"
- "adcq %%rdx, %%r10 ;"
- "adcq %%rcx, %%r11 ;"
- "adcq %%r14, %%r15 ;"
- "adcq $0, %%r13 ;"
- "movq $0, %%r14 ;"
- "adcq $0, %%r14 ;"
-
- "movq (%1), %%rdx ;" /* A[0] */
- "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
-
- "addq %%rax, %%r10 ;"
- "adcq %%rcx, %%r11 ;"
- "adcq $0, %%r15 ;"
- "adcq $0, %%r13 ;"
- "adcq $0, %%r14 ;"
-
- "shldq $1, %%r13, %%r14 ;"
- "shldq $1, %%r15, %%r13 ;"
- "shldq $1, %%r11, %%r15 ;"
- "shldq $1, %%r10, %%r11 ;"
- "shldq $1, %%r9, %%r10 ;"
- "shldq $1, %%r8, %%r9 ;"
- "shlq $1, %%r8 ;"
-
- /*******************/
- "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
- /*******************/
- "movq %%rax, 0(%0) ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, 8(%0) ;"
- "movq 8(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
- "adcq %%rax, %%r9 ;"
- "movq %%r9, 16(%0) ;"
- "adcq %%rcx, %%r10 ;"
- "movq %%r10, 24(%0) ;"
- "movq 16(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
- "adcq %%rax, %%r11 ;"
- "movq %%r11, 32(%0) ;"
- "adcq %%rcx, %%r15 ;"
- "movq %%r15, 40(%0) ;"
- "movq 24(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
- "adcq %%rax, %%r13 ;"
- "movq %%r13, 48(%0) ;"
- "adcq %%rcx, %%r14 ;"
- "movq %%r14, 56(%0) ;"
- :
- : "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
- "%r11", "%r13", "%r14", "%r15");
+ /* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */
+
+ /* Compute src1[0] * src2 */
+ " movq 0(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 0(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 8(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
+ /* Compute src1[1] * src2 */
+ " movq 8(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 8(%2), %%r8;"
+ " movq %%r8, 8(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 16(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[2] * src2 */
+ " movq 16(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 16(%2), %%r8;"
+ " movq %%r8, 16(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 24(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[3] * src2 */
+ " movq 24(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 24(%2), %%r8;"
+ " movq %%r8, 24(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 32(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 40(%2);"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 48(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 56(%2);"
+
+ /* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */
+
+ /* Compute src1[0] * src2 */
+ " movq 32(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 64(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 72(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
+ /* Compute src1[1] * src2 */
+ " movq 40(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 72(%2), %%r8;"
+ " movq %%r8, 72(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 80(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[2] * src2 */
+ " movq 48(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 80(%2), %%r8;"
+ " movq %%r8, 80(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 88(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[3] * src2 */
+ " movq 56(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 88(%2), %%r8;"
+ " movq %%r8, 88(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 96(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 104(%2);"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 112(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 120(%2);"
+
+ /* Line up pointers */
+ " mov %2, %0;"
+ " mov %3, %2;"
+
+ /* Wrap the results back into the field */
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %1, %%r9;"
+ " movq %%r9, 8(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 16(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 24(%2);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%2);"
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 96(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 64(%0), %%r8;"
+ " mulxq 104(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 72(%0), %%r9;"
+ " mulxq 112(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 80(%0), %%r10;"
+ " mulxq 120(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 88(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %1, %%r9;"
+ " movq %%r9, 40(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 48(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 56(%2);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 32(%2);"
+ : "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "%r14", "memory", "cc");
}
-static void red_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
+/* Computes the field multiplication of four-element f1 with value in f2
+ * Requires f2 to be smaller than 2^17 */
+static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
{
- asm volatile(
- "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
- "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
- "xorl %%ebx, %%ebx ;"
- "adox (%1), %%r8 ;"
- "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
- "adcx %%r10, %%r9 ;"
- "adox 8(%1), %%r9 ;"
- "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
- "adcx %%r11, %%r10 ;"
- "adox 16(%1), %%r10 ;"
- "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
- "adcx %%rax, %%r11 ;"
- "adox 24(%1), %%r11 ;"
- /***************************************/
- "adcx %%rbx, %%rcx ;"
- "adox %%rbx, %%rcx ;"
- "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
- "adcx %%rcx, %%r8 ;"
- "adcx %%rbx, %%r9 ;"
- "movq %%r9, 8(%0) ;"
- "adcx %%rbx, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "adcx %%rbx, %%r11 ;"
- "movq %%r11, 24(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%edx, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, (%0) ;"
- :
- : "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
- "%r10", "%r11");
-}
+ register u64 f2_r asm("rdx") = f2;
-static void red_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
-{
asm volatile(
- "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
- "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
- "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
- "addq %%r10, %%r9 ;"
- "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
- "adcq %%r11, %%r10 ;"
- "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
- "adcq %%rax, %%r11 ;"
- /***************************************/
- "adcq $0, %%rcx ;"
- "addq (%1), %%r8 ;"
- "adcq 8(%1), %%r9 ;"
- "adcq 16(%1), %%r10 ;"
- "adcq 24(%1), %%r11 ;"
- "adcq $0, %%rcx ;"
- "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
- "addq %%rcx, %%r8 ;"
- "adcq $0, %%r9 ;"
- "movq %%r9, 8(%0) ;"
- "adcq $0, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "adcq $0, %%r11 ;"
- "movq %%r11, 24(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%edx, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, (%0) ;"
- :
- : "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
- "%r11");
+ /* Compute the raw multiplication of f1*f2 */
+ " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
+ " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
+ " add %%rcx, %%r9;"
+ " mov $0, %%rcx;"
+ " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
+ " adcx %%rbx, %%r10;"
+ " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
+ " adcx %%r13, %%r11;"
+ " adcx %%rcx, %%rax;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute carry*38 */
+ " mov $38, %%rdx;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%1);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%1);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%1);"
+ : "+&r"(f2_r)
+ : "r"(out), "r"(f1)
+ : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "memory", "cc");
}
-static __always_inline void
-add_eltfp25519_1w_adx(u64 *const c, const u64 *const a, const u64 *const b)
+/* Computes p1 <- bit ? p2 : p1 in constant time */
+static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
{
asm volatile(
- "mov $38, %%eax ;"
- "xorl %%ecx, %%ecx ;"
- "movq (%2), %%r8 ;"
- "adcx (%1), %%r8 ;"
- "movq 8(%2), %%r9 ;"
- "adcx 8(%1), %%r9 ;"
- "movq 16(%2), %%r10 ;"
- "adcx 16(%1), %%r10 ;"
- "movq 24(%2), %%r11 ;"
- "adcx 24(%1), %%r11 ;"
- "cmovc %%eax, %%ecx ;"
- "xorl %%eax, %%eax ;"
- "adcx %%rcx, %%r8 ;"
- "adcx %%rax, %%r9 ;"
- "movq %%r9, 8(%0) ;"
- "adcx %%rax, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "adcx %%rax, %%r11 ;"
- "movq %%r11, 24(%0) ;"
- "mov $38, %%ecx ;"
- "cmovc %%ecx, %%eax ;"
- "addq %%rax, %%r8 ;"
- "movq %%r8, (%0) ;"
- :
- : "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
+ /* Transfer bit into CF flag */
+ " add $18446744073709551615, %0;"
+
+ /* cswap p1[0], p2[0] */
+ " movq 0(%1), %%r8;"
+ " movq 0(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 0(%1);"
+ " movq %%r9, 0(%2);"
+
+ /* cswap p1[1], p2[1] */
+ " movq 8(%1), %%r8;"
+ " movq 8(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 8(%1);"
+ " movq %%r9, 8(%2);"
+
+ /* cswap p1[2], p2[2] */
+ " movq 16(%1), %%r8;"
+ " movq 16(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 16(%1);"
+ " movq %%r9, 16(%2);"
+
+ /* cswap p1[3], p2[3] */
+ " movq 24(%1), %%r8;"
+ " movq 24(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 24(%1);"
+ " movq %%r9, 24(%2);"
+
+ /* cswap p1[4], p2[4] */
+ " movq 32(%1), %%r8;"
+ " movq 32(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 32(%1);"
+ " movq %%r9, 32(%2);"
+
+ /* cswap p1[5], p2[5] */
+ " movq 40(%1), %%r8;"
+ " movq 40(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 40(%1);"
+ " movq %%r9, 40(%2);"
+
+ /* cswap p1[6], p2[6] */
+ " movq 48(%1), %%r8;"
+ " movq 48(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 48(%1);"
+ " movq %%r9, 48(%2);"
+
+ /* cswap p1[7], p2[7] */
+ " movq 56(%1), %%r8;"
+ " movq 56(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 56(%1);"
+ " movq %%r9, 56(%2);"
+ : "+&r"(bit)
+ : "r"(p1), "r"(p2)
+ : "%r8", "%r9", "%r10", "memory", "cc");
}
-static __always_inline void
-add_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a, const u64 *const b)
+/* Computes the square of a field element: out <- f * f
+ * Uses the 8-element buffer tmp for intermediate results */
+static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
{
asm volatile(
- "mov $38, %%eax ;"
- "movq (%2), %%r8 ;"
- "addq (%1), %%r8 ;"
- "movq 8(%2), %%r9 ;"
- "adcq 8(%1), %%r9 ;"
- "movq 16(%2), %%r10 ;"
- "adcq 16(%1), %%r10 ;"
- "movq 24(%2), %%r11 ;"
- "adcq 24(%1), %%r11 ;"
- "mov $0, %%ecx ;"
- "cmovc %%eax, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "adcq $0, %%r9 ;"
- "movq %%r9, 8(%0) ;"
- "adcq $0, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "adcq $0, %%r11 ;"
- "movq %%r11, 24(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%eax, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, (%0) ;"
- :
- : "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
+ /* Compute the raw multiplication: tmp <- f * f */
+
+ /* Step 1: Compute all partial products */
+ " movq 0(%0), %%rdx;" /* f[0] */
+ " mulxq 8(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 16(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%0), %%rdx;" /* f[3] */
+ " mulxq 8(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 8(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 16(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
+ " xor %%r15d, %%r15d;"
+ " adox %%rax, %%r10;"
+ " adcx %%r8, %%r8;"
+ " adox %%rcx, %%r11;"
+ " adcx %%r9, %%r9;"
+ " adox %%r15, %%rbx;"
+ " adcx %%r10, %%r10;"
+ " adox %%r15, %%r13;"
+ " adcx %%r11, %%r11;"
+ " adox %%r15, %%r14;"
+ " adcx %%rbx, %%rbx;"
+ " adcx %%r13, %%r13;"
+ " adcx %%r14, %%r14;"
+
+ /* Step 3: Compute intermediate squares */
+ " movq 0(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 0(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 8(%1);"
+ " movq 8(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 16(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 24(%1);"
+ " movq 16(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 32(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 40(%1);"
+ " movq 24(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 48(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 56(%1);"
+
+ /* Line up pointers */
+ " mov %1, %0;"
+ " mov %2, %1;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %%ecx, %%ecx;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%1);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%1);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%1);"
+ : "+&r"(f), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+ "%r13", "%r14", "%r15", "memory", "cc");
}
-static __always_inline void
-sub_eltfp25519_1w(u64 *const c, const u64 *const a, const u64 *const b)
+/* Computes two field squarings:
+ * out[0] <- f[0] * f[0]
+ * out[1] <- f[1] * f[1]
+ * Uses the 16-element buffer tmp for intermediate results */
+static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
{
asm volatile(
- "mov $38, %%eax ;"
- "movq (%1), %%r8 ;"
- "subq (%2), %%r8 ;"
- "movq 8(%1), %%r9 ;"
- "sbbq 8(%2), %%r9 ;"
- "movq 16(%1), %%r10 ;"
- "sbbq 16(%2), %%r10 ;"
- "movq 24(%1), %%r11 ;"
- "sbbq 24(%2), %%r11 ;"
- "mov $0, %%ecx ;"
- "cmovc %%eax, %%ecx ;"
- "subq %%rcx, %%r8 ;"
- "sbbq $0, %%r9 ;"
- "movq %%r9, 8(%0) ;"
- "sbbq $0, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "sbbq $0, %%r11 ;"
- "movq %%r11, 24(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%eax, %%ecx ;"
- "subq %%rcx, %%r8 ;"
- "movq %%r8, (%0) ;"
- :
- : "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
+ /* Step 1: Compute all partial products */
+ " movq 0(%0), %%rdx;" /* f[0] */
+ " mulxq 8(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 16(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%0), %%rdx;" /* f[3] */
+ " mulxq 8(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 8(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 16(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
+ " xor %%r15d, %%r15d;"
+ " adox %%rax, %%r10;"
+ " adcx %%r8, %%r8;"
+ " adox %%rcx, %%r11;"
+ " adcx %%r9, %%r9;"
+ " adox %%r15, %%rbx;"
+ " adcx %%r10, %%r10;"
+ " adox %%r15, %%r13;"
+ " adcx %%r11, %%r11;"
+ " adox %%r15, %%r14;"
+ " adcx %%rbx, %%rbx;"
+ " adcx %%r13, %%r13;"
+ " adcx %%r14, %%r14;"
+
+ /* Step 3: Compute intermediate squares */
+ " movq 0(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 0(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 8(%1);"
+ " movq 8(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 16(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 24(%1);"
+ " movq 16(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 32(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 40(%1);"
+ " movq 24(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 48(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 56(%1);"
+
+ /* Step 1: Compute all partial products */
+ " movq 32(%0), %%rdx;" /* f[0] */
+ " mulxq 40(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 48(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 56(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 56(%0), %%rdx;" /* f[3] */
+ " mulxq 40(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 48(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 40(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 48(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
+ " xor %%r15d, %%r15d;"
+ " adox %%rax, %%r10;"
+ " adcx %%r8, %%r8;"
+ " adox %%rcx, %%r11;"
+ " adcx %%r9, %%r9;"
+ " adox %%r15, %%rbx;"
+ " adcx %%r10, %%r10;"
+ " adox %%r15, %%r13;"
+ " adcx %%r11, %%r11;"
+ " adox %%r15, %%r14;"
+ " adcx %%rbx, %%rbx;"
+ " adcx %%r13, %%r13;"
+ " adcx %%r14, %%r14;"
+
+ /* Step 3: Compute intermediate squares */
+ " movq 32(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 64(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 72(%1);"
+ " movq 40(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 80(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 88(%1);"
+ " movq 48(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 96(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 104(%1);"
+ " movq 56(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 112(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 120(%1);"
+
+ /* Line up pointers */
+ " mov %1, %0;"
+ " mov %2, %1;"
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %%ecx, %%ecx;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%1);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%1);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%1);"
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 96(%0), %%r8, %%r13;"
+ " xor %%ecx, %%ecx;"
+ " adoxq 64(%0), %%r8;"
+ " mulxq 104(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 72(%0), %%r9;"
+ " mulxq 112(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 80(%0), %%r10;"
+ " mulxq 120(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 88(%0), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 40(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 48(%1);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 56(%1);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 32(%1);"
+ : "+&r"(f), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+ "%r13", "%r14", "%r15", "memory", "cc");
}
-/* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666 */
-static __always_inline void
-mul_a24_eltfp25519_1w(u64 *const c, const u64 *const a)
+static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
{
- const u64 a24 = 121666;
- asm volatile(
- "movq %2, %%rdx ;"
- "mulx (%1), %%r8, %%r10 ;"
- "mulx 8(%1), %%r9, %%r11 ;"
- "addq %%r10, %%r9 ;"
- "mulx 16(%1), %%r10, %%rax ;"
- "adcq %%r11, %%r10 ;"
- "mulx 24(%1), %%r11, %%rcx ;"
- "adcq %%rax, %%r11 ;"
- /**************************/
- "adcq $0, %%rcx ;"
- "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/
- "imul %%rdx, %%rcx ;"
- "addq %%rcx, %%r8 ;"
- "adcq $0, %%r9 ;"
- "movq %%r9, 8(%0) ;"
- "adcq $0, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "adcq $0, %%r11 ;"
- "movq %%r11, 24(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%edx, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, (%0) ;"
- :
- : "r"(c), "r"(a), "r"(a24)
- : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
- "%r11");
+ u64 *nq = p01_tmp1;
+ u64 *nq_p1 = p01_tmp1 + (u32)8U;
+ u64 *tmp1 = p01_tmp1 + (u32)16U;
+ u64 *x1 = q;
+ u64 *x2 = nq;
+ u64 *z2 = nq + (u32)4U;
+ u64 *z3 = nq_p1 + (u32)4U;
+ u64 *a = tmp1;
+ u64 *b = tmp1 + (u32)4U;
+ u64 *ab = tmp1;
+ u64 *dc = tmp1 + (u32)8U;
+ u64 *x3;
+ u64 *z31;
+ u64 *d0;
+ u64 *c0;
+ u64 *a1;
+ u64 *b1;
+ u64 *d;
+ u64 *c;
+ u64 *ab1;
+ u64 *dc1;
+ fadd(a, x2, z2);
+ fsub(b, x2, z2);
+ x3 = nq_p1;
+ z31 = nq_p1 + (u32)4U;
+ d0 = dc;
+ c0 = dc + (u32)4U;
+ fadd(c0, x3, z31);
+ fsub(d0, x3, z31);
+ fmul2(dc, dc, ab, tmp2);
+ fadd(x3, d0, c0);
+ fsub(z31, d0, c0);
+ a1 = tmp1;
+ b1 = tmp1 + (u32)4U;
+ d = tmp1 + (u32)8U;
+ c = tmp1 + (u32)12U;
+ ab1 = tmp1;
+ dc1 = tmp1 + (u32)8U;
+ fsqr2(dc1, ab1, tmp2);
+ fsqr2(nq_p1, nq_p1, tmp2);
+ a1[0U] = c[0U];
+ a1[1U] = c[1U];
+ a1[2U] = c[2U];
+ a1[3U] = c[3U];
+ fsub(c, d, c);
+ fmul_scalar(b1, c, (u64)121665U);
+ fadd(b1, b1, d);
+ fmul2(nq, dc1, ab1, tmp2);
+ fmul(z3, z3, x1, tmp2);
}
-static void inv_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
+static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2)
{
- struct {
- eltfp25519_1w_buffer buffer;
- eltfp25519_1w x0, x1, x2;
- } __aligned(32) m;
- u64 *T[4];
-
- T[0] = m.x0;
- T[1] = c; /* x^(-1) */
- T[2] = m.x1;
- T[3] = m.x2;
-
- copy_eltfp25519_1w(T[1], a);
- sqrn_eltfp25519_1w_adx(T[1], 1);
- copy_eltfp25519_1w(T[2], T[1]);
- sqrn_eltfp25519_1w_adx(T[2], 2);
- mul_eltfp25519_1w_adx(T[0], a, T[2]);
- mul_eltfp25519_1w_adx(T[1], T[1], T[0]);
- copy_eltfp25519_1w(T[2], T[1]);
- sqrn_eltfp25519_1w_adx(T[2], 1);
- mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
- copy_eltfp25519_1w(T[2], T[0]);
- sqrn_eltfp25519_1w_adx(T[2], 5);
- mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
- copy_eltfp25519_1w(T[2], T[0]);
- sqrn_eltfp25519_1w_adx(T[2], 10);
- mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
- copy_eltfp25519_1w(T[3], T[2]);
- sqrn_eltfp25519_1w_adx(T[3], 20);
- mul_eltfp25519_1w_adx(T[3], T[3], T[2]);
- sqrn_eltfp25519_1w_adx(T[3], 10);
- mul_eltfp25519_1w_adx(T[3], T[3], T[0]);
- copy_eltfp25519_1w(T[0], T[3]);
- sqrn_eltfp25519_1w_adx(T[0], 50);
- mul_eltfp25519_1w_adx(T[0], T[0], T[3]);
- copy_eltfp25519_1w(T[2], T[0]);
- sqrn_eltfp25519_1w_adx(T[2], 100);
- mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
- sqrn_eltfp25519_1w_adx(T[2], 50);
- mul_eltfp25519_1w_adx(T[2], T[2], T[3]);
- sqrn_eltfp25519_1w_adx(T[2], 5);
- mul_eltfp25519_1w_adx(T[1], T[1], T[2]);
-
- memzero_explicit(&m, sizeof(m));
+ u64 *x2 = nq;
+ u64 *z2 = nq + (u32)4U;
+ u64 *a = tmp1;
+ u64 *b = tmp1 + (u32)4U;
+ u64 *d = tmp1 + (u32)8U;
+ u64 *c = tmp1 + (u32)12U;
+ u64 *ab = tmp1;
+ u64 *dc = tmp1 + (u32)8U;
+ fadd(a, x2, z2);
+ fsub(b, x2, z2);
+ fsqr2(dc, ab, tmp2);
+ a[0U] = c[0U];
+ a[1U] = c[1U];
+ a[2U] = c[2U];
+ a[3U] = c[3U];
+ fsub(c, d, c);
+ fmul_scalar(b, c, (u64)121665U);
+ fadd(b, b, d);
+ fmul2(nq, dc, ab, tmp2);
}
-static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
+static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
{
- struct {
- eltfp25519_1w_buffer buffer;
- eltfp25519_1w x0, x1, x2;
- } __aligned(32) m;
- u64 *T[5];
-
- T[0] = m.x0;
- T[1] = c; /* x^(-1) */
- T[2] = m.x1;
- T[3] = m.x2;
-
- copy_eltfp25519_1w(T[1], a);
- sqrn_eltfp25519_1w_bmi2(T[1], 1);
- copy_eltfp25519_1w(T[2], T[1]);
- sqrn_eltfp25519_1w_bmi2(T[2], 2);
- mul_eltfp25519_1w_bmi2(T[0], a, T[2]);
- mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]);
- copy_eltfp25519_1w(T[2], T[1]);
- sqrn_eltfp25519_1w_bmi2(T[2], 1);
- mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
- copy_eltfp25519_1w(T[2], T[0]);
- sqrn_eltfp25519_1w_bmi2(T[2], 5);
- mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
- copy_eltfp25519_1w(T[2], T[0]);
- sqrn_eltfp25519_1w_bmi2(T[2], 10);
- mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
- copy_eltfp25519_1w(T[3], T[2]);
- sqrn_eltfp25519_1w_bmi2(T[3], 20);
- mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]);
- sqrn_eltfp25519_1w_bmi2(T[3], 10);
- mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]);
- copy_eltfp25519_1w(T[0], T[3]);
- sqrn_eltfp25519_1w_bmi2(T[0], 50);
- mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]);
- copy_eltfp25519_1w(T[2], T[0]);
- sqrn_eltfp25519_1w_bmi2(T[2], 100);
- mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
- sqrn_eltfp25519_1w_bmi2(T[2], 50);
- mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]);
- sqrn_eltfp25519_1w_bmi2(T[2], 5);
- mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]);
-
- memzero_explicit(&m, sizeof(m));
+ u64 tmp2[16U] = { 0U };
+ u64 p01_tmp1_swap[33U] = { 0U };
+ u64 *p0 = p01_tmp1_swap;
+ u64 *p01 = p01_tmp1_swap;
+ u64 *p03 = p01;
+ u64 *p11 = p01 + (u32)8U;
+ u64 *x0;
+ u64 *z0;
+ u64 *p01_tmp1;
+ u64 *p01_tmp11;
+ u64 *nq10;
+ u64 *nq_p11;
+ u64 *swap1;
+ u64 sw0;
+ u64 *nq1;
+ u64 *tmp1;
+ memcpy(p11, init1, (u32)8U * sizeof(init1[0U]));
+ x0 = p03;
+ z0 = p03 + (u32)4U;
+ x0[0U] = (u64)1U;
+ x0[1U] = (u64)0U;
+ x0[2U] = (u64)0U;
+ x0[3U] = (u64)0U;
+ z0[0U] = (u64)0U;
+ z0[1U] = (u64)0U;
+ z0[2U] = (u64)0U;
+ z0[3U] = (u64)0U;
+ p01_tmp1 = p01_tmp1_swap;
+ p01_tmp11 = p01_tmp1_swap;
+ nq10 = p01_tmp1_swap;
+ nq_p11 = p01_tmp1_swap + (u32)8U;
+ swap1 = p01_tmp1_swap + (u32)32U;
+ cswap2((u64)1U, nq10, nq_p11);
+ point_add_and_double(init1, p01_tmp11, tmp2);
+ swap1[0U] = (u64)1U;
+ {
+ u32 i;
+ for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) {
+ u64 *p01_tmp12 = p01_tmp1_swap;
+ u64 *swap2 = p01_tmp1_swap + (u32)32U;
+ u64 *nq2 = p01_tmp12;
+ u64 *nq_p12 = p01_tmp12 + (u32)8U;
+ u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U);
+ u64 sw = swap2[0U] ^ bit;
+ cswap2(sw, nq2, nq_p12);
+ point_add_and_double(init1, p01_tmp12, tmp2);
+ swap2[0U] = bit;
+ }
+ }
+ sw0 = swap1[0U];
+ cswap2(sw0, nq10, nq_p11);
+ nq1 = p01_tmp1;
+ tmp1 = p01_tmp1 + (u32)16U;
+ point_double(nq1, tmp1, tmp2);
+ point_double(nq1, tmp1, tmp2);
+ point_double(nq1, tmp1, tmp2);
+ memcpy(out, p0, (u32)8U * sizeof(p0[0U]));
+
+ memzero_explicit(tmp2, sizeof(tmp2));
+ memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
}
-/* Given c, a 256-bit number, fred_eltfp25519_1w updates c
- * with a number such that 0 <= C < 2**255-19.
- */
-static __always_inline void fred_eltfp25519_1w(u64 *const c)
+static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
{
- u64 tmp0 = 38, tmp1 = 19;
- asm volatile(
- "btrq $63, %3 ;" /* Put bit 255 in carry flag and clear */
- "cmovncl %k5, %k4 ;" /* c[255] ? 38 : 19 */
-
- /* Add either 19 or 38 to c */
- "addq %4, %0 ;"
- "adcq $0, %1 ;"
- "adcq $0, %2 ;"
- "adcq $0, %3 ;"
-
- /* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */
- "movl $0, %k4 ;"
- "cmovnsl %k5, %k4 ;" /* c[255] ? 0 : 19 */
- "btrq $63, %3 ;" /* Clear bit 255 */
-
- /* Subtract 19 if necessary */
- "subq %4, %0 ;"
- "sbbq $0, %1 ;"
- "sbbq $0, %2 ;"
- "sbbq $0, %3 ;"
-
- : "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "+r"(tmp0),
- "+r"(tmp1)
- :
- : "memory", "cc");
+ u32 i;
+ fsqr(o, inp, tmp);
+ for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U)
+ fsqr(o, o, tmp);
}
-static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py)
+static void finv(u64 *o, const u64 *i, u64 *tmp)
{
- u64 temp;
- asm volatile(
- "test %9, %9 ;"
- "movq %0, %8 ;"
- "cmovnzq %4, %0 ;"
- "cmovnzq %8, %4 ;"
- "movq %1, %8 ;"
- "cmovnzq %5, %1 ;"
- "cmovnzq %8, %5 ;"
- "movq %2, %8 ;"
- "cmovnzq %6, %2 ;"
- "cmovnzq %8, %6 ;"
- "movq %3, %8 ;"
- "cmovnzq %7, %3 ;"
- "cmovnzq %8, %7 ;"
- : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]),
- "+r"(py[0]), "+r"(py[1]), "+r"(py[2]), "+r"(py[3]),
- "=r"(temp)
- : "r"(bit)
- : "cc"
- );
+ u64 t1[16U] = { 0U };
+ u64 *a0 = t1;
+ u64 *b = t1 + (u32)4U;
+ u64 *c = t1 + (u32)8U;
+ u64 *t00 = t1 + (u32)12U;
+ u64 *tmp1 = tmp;
+ u64 *a;
+ u64 *t0;
+ fsquare_times(a0, i, tmp1, (u32)1U);
+ fsquare_times(t00, a0, tmp1, (u32)2U);
+ fmul(b, t00, i, tmp);
+ fmul(a0, b, a0, tmp);
+ fsquare_times(t00, a0, tmp1, (u32)1U);
+ fmul(b, t00, b, tmp);
+ fsquare_times(t00, b, tmp1, (u32)5U);
+ fmul(b, t00, b, tmp);
+ fsquare_times(t00, b, tmp1, (u32)10U);
+ fmul(c, t00, b, tmp);
+ fsquare_times(t00, c, tmp1, (u32)20U);
+ fmul(t00, t00, c, tmp);
+ fsquare_times(t00, t00, tmp1, (u32)10U);
+ fmul(b, t00, b, tmp);
+ fsquare_times(t00, b, tmp1, (u32)50U);
+ fmul(c, t00, b, tmp);
+ fsquare_times(t00, c, tmp1, (u32)100U);
+ fmul(t00, t00, c, tmp);
+ fsquare_times(t00, t00, tmp1, (u32)50U);
+ fmul(t00, t00, b, tmp);
+ fsquare_times(t00, t00, tmp1, (u32)5U);
+ a = t1;
+ t0 = t1 + (u32)12U;
+ fmul(o, t0, a, tmp);
}
-static __always_inline void cselect(u8 bit, u64 *const px, const u64 *const py)
+static void store_felem(u64 *b, u64 *f)
{
- asm volatile(
- "test %4, %4 ;"
- "cmovnzq %5, %0 ;"
- "cmovnzq %6, %1 ;"
- "cmovnzq %7, %2 ;"
- "cmovnzq %8, %3 ;"
- : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3])
- : "r"(bit), "rm"(py[0]), "rm"(py[1]), "rm"(py[2]), "rm"(py[3])
- : "cc"
- );
+ u64 f30 = f[3U];
+ u64 top_bit0 = f30 >> (u32)63U;
+ u64 f31;
+ u64 top_bit;
+ u64 f0;
+ u64 f1;
+ u64 f2;
+ u64 f3;
+ u64 m0;
+ u64 m1;
+ u64 m2;
+ u64 m3;
+ u64 mask;
+ u64 f0_;
+ u64 f1_;
+ u64 f2_;
+ u64 f3_;
+ u64 o0;
+ u64 o1;
+ u64 o2;
+ u64 o3;
+ f[3U] = f30 & (u64)0x7fffffffffffffffU;
+ add_scalar(f, f, (u64)19U * top_bit0);
+ f31 = f[3U];
+ top_bit = f31 >> (u32)63U;
+ f[3U] = f31 & (u64)0x7fffffffffffffffU;
+ add_scalar(f, f, (u64)19U * top_bit);
+ f0 = f[0U];
+ f1 = f[1U];
+ f2 = f[2U];
+ f3 = f[3U];
+ m0 = gte_mask(f0, (u64)0xffffffffffffffedU);
+ m1 = eq_mask(f1, (u64)0xffffffffffffffffU);
+ m2 = eq_mask(f2, (u64)0xffffffffffffffffU);
+ m3 = eq_mask(f3, (u64)0x7fffffffffffffffU);
+ mask = ((m0 & m1) & m2) & m3;
+ f0_ = f0 - (mask & (u64)0xffffffffffffffedU);
+ f1_ = f1 - (mask & (u64)0xffffffffffffffffU);
+ f2_ = f2 - (mask & (u64)0xffffffffffffffffU);
+ f3_ = f3 - (mask & (u64)0x7fffffffffffffffU);
+ o0 = f0_;
+ o1 = f1_;
+ o2 = f2_;
+ o3 = f3_;
+ b[0U] = o0;
+ b[1U] = o1;
+ b[2U] = o2;
+ b[3U] = o3;
}
-static void curve25519_adx(u8 shared[CURVE25519_KEY_SIZE],
- const u8 private_key[CURVE25519_KEY_SIZE],
- const u8 session_key[CURVE25519_KEY_SIZE])
+static void encode_point(u8 *o, const u64 *i)
{
- struct {
- u64 buffer[4 * NUM_WORDS_ELTFP25519];
- u64 coordinates[4 * NUM_WORDS_ELTFP25519];
- u64 workspace[6 * NUM_WORDS_ELTFP25519];
- u8 session[CURVE25519_KEY_SIZE];
- u8 private[CURVE25519_KEY_SIZE];
- } __aligned(32) m;
-
- int i = 0, j = 0;
- u64 prev = 0;
- u64 *const X1 = (u64 *)m.session;
- u64 *const key = (u64 *)m.private;
- u64 *const Px = m.coordinates + 0;
- u64 *const Pz = m.coordinates + 4;
- u64 *const Qx = m.coordinates + 8;
- u64 *const Qz = m.coordinates + 12;
- u64 *const X2 = Qx;
- u64 *const Z2 = Qz;
- u64 *const X3 = Px;
- u64 *const Z3 = Pz;
- u64 *const X2Z2 = Qx;
- u64 *const X3Z3 = Px;
-
- u64 *const A = m.workspace + 0;
- u64 *const B = m.workspace + 4;
- u64 *const D = m.workspace + 8;
- u64 *const C = m.workspace + 12;
- u64 *const DA = m.workspace + 16;
- u64 *const CB = m.workspace + 20;
- u64 *const AB = A;
- u64 *const DC = D;
- u64 *const DACB = DA;
-
- memcpy(m.private, private_key, sizeof(m.private));
- memcpy(m.session, session_key, sizeof(m.session));
-
- curve25519_clamp_secret(m.private);
-
- /* As in the draft:
- * When receiving such an array, implementations of curve25519
- * MUST mask the most-significant bit in the final byte. This
- * is done to preserve compatibility with point formats which
- * reserve the sign bit for use in other protocols and to
- * increase resistance to implementation fingerprinting
- */
- m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
-
- copy_eltfp25519_1w(Px, X1);
- setzero_eltfp25519_1w(Pz);
- setzero_eltfp25519_1w(Qx);
- setzero_eltfp25519_1w(Qz);
-
- Pz[0] = 1;
- Qx[0] = 1;
-
- /* main-loop */
- prev = 0;
- j = 62;
- for (i = 3; i >= 0; --i) {
- while (j >= 0) {
- u64 bit = (key[i] >> j) & 0x1;
- u64 swap = bit ^ prev;
- prev = bit;
-
- add_eltfp25519_1w_adx(A, X2, Z2); /* A = (X2+Z2) */
- sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */
- add_eltfp25519_1w_adx(C, X3, Z3); /* C = (X3+Z3) */
- sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */
- mul_eltfp25519_2w_adx(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
-
- cselect(swap, A, C);
- cselect(swap, B, D);
-
- sqr_eltfp25519_2w_adx(AB); /* [AA|BB] = [A^2|B^2] */
- add_eltfp25519_1w_adx(X3, DA, CB); /* X3 = (DA+CB) */
- sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */
- sqr_eltfp25519_2w_adx(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
-
- copy_eltfp25519_1w(X2, B); /* X2 = B^2 */
- sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */
-
- mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */
- add_eltfp25519_1w_adx(B, B, X2); /* B = a24*E+B */
- mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
- mul_eltfp25519_1w_adx(Z3, Z3, X1); /* Z3 = Z3*X1 */
- --j;
- }
- j = 63;
- }
-
- inv_eltfp25519_1w_adx(A, Qz);
- mul_eltfp25519_1w_adx((u64 *)shared, Qx, A);
- fred_eltfp25519_1w((u64 *)shared);
-
- memzero_explicit(&m, sizeof(m));
+ const u64 *x = i;
+ const u64 *z = i + (u32)4U;
+ u64 tmp[4U] = { 0U };
+ u64 tmp_w[16U] = { 0U };
+ finv(tmp, z, tmp_w);
+ fmul(tmp, tmp, x, tmp_w);
+ store_felem((u64 *)o, tmp);
}
-static void curve25519_adx_base(u8 session_key[CURVE25519_KEY_SIZE],
- const u8 private_key[CURVE25519_KEY_SIZE])
+static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub)
{
- struct {
- u64 buffer[4 * NUM_WORDS_ELTFP25519];
- u64 coordinates[4 * NUM_WORDS_ELTFP25519];
- u64 workspace[4 * NUM_WORDS_ELTFP25519];
- u8 private[CURVE25519_KEY_SIZE];
- } __aligned(32) m;
-
- const int ite[4] = { 64, 64, 64, 63 };
- const int q = 3;
- u64 swap = 1;
-
- int i = 0, j = 0, k = 0;
- u64 *const key = (u64 *)m.private;
- u64 *const Ur1 = m.coordinates + 0;
- u64 *const Zr1 = m.coordinates + 4;
- u64 *const Ur2 = m.coordinates + 8;
- u64 *const Zr2 = m.coordinates + 12;
-
- u64 *const UZr1 = m.coordinates + 0;
- u64 *const ZUr2 = m.coordinates + 8;
-
- u64 *const A = m.workspace + 0;
- u64 *const B = m.workspace + 4;
- u64 *const C = m.workspace + 8;
- u64 *const D = m.workspace + 12;
-
- u64 *const AB = m.workspace + 0;
- u64 *const CD = m.workspace + 8;
-
- const u64 *const P = table_ladder_8k;
-
- memcpy(m.private, private_key, sizeof(m.private));
-
- curve25519_clamp_secret(m.private);
-
- setzero_eltfp25519_1w(Ur1);
- setzero_eltfp25519_1w(Zr1);
- setzero_eltfp25519_1w(Zr2);
- Ur1[0] = 1;
- Zr1[0] = 1;
- Zr2[0] = 1;
-
- /* G-S */
- Ur2[3] = 0x1eaecdeee27cab34UL;
- Ur2[2] = 0xadc7a0b9235d48e2UL;
- Ur2[1] = 0xbbf095ae14b2edf8UL;
- Ur2[0] = 0x7e94e1fec82faabdUL;
-
- /* main-loop */
- j = q;
- for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
- while (j < ite[i]) {
- u64 bit = (key[i] >> j) & 0x1;
- k = (64 * i + j - q);
- swap = swap ^ bit;
- cswap(swap, Ur1, Ur2);
- cswap(swap, Zr1, Zr2);
- swap = bit;
- /* Addition */
- sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
- add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */
- mul_eltfp25519_1w_adx(C, &P[4 * k], B); /* C = M0-B */
- sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
- add_eltfp25519_1w_adx(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
- sqr_eltfp25519_2w_adx(AB); /* A = A^2 | B = B^2 */
- mul_eltfp25519_2w_adx(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */
- ++j;
+ u64 init1[8U] = { 0U };
+ u64 tmp[4U] = { 0U };
+ u64 tmp3;
+ u64 *x;
+ u64 *z;
+ {
+ u32 i;
+ for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) {
+ u64 *os = tmp;
+ const u8 *bj = pub + i * (u32)8U;
+ u64 u = *(u64 *)bj;
+ u64 r = u;
+ u64 x0 = r;
+ os[i] = x0;
}
- j = 0;
}
-
- /* Doubling */
- for (i = 0; i < q; ++i) {
- add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */
- sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
- sqr_eltfp25519_2w_adx(AB); /* A = A**2 B = B**2 */
- copy_eltfp25519_1w(C, B); /* C = B */
- sub_eltfp25519_1w(B, A, B); /* B = A-B */
- mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */
- add_eltfp25519_1w_adx(D, D, C); /* D = D+C */
- mul_eltfp25519_2w_adx(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
- }
-
- /* Convert to affine coordinates */
- inv_eltfp25519_1w_adx(A, Zr1);
- mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A);
- fred_eltfp25519_1w((u64 *)session_key);
-
- memzero_explicit(&m, sizeof(m));
+ tmp3 = tmp[3U];
+ tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU;
+ x = init1;
+ z = init1 + (u32)4U;
+ z[0U] = (u64)1U;
+ z[1U] = (u64)0U;
+ z[2U] = (u64)0U;
+ z[3U] = (u64)0U;
+ x[0U] = tmp[0U];
+ x[1U] = tmp[1U];
+ x[2U] = tmp[2U];
+ x[3U] = tmp[3U];
+ montgomery_ladder(init1, priv, init1);
+ encode_point(out, init1);
}
-static void curve25519_bmi2(u8 shared[CURVE25519_KEY_SIZE],
- const u8 private_key[CURVE25519_KEY_SIZE],
- const u8 session_key[CURVE25519_KEY_SIZE])
-{
- struct {
- u64 buffer[4 * NUM_WORDS_ELTFP25519];
- u64 coordinates[4 * NUM_WORDS_ELTFP25519];
- u64 workspace[6 * NUM_WORDS_ELTFP25519];
- u8 session[CURVE25519_KEY_SIZE];
- u8 private[CURVE25519_KEY_SIZE];
- } __aligned(32) m;
-
- int i = 0, j = 0;
- u64 prev = 0;
- u64 *const X1 = (u64 *)m.session;
- u64 *const key = (u64 *)m.private;
- u64 *const Px = m.coordinates + 0;
- u64 *const Pz = m.coordinates + 4;
- u64 *const Qx = m.coordinates + 8;
- u64 *const Qz = m.coordinates + 12;
- u64 *const X2 = Qx;
- u64 *const Z2 = Qz;
- u64 *const X3 = Px;
- u64 *const Z3 = Pz;
- u64 *const X2Z2 = Qx;
- u64 *const X3Z3 = Px;
-
- u64 *const A = m.workspace + 0;
- u64 *const B = m.workspace + 4;
- u64 *const D = m.workspace + 8;
- u64 *const C = m.workspace + 12;
- u64 *const DA = m.workspace + 16;
- u64 *const CB = m.workspace + 20;
- u64 *const AB = A;
- u64 *const DC = D;
- u64 *const DACB = DA;
-
- memcpy(m.private, private_key, sizeof(m.private));
- memcpy(m.session, session_key, sizeof(m.session));
-
- curve25519_clamp_secret(m.private);
-
- /* As in the draft:
- * When receiving such an array, implementations of curve25519
- * MUST mask the most-significant bit in the final byte. This
- * is done to preserve compatibility with point formats which
- * reserve the sign bit for use in other protocols and to
- * increase resistance to implementation fingerprinting
- */
- m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
-
- copy_eltfp25519_1w(Px, X1);
- setzero_eltfp25519_1w(Pz);
- setzero_eltfp25519_1w(Qx);
- setzero_eltfp25519_1w(Qz);
-
- Pz[0] = 1;
- Qx[0] = 1;
-
- /* main-loop */
- prev = 0;
- j = 62;
- for (i = 3; i >= 0; --i) {
- while (j >= 0) {
- u64 bit = (key[i] >> j) & 0x1;
- u64 swap = bit ^ prev;
- prev = bit;
-
- add_eltfp25519_1w_bmi2(A, X2, Z2); /* A = (X2+Z2) */
- sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */
- add_eltfp25519_1w_bmi2(C, X3, Z3); /* C = (X3+Z3) */
- sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */
- mul_eltfp25519_2w_bmi2(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
-
- cselect(swap, A, C);
- cselect(swap, B, D);
-
- sqr_eltfp25519_2w_bmi2(AB); /* [AA|BB] = [A^2|B^2] */
- add_eltfp25519_1w_bmi2(X3, DA, CB); /* X3 = (DA+CB) */
- sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */
- sqr_eltfp25519_2w_bmi2(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
-
- copy_eltfp25519_1w(X2, B); /* X2 = B^2 */
- sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */
-
- mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */
- add_eltfp25519_1w_bmi2(B, B, X2); /* B = a24*E+B */
- mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
- mul_eltfp25519_1w_bmi2(Z3, Z3, X1); /* Z3 = Z3*X1 */
- --j;
- }
- j = 63;
- }
-
- inv_eltfp25519_1w_bmi2(A, Qz);
- mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A);
- fred_eltfp25519_1w((u64 *)shared);
+/* The below constants were generated using this sage script:
+ *
+ * #!/usr/bin/env sage
+ * import sys
+ * from sage.all import *
+ * def limbs(n):
+ * n = int(n)
+ * l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64)
+ * return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l
+ * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0])
+ * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0]
+ * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s))
+ * print("static const u64 table_ladder[] = {")
+ * p = ec.lift_x(9)
+ * for i in range(252):
+ * l = (p[0] + p[2]) / (p[0] - p[2])
+ * print(("\t%s" + ("," if i != 251 else "")) % limbs(l))
+ * p = p * 2
+ * print("};")
+ *
+ */
- memzero_explicit(&m, sizeof(m));
-}
+static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL };
+
+static const u64 table_ladder[] = {
+ 0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL,
+ 0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL,
+ 0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL,
+ 0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL,
+ 0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL,
+ 0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL,
+ 0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL,
+ 0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL,
+ 0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL,
+ 0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL,
+ 0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL,
+ 0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL,
+ 0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL,
+ 0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL,
+ 0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL,
+ 0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL,
+ 0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL,
+ 0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL,
+ 0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL,
+ 0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL,
+ 0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL,
+ 0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL,
+ 0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL,
+ 0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL,
+ 0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL,
+ 0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL,
+ 0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL,
+ 0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL,
+ 0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL,
+ 0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL,
+ 0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL,
+ 0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL,
+ 0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL,
+ 0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL,
+ 0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL,
+ 0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL,
+ 0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL,
+ 0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL,
+ 0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL,
+ 0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL,
+ 0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL,
+ 0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL,
+ 0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL,
+ 0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL,
+ 0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL,
+ 0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL,
+ 0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL,
+ 0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL,
+ 0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL,
+ 0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL,
+ 0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL,
+ 0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL,
+ 0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL,
+ 0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL,
+ 0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL,
+ 0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL,
+ 0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL,
+ 0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL,
+ 0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL,
+ 0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL,
+ 0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL,
+ 0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL,
+ 0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL,
+ 0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL,
+ 0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL,
+ 0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL,
+ 0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL,
+ 0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL,
+ 0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL,
+ 0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL,
+ 0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL,
+ 0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL,
+ 0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL,
+ 0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL,
+ 0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL,
+ 0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL,
+ 0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL,
+ 0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL,
+ 0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL,
+ 0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL,
+ 0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL,
+ 0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL,
+ 0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL,
+ 0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL,
+ 0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL,
+ 0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL,
+ 0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL,
+ 0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL,
+ 0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL,
+ 0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL,
+ 0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL,
+ 0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL,
+ 0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL,
+ 0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL,
+ 0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL,
+ 0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL,
+ 0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL,
+ 0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL,
+ 0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL,
+ 0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL,
+ 0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL,
+ 0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL,
+ 0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL,
+ 0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL,
+ 0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL,
+ 0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL,
+ 0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL,
+ 0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL,
+ 0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL,
+ 0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL,
+ 0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL,
+ 0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL,
+ 0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL,
+ 0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL,
+ 0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL,
+ 0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL,
+ 0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL,
+ 0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL,
+ 0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL,
+ 0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL,
+ 0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL,
+ 0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL,
+ 0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL,
+ 0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL,
+ 0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL,
+ 0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL,
+ 0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL,
+ 0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL,
+ 0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL,
+ 0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL,
+ 0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL,
+ 0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL,
+ 0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL,
+ 0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL,
+ 0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL,
+ 0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL,
+ 0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL,
+ 0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL,
+ 0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL,
+ 0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL,
+ 0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL,
+ 0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL,
+ 0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL,
+ 0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL,
+ 0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL,
+ 0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL,
+ 0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL,
+ 0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL,
+ 0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL,
+ 0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL,
+ 0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL,
+ 0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL,
+ 0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL,
+ 0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL,
+ 0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL,
+ 0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL,
+ 0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL,
+ 0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL,
+ 0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL,
+ 0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL,
+ 0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL,
+ 0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL,
+ 0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL,
+ 0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL,
+ 0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL,
+ 0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL,
+ 0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL,
+ 0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL,
+ 0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL,
+ 0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL,
+ 0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL,
+ 0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL,
+ 0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL,
+ 0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL,
+ 0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL,
+ 0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL,
+ 0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL,
+ 0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL,
+ 0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL,
+ 0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL,
+ 0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL,
+ 0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL,
+ 0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL,
+ 0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL,
+ 0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL,
+ 0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL,
+ 0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL,
+ 0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL,
+ 0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL,
+ 0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL,
+ 0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL,
+ 0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL,
+ 0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL,
+ 0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL,
+ 0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL,
+ 0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL,
+ 0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL,
+ 0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL,
+ 0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL,
+ 0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL,
+ 0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL,
+ 0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL,
+ 0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL,
+ 0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL,
+ 0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL,
+ 0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL,
+ 0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL,
+ 0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL,
+ 0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL,
+ 0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL,
+ 0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL,
+ 0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL,
+ 0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL,
+ 0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL,
+ 0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL,
+ 0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL,
+ 0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL,
+ 0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL,
+ 0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL,
+ 0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL,
+ 0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL,
+ 0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL,
+ 0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL,
+ 0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL,
+ 0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL,
+ 0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL,
+ 0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL,
+ 0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL,
+ 0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL,
+ 0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL,
+ 0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL,
+ 0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL,
+ 0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL,
+ 0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL,
+ 0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL,
+ 0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL,
+ 0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL,
+ 0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL,
+ 0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL,
+ 0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL,
+ 0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL,
+ 0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL,
+ 0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL,
+ 0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL,
+ 0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL,
+ 0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL,
+ 0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL,
+ 0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL,
+ 0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL,
+ 0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL,
+ 0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL,
+ 0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL
+};
-static void curve25519_bmi2_base(u8 session_key[CURVE25519_KEY_SIZE],
- const u8 private_key[CURVE25519_KEY_SIZE])
+static void curve25519_ever64_base(u8 *out, const u8 *priv)
{
- struct {
- u64 buffer[4 * NUM_WORDS_ELTFP25519];
- u64 coordinates[4 * NUM_WORDS_ELTFP25519];
- u64 workspace[4 * NUM_WORDS_ELTFP25519];
- u8 private[CURVE25519_KEY_SIZE];
- } __aligned(32) m;
-
- const int ite[4] = { 64, 64, 64, 63 };
- const int q = 3;
u64 swap = 1;
-
- int i = 0, j = 0, k = 0;
- u64 *const key = (u64 *)m.private;
- u64 *const Ur1 = m.coordinates + 0;
- u64 *const Zr1 = m.coordinates + 4;
- u64 *const Ur2 = m.coordinates + 8;
- u64 *const Zr2 = m.coordinates + 12;
-
- u64 *const UZr1 = m.coordinates + 0;
- u64 *const ZUr2 = m.coordinates + 8;
-
- u64 *const A = m.workspace + 0;
- u64 *const B = m.workspace + 4;
- u64 *const C = m.workspace + 8;
- u64 *const D = m.workspace + 12;
-
- u64 *const AB = m.workspace + 0;
- u64 *const CD = m.workspace + 8;
-
- const u64 *const P = table_ladder_8k;
-
- memcpy(m.private, private_key, sizeof(m.private));
-
- curve25519_clamp_secret(m.private);
-
- setzero_eltfp25519_1w(Ur1);
- setzero_eltfp25519_1w(Zr1);
- setzero_eltfp25519_1w(Zr2);
- Ur1[0] = 1;
- Zr1[0] = 1;
- Zr2[0] = 1;
-
- /* G-S */
- Ur2[3] = 0x1eaecdeee27cab34UL;
- Ur2[2] = 0xadc7a0b9235d48e2UL;
- Ur2[1] = 0xbbf095ae14b2edf8UL;
- Ur2[0] = 0x7e94e1fec82faabdUL;
-
- /* main-loop */
- j = q;
- for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
- while (j < ite[i]) {
- u64 bit = (key[i] >> j) & 0x1;
- k = (64 * i + j - q);
+ int i, j, k;
+ u64 tmp[16 + 32 + 4];
+ u64 *x1 = &tmp[0];
+ u64 *z1 = &tmp[4];
+ u64 *x2 = &tmp[8];
+ u64 *z2 = &tmp[12];
+ u64 *xz1 = &tmp[0];
+ u64 *xz2 = &tmp[8];
+ u64 *a = &tmp[0 + 16];
+ u64 *b = &tmp[4 + 16];
+ u64 *c = &tmp[8 + 16];
+ u64 *ab = &tmp[0 + 16];
+ u64 *abcd = &tmp[0 + 16];
+ u64 *ef = &tmp[16 + 16];
+ u64 *efgh = &tmp[16 + 16];
+ u64 *key = &tmp[0 + 16 + 32];
+
+ memcpy(key, priv, 32);
+ ((u8 *)key)[0] &= 248;
+ ((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64;
+
+ x1[0] = 1, x1[1] = x1[2] = x1[3] = 0;
+ z1[0] = 1, z1[1] = z1[2] = z1[3] = 0;
+ z2[0] = 1, z2[1] = z2[2] = z2[3] = 0;
+ memcpy(x2, p_minus_s, sizeof(p_minus_s));
+
+ j = 3;
+ for (i = 0; i < 4; ++i) {
+ while (j < (const int[]){ 64, 64, 64, 63 }[i]) {
+ u64 bit = (key[i] >> j) & 1;
+ k = (64 * i + j - 3);
swap = swap ^ bit;
- cswap(swap, Ur1, Ur2);
- cswap(swap, Zr1, Zr2);
+ cswap2(swap, xz1, xz2);
swap = bit;
- /* Addition */
- sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
- add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */
- mul_eltfp25519_1w_bmi2(C, &P[4 * k], B);/* C = M0-B */
- sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
- add_eltfp25519_1w_bmi2(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
- sqr_eltfp25519_2w_bmi2(AB); /* A = A^2 | B = B^2 */
- mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */
+ fsub(b, x1, z1);
+ fadd(a, x1, z1);
+ fmul(c, &table_ladder[4 * k], b, ef);
+ fsub(b, a, c);
+ fadd(a, a, c);
+ fsqr2(ab, ab, efgh);
+ fmul2(xz1, xz2, ab, efgh);
++j;
}
j = 0;
}
- /* Doubling */
- for (i = 0; i < q; ++i) {
- add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */
- sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
- sqr_eltfp25519_2w_bmi2(AB); /* A = A**2 B = B**2 */
- copy_eltfp25519_1w(C, B); /* C = B */
- sub_eltfp25519_1w(B, A, B); /* B = A-B */
- mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */
- add_eltfp25519_1w_bmi2(D, D, C); /* D = D+C */
- mul_eltfp25519_2w_bmi2(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
- }
-
- /* Convert to affine coordinates */
- inv_eltfp25519_1w_bmi2(A, Zr1);
- mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A);
- fred_eltfp25519_1w((u64 *)session_key);
+ point_double(xz1, abcd, efgh);
+ point_double(xz1, abcd, efgh);
+ point_double(xz1, abcd, efgh);
+ encode_point(out, xz1);
- memzero_explicit(&m, sizeof(m));
+ memzero_explicit(tmp, sizeof(tmp));
}
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx);
+
void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
const u8 secret[CURVE25519_KEY_SIZE],
const u8 basepoint[CURVE25519_KEY_SIZE])
{
- if (static_branch_likely(&curve25519_use_adx))
- curve25519_adx(mypublic, secret, basepoint);
- else if (static_branch_likely(&curve25519_use_bmi2))
- curve25519_bmi2(mypublic, secret, basepoint);
+ if (static_branch_likely(&curve25519_use_bmi2_adx))
+ curve25519_ever64(mypublic, secret, basepoint);
else
curve25519_generic(mypublic, secret, basepoint);
}
@@ -2355,10 +1605,8 @@ EXPORT_SYMBOL(curve25519_arch);
void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
const u8 secret[CURVE25519_KEY_SIZE])
{
- if (static_branch_likely(&curve25519_use_adx))
- curve25519_adx_base(pub, secret);
- else if (static_branch_likely(&curve25519_use_bmi2))
- curve25519_bmi2_base(pub, secret);
+ if (static_branch_likely(&curve25519_use_bmi2_adx))
+ curve25519_ever64_base(pub, secret);
else
curve25519_generic(pub, secret, curve25519_base_point);
}
@@ -2449,12 +1697,11 @@ static struct kpp_alg curve25519_alg = {
.max_size = curve25519_max_size,
};
+
static int __init curve25519_mod_init(void)
{
- if (boot_cpu_has(X86_FEATURE_BMI2))
- static_branch_enable(&curve25519_use_bmi2);
- else if (boot_cpu_has(X86_FEATURE_ADX))
- static_branch_enable(&curve25519_use_adx);
+ if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
+ static_branch_enable(&curve25519_use_bmi2_adx);
else
return 0;
return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
@@ -2464,7 +1711,7 @@ static int __init curve25519_mod_init(void)
static void __exit curve25519_mod_exit(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
- (boot_cpu_has(X86_FEATURE_BMI2) || boot_cpu_has(X86_FEATURE_ADX)))
+ static_branch_likely(&curve25519_use_bmi2_adx))
crypto_unregister_kpp(&curve25519_alg);
}
@@ -2474,3 +1721,4 @@ module_exit(curve25519_mod_exit);
MODULE_ALIAS_CRYPTO("curve25519");
MODULE_ALIAS_CRYPTO("curve25519-x86");
MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index fac0fdc3f25d..f4c760f4cade 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -243,7 +243,7 @@ SYM_FUNC_START(des3_ede_x86_64_crypt_blk)
popq %r12;
popq %rbx;
- ret;
+ RET;
SYM_FUNC_END(des3_ede_x86_64_crypt_blk)
/***********************************************************************
@@ -528,7 +528,7 @@ SYM_FUNC_START(des3_ede_x86_64_crypt_blk_3way)
popq %r12;
popq %rbx;
- ret;
+ RET;
SYM_FUNC_END(des3_ede_x86_64_crypt_blk_3way)
.section .rodata, "a", @progbits
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
index 89830e531350..abb8b1fe123b 100644
--- a/arch/x86/crypto/des3_ede_glue.c
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -6,8 +6,6 @@
*
* CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
* Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- * CTR part based on code (crypto/ctr.c) by:
- * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
*/
#include <crypto/algapi.h>
@@ -47,14 +45,6 @@ static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
des3_ede_x86_64_crypt_blk(dec_ctx, dst, src);
}
-static inline void des3_ede_enc_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- u32 *enc_ctx = ctx->enc.expkey;
-
- des3_ede_x86_64_crypt_blk_3way(enc_ctx, dst, src);
-}
-
static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
const u8 *src)
{
@@ -166,7 +156,7 @@ static int cbc_encrypt(struct skcipher_request *req)
err = skcipher_walk_virt(&walk, req, false);
- while ((nbytes = walk.nbytes)) {
+ while (walk.nbytes) {
nbytes = __cbc_encrypt(ctx, &walk);
err = skcipher_walk_done(&walk, nbytes);
}
@@ -245,7 +235,7 @@ static int cbc_decrypt(struct skcipher_request *req)
err = skcipher_walk_virt(&walk, req, false);
- while ((nbytes = walk.nbytes)) {
+ while (walk.nbytes) {
nbytes = __cbc_decrypt(ctx, &walk);
err = skcipher_walk_done(&walk, nbytes);
}
@@ -253,94 +243,6 @@ static int cbc_decrypt(struct skcipher_request *req)
return err;
}
-static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx,
- struct skcipher_walk *walk)
-{
- u8 *ctrblk = walk->iv;
- u8 keystream[DES3_EDE_BLOCK_SIZE];
- u8 *src = walk->src.virt.addr;
- u8 *dst = walk->dst.virt.addr;
- unsigned int nbytes = walk->nbytes;
-
- des3_ede_enc_blk(ctx, keystream, ctrblk);
- crypto_xor_cpy(dst, keystream, src, nbytes);
-
- crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE);
-}
-
-static unsigned int __ctr_crypt(struct des3_ede_x86_ctx *ctx,
- struct skcipher_walk *walk)
-{
- unsigned int bsize = DES3_EDE_BLOCK_SIZE;
- unsigned int nbytes = walk->nbytes;
- __be64 *src = (__be64 *)walk->src.virt.addr;
- __be64 *dst = (__be64 *)walk->dst.virt.addr;
- u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
- __be64 ctrblocks[3];
-
- /* Process four block batch */
- if (nbytes >= bsize * 3) {
- do {
- /* create ctrblks for parallel encrypt */
- ctrblocks[0] = cpu_to_be64(ctrblk++);
- ctrblocks[1] = cpu_to_be64(ctrblk++);
- ctrblocks[2] = cpu_to_be64(ctrblk++);
-
- des3_ede_enc_blk_3way(ctx, (u8 *)ctrblocks,
- (u8 *)ctrblocks);
-
- dst[0] = src[0] ^ ctrblocks[0];
- dst[1] = src[1] ^ ctrblocks[1];
- dst[2] = src[2] ^ ctrblocks[2];
-
- src += 3;
- dst += 3;
- } while ((nbytes -= bsize * 3) >= bsize * 3);
-
- if (nbytes < bsize)
- goto done;
- }
-
- /* Handle leftovers */
- do {
- ctrblocks[0] = cpu_to_be64(ctrblk++);
-
- des3_ede_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
-
- dst[0] = src[0] ^ ctrblocks[0];
-
- src += 1;
- dst += 1;
- } while ((nbytes -= bsize) >= bsize);
-
-done:
- *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
- return nbytes;
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes) >= DES3_EDE_BLOCK_SIZE) {
- nbytes = __ctr_crypt(ctx, &walk);
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- if (nbytes) {
- ctr_crypt_final(ctx, &walk);
- err = skcipher_walk_done(&walk, 0);
- }
-
- return err;
-}
-
static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen)
{
@@ -428,20 +330,6 @@ static struct skcipher_alg des3_ede_skciphers[] = {
.setkey = des3_ede_x86_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "ctr(des3_ede)",
- .base.cra_driver_name = "ctr-des3_ede-asm",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = DES3_EDE_KEY_SIZE,
- .max_keysize = DES3_EDE_KEY_SIZE,
- .ivsize = DES3_EDE_BLOCK_SIZE,
- .chunksize = DES3_EDE_BLOCK_SIZE,
- .setkey = des3_ede_x86_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
}
};
diff --git a/arch/x86/crypto/ecb_cbc_helpers.h b/arch/x86/crypto/ecb_cbc_helpers.h
new file mode 100644
index 000000000000..eaa15c7b29d6
--- /dev/null
+++ b/arch/x86/crypto/ecb_cbc_helpers.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _CRYPTO_ECB_CBC_HELPER_H
+#define _CRYPTO_ECB_CBC_HELPER_H
+
+#include <crypto/internal/skcipher.h>
+#include <asm/fpu/api.h>
+
+/*
+ * Mode helpers to instantiate parameterized skcipher ECB/CBC modes without
+ * having to rely on indirect calls and retpolines.
+ */
+
+#define ECB_WALK_START(req, bsize, fpu_blocks) do { \
+ void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); \
+ const int __bsize = (bsize); \
+ struct skcipher_walk walk; \
+ int err = skcipher_walk_virt(&walk, (req), false); \
+ while (walk.nbytes > 0) { \
+ unsigned int nbytes = walk.nbytes; \
+ bool do_fpu = (fpu_blocks) != -1 && \
+ nbytes >= (fpu_blocks) * __bsize; \
+ const u8 *src = walk.src.virt.addr; \
+ u8 *dst = walk.dst.virt.addr; \
+ u8 __maybe_unused buf[(bsize)]; \
+ if (do_fpu) kernel_fpu_begin()
+
+#define CBC_WALK_START(req, bsize, fpu_blocks) \
+ ECB_WALK_START(req, bsize, fpu_blocks)
+
+#define ECB_WALK_ADVANCE(blocks) do { \
+ dst += (blocks) * __bsize; \
+ src += (blocks) * __bsize; \
+ nbytes -= (blocks) * __bsize; \
+} while (0)
+
+#define ECB_BLOCK(blocks, func) do { \
+ while (nbytes >= (blocks) * __bsize) { \
+ (func)(ctx, dst, src); \
+ ECB_WALK_ADVANCE(blocks); \
+ } \
+} while (0)
+
+#define CBC_ENC_BLOCK(func) do { \
+ const u8 *__iv = walk.iv; \
+ while (nbytes >= __bsize) { \
+ crypto_xor_cpy(dst, src, __iv, __bsize); \
+ (func)(ctx, dst, dst); \
+ __iv = dst; \
+ ECB_WALK_ADVANCE(1); \
+ } \
+ memcpy(walk.iv, __iv, __bsize); \
+} while (0)
+
+#define CBC_DEC_BLOCK(blocks, func) do { \
+ while (nbytes >= (blocks) * __bsize) { \
+ const u8 *__iv = src + ((blocks) - 1) * __bsize; \
+ if (dst == src) \
+ __iv = memcpy(buf, __iv, __bsize); \
+ (func)(ctx, dst, src); \
+ crypto_xor(dst, walk.iv, __bsize); \
+ memcpy(walk.iv, __iv, __bsize); \
+ ECB_WALK_ADVANCE(blocks); \
+ } \
+} while (0)
+
+#define ECB_WALK_END() \
+ if (do_fpu) kernel_fpu_end(); \
+ err = skcipher_walk_done(&walk, nbytes); \
+ } \
+ return err; \
+} while (0)
+
+#define CBC_WALK_END() ECB_WALK_END()
+
+#endif
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index bb9735fbb865..2bf871899920 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -14,7 +14,6 @@
*/
#include <linux/linkage.h>
-#include <asm/inst.h>
#include <asm/frame.h>
.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
@@ -51,9 +50,9 @@ SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
pxor DATA, T2
pxor SHASH, T3
- PCLMULQDQ 0x00 SHASH DATA # DATA = a0 * b0
- PCLMULQDQ 0x11 SHASH T1 # T1 = a1 * b1
- PCLMULQDQ 0x00 T3 T2 # T2 = (a1 + a0) * (b1 + b0)
+ pclmulqdq $0x00, SHASH, DATA # DATA = a0 * b0
+ pclmulqdq $0x11, SHASH, T1 # T1 = a1 * b1
+ pclmulqdq $0x00, T3, T2 # T2 = (a1 + a0) * (b1 + b0)
pxor DATA, T2
pxor T1, T2 # T2 = a0 * b1 + a1 * b0
@@ -86,7 +85,7 @@ SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
psrlq $1, T2
pxor T2, T1
pxor T1, DATA
- ret
+ RET
SYM_FUNC_END(__clmul_gf128mul_ble)
/* void clmul_ghash_mul(char *dst, const u128 *shash) */
@@ -95,12 +94,12 @@ SYM_FUNC_START(clmul_ghash_mul)
movups (%rdi), DATA
movups (%rsi), SHASH
movaps .Lbswap_mask, BSWAP
- PSHUFB_XMM BSWAP DATA
+ pshufb BSWAP, DATA
call __clmul_gf128mul_ble
- PSHUFB_XMM BSWAP DATA
+ pshufb BSWAP, DATA
movups DATA, (%rdi)
FRAME_END
- ret
+ RET
SYM_FUNC_END(clmul_ghash_mul)
/*
@@ -114,20 +113,20 @@ SYM_FUNC_START(clmul_ghash_update)
movaps .Lbswap_mask, BSWAP
movups (%rdi), DATA
movups (%rcx), SHASH
- PSHUFB_XMM BSWAP DATA
+ pshufb BSWAP, DATA
.align 4
.Lupdate_loop:
movups (%rsi), IN1
- PSHUFB_XMM BSWAP IN1
+ pshufb BSWAP, IN1
pxor IN1, DATA
call __clmul_gf128mul_ble
sub $16, %rdx
add $16, %rsi
cmp $16, %rdx
jge .Lupdate_loop
- PSHUFB_XMM BSWAP DATA
+ pshufb BSWAP, DATA
movups DATA, (%rdi)
.Lupdate_just_ret:
FRAME_END
- ret
+ RET
SYM_FUNC_END(clmul_ghash_update)
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index a4b728518e28..1f1a95f3dd0c 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -313,7 +313,7 @@ static struct ahash_alg ghash_async_alg = {
};
static const struct x86_cpu_id pcmul_cpu_id[] = {
- X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), /* Pickle-Mickle-Duck */
+ X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), /* Pickle-Mickle-Duck */
{}
};
MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S
index d08fc575ef7f..3da385271227 100644
--- a/arch/x86/crypto/glue_helper-asm-avx.S
+++ b/arch/x86/crypto/glue_helper-asm-avx.S
@@ -34,107 +34,3 @@
vpxor (5*16)(src), x6, x6; \
vpxor (6*16)(src), x7, x7; \
store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
-
-#define inc_le128(x, minus_one, tmp) \
- vpcmpeqq minus_one, x, tmp; \
- vpsubq minus_one, x, x; \
- vpslldq $8, tmp, tmp; \
- vpsubq tmp, x, x;
-
-#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
- vpcmpeqd t0, t0, t0; \
- vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
- vmovdqa bswap, t1; \
- \
- /* load IV and byteswap */ \
- vmovdqu (iv), x7; \
- vpshufb t1, x7, x0; \
- \
- /* construct IVs */ \
- inc_le128(x7, t0, t2); \
- vpshufb t1, x7, x1; \
- inc_le128(x7, t0, t2); \
- vpshufb t1, x7, x2; \
- inc_le128(x7, t0, t2); \
- vpshufb t1, x7, x3; \
- inc_le128(x7, t0, t2); \
- vpshufb t1, x7, x4; \
- inc_le128(x7, t0, t2); \
- vpshufb t1, x7, x5; \
- inc_le128(x7, t0, t2); \
- vpshufb t1, x7, x6; \
- inc_le128(x7, t0, t2); \
- vmovdqa x7, t2; \
- vpshufb t1, x7, x7; \
- inc_le128(t2, t0, t1); \
- vmovdqu t2, (iv);
-
-#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
- vpxor (0*16)(src), x0, x0; \
- vpxor (1*16)(src), x1, x1; \
- vpxor (2*16)(src), x2, x2; \
- vpxor (3*16)(src), x3, x3; \
- vpxor (4*16)(src), x4, x4; \
- vpxor (5*16)(src), x5, x5; \
- vpxor (6*16)(src), x6, x6; \
- vpxor (7*16)(src), x7, x7; \
- store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
-
-#define gf128mul_x_ble(iv, mask, tmp) \
- vpsrad $31, iv, tmp; \
- vpaddq iv, iv, iv; \
- vpshufd $0x13, tmp, tmp; \
- vpand mask, tmp, tmp; \
- vpxor tmp, iv, iv;
-
-#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \
- t1, xts_gf128mul_and_shl1_mask) \
- vmovdqa xts_gf128mul_and_shl1_mask, t0; \
- \
- /* load IV */ \
- vmovdqu (iv), tiv; \
- vpxor (0*16)(src), tiv, x0; \
- vmovdqu tiv, (0*16)(dst); \
- \
- /* construct and store IVs, also xor with source */ \
- gf128mul_x_ble(tiv, t0, t1); \
- vpxor (1*16)(src), tiv, x1; \
- vmovdqu tiv, (1*16)(dst); \
- \
- gf128mul_x_ble(tiv, t0, t1); \
- vpxor (2*16)(src), tiv, x2; \
- vmovdqu tiv, (2*16)(dst); \
- \
- gf128mul_x_ble(tiv, t0, t1); \
- vpxor (3*16)(src), tiv, x3; \
- vmovdqu tiv, (3*16)(dst); \
- \
- gf128mul_x_ble(tiv, t0, t1); \
- vpxor (4*16)(src), tiv, x4; \
- vmovdqu tiv, (4*16)(dst); \
- \
- gf128mul_x_ble(tiv, t0, t1); \
- vpxor (5*16)(src), tiv, x5; \
- vmovdqu tiv, (5*16)(dst); \
- \
- gf128mul_x_ble(tiv, t0, t1); \
- vpxor (6*16)(src), tiv, x6; \
- vmovdqu tiv, (6*16)(dst); \
- \
- gf128mul_x_ble(tiv, t0, t1); \
- vpxor (7*16)(src), tiv, x7; \
- vmovdqu tiv, (7*16)(dst); \
- \
- gf128mul_x_ble(tiv, t0, t1); \
- vmovdqu tiv, (iv);
-
-#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
- vpxor (0*16)(dst), x0, x0; \
- vpxor (1*16)(dst), x1, x1; \
- vpxor (2*16)(dst), x2, x2; \
- vpxor (3*16)(dst), x3, x3; \
- vpxor (4*16)(dst), x4, x4; \
- vpxor (5*16)(dst), x5, x5; \
- vpxor (6*16)(dst), x6, x6; \
- vpxor (7*16)(dst), x7, x7; \
- store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
diff --git a/arch/x86/crypto/glue_helper-asm-avx2.S b/arch/x86/crypto/glue_helper-asm-avx2.S
index d84508c85c13..c77e9049431f 100644
--- a/arch/x86/crypto/glue_helper-asm-avx2.S
+++ b/arch/x86/crypto/glue_helper-asm-avx2.S
@@ -37,139 +37,3 @@
vpxor (5*32+16)(src), x6, x6; \
vpxor (6*32+16)(src), x7, x7; \
store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
-
-#define inc_le128(x, minus_one, tmp) \
- vpcmpeqq minus_one, x, tmp; \
- vpsubq minus_one, x, x; \
- vpslldq $8, tmp, tmp; \
- vpsubq tmp, x, x;
-
-#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
- vpcmpeqq minus_one, x, tmp1; \
- vpcmpeqq minus_two, x, tmp2; \
- vpsubq minus_two, x, x; \
- vpor tmp2, tmp1, tmp1; \
- vpslldq $8, tmp1, tmp1; \
- vpsubq tmp1, x, x;
-
-#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \
- t1x, t2, t2x, t3, t3x, t4, t5) \
- vpcmpeqd t0, t0, t0; \
- vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \
- vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\
- \
- /* load IV and byteswap */ \
- vmovdqu (iv), t2x; \
- vmovdqa t2x, t3x; \
- inc_le128(t2x, t0x, t1x); \
- vbroadcasti128 bswap, t1; \
- vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \
- vpshufb t1, t2, x0; \
- \
- /* construct IVs */ \
- add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \
- vpshufb t1, t2, x1; \
- add2_le128(t2, t0, t4, t3, t5); \
- vpshufb t1, t2, x2; \
- add2_le128(t2, t0, t4, t3, t5); \
- vpshufb t1, t2, x3; \
- add2_le128(t2, t0, t4, t3, t5); \
- vpshufb t1, t2, x4; \
- add2_le128(t2, t0, t4, t3, t5); \
- vpshufb t1, t2, x5; \
- add2_le128(t2, t0, t4, t3, t5); \
- vpshufb t1, t2, x6; \
- add2_le128(t2, t0, t4, t3, t5); \
- vpshufb t1, t2, x7; \
- vextracti128 $1, t2, t2x; \
- inc_le128(t2x, t0x, t3x); \
- vmovdqu t2x, (iv);
-
-#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
- vpxor (0*32)(src), x0, x0; \
- vpxor (1*32)(src), x1, x1; \
- vpxor (2*32)(src), x2, x2; \
- vpxor (3*32)(src), x3, x3; \
- vpxor (4*32)(src), x4, x4; \
- vpxor (5*32)(src), x5, x5; \
- vpxor (6*32)(src), x6, x6; \
- vpxor (7*32)(src), x7, x7; \
- store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
-
-#define gf128mul_x_ble(iv, mask, tmp) \
- vpsrad $31, iv, tmp; \
- vpaddq iv, iv, iv; \
- vpshufd $0x13, tmp, tmp; \
- vpand mask, tmp, tmp; \
- vpxor tmp, iv, iv;
-
-#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
- vpsrad $31, iv, tmp0; \
- vpaddq iv, iv, tmp1; \
- vpsllq $2, iv, iv; \
- vpshufd $0x13, tmp0, tmp0; \
- vpsrad $31, tmp1, tmp1; \
- vpand mask2, tmp0, tmp0; \
- vpshufd $0x13, tmp1, tmp1; \
- vpxor tmp0, iv, iv; \
- vpand mask1, tmp1, tmp1; \
- vpxor tmp1, iv, iv;
-
-#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \
- tivx, t0, t0x, t1, t1x, t2, t2x, t3, \
- xts_gf128mul_and_shl1_mask_0, \
- xts_gf128mul_and_shl1_mask_1) \
- vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \
- \
- /* load IV and construct second IV */ \
- vmovdqu (iv), tivx; \
- vmovdqa tivx, t0x; \
- gf128mul_x_ble(tivx, t1x, t2x); \
- vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \
- vinserti128 $1, tivx, t0, tiv; \
- vpxor (0*32)(src), tiv, x0; \
- vmovdqu tiv, (0*32)(dst); \
- \
- /* construct and store IVs, also xor with source */ \
- gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
- vpxor (1*32)(src), tiv, x1; \
- vmovdqu tiv, (1*32)(dst); \
- \
- gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
- vpxor (2*32)(src), tiv, x2; \
- vmovdqu tiv, (2*32)(dst); \
- \
- gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
- vpxor (3*32)(src), tiv, x3; \
- vmovdqu tiv, (3*32)(dst); \
- \
- gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
- vpxor (4*32)(src), tiv, x4; \
- vmovdqu tiv, (4*32)(dst); \
- \
- gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
- vpxor (5*32)(src), tiv, x5; \
- vmovdqu tiv, (5*32)(dst); \
- \
- gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
- vpxor (6*32)(src), tiv, x6; \
- vmovdqu tiv, (6*32)(dst); \
- \
- gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
- vpxor (7*32)(src), tiv, x7; \
- vmovdqu tiv, (7*32)(dst); \
- \
- vextracti128 $1, tiv, tivx; \
- gf128mul_x_ble(tivx, t1x, t2x); \
- vmovdqu tivx, (iv);
-
-#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
- vpxor (0*32)(dst), x0, x0; \
- vpxor (1*32)(dst), x1, x1; \
- vpxor (2*32)(dst), x2, x2; \
- vpxor (3*32)(dst), x3, x3; \
- vpxor (4*32)(dst), x4, x4; \
- vpxor (5*32)(dst), x5, x5; \
- vpxor (6*32)(dst), x6, x6; \
- vpxor (7*32)(dst), x7, x7; \
- store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
deleted file mode 100644
index d3d91a0abf88..000000000000
--- a/arch/x86/crypto/glue_helper.c
+++ /dev/null
@@ -1,381 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Shared glue code for 128bit block ciphers
- *
- * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
- * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- * CTR part based on code (crypto/ctr.c) by:
- * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
- */
-
-#include <linux/module.h>
-#include <crypto/b128ops.h>
-#include <crypto/gf128mul.h>
-#include <crypto/internal/skcipher.h>
-#include <crypto/scatterwalk.h>
-#include <crypto/xts.h>
-#include <asm/crypto/glue_helper.h>
-
-int glue_ecb_req_128bit(const struct common_glue_ctx *gctx,
- struct skcipher_request *req)
-{
- void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
- const unsigned int bsize = 128 / 8;
- struct skcipher_walk walk;
- bool fpu_enabled = false;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes)) {
- const u8 *src = walk.src.virt.addr;
- u8 *dst = walk.dst.virt.addr;
- unsigned int func_bytes;
- unsigned int i;
-
- fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- &walk, fpu_enabled, nbytes);
- for (i = 0; i < gctx->num_funcs; i++) {
- func_bytes = bsize * gctx->funcs[i].num_blocks;
-
- if (nbytes < func_bytes)
- continue;
-
- /* Process multi-block batch */
- do {
- gctx->funcs[i].fn_u.ecb(ctx, dst, src);
- src += func_bytes;
- dst += func_bytes;
- nbytes -= func_bytes;
- } while (nbytes >= func_bytes);
-
- if (nbytes < bsize)
- break;
- }
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- glue_fpu_end(fpu_enabled);
- return err;
-}
-EXPORT_SYMBOL_GPL(glue_ecb_req_128bit);
-
-int glue_cbc_encrypt_req_128bit(const common_glue_func_t fn,
- struct skcipher_request *req)
-{
- void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
- const unsigned int bsize = 128 / 8;
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes)) {
- const u128 *src = (u128 *)walk.src.virt.addr;
- u128 *dst = (u128 *)walk.dst.virt.addr;
- u128 *iv = (u128 *)walk.iv;
-
- do {
- u128_xor(dst, src, iv);
- fn(ctx, (u8 *)dst, (u8 *)dst);
- iv = dst;
- src++;
- dst++;
- nbytes -= bsize;
- } while (nbytes >= bsize);
-
- *(u128 *)walk.iv = *iv;
- err = skcipher_walk_done(&walk, nbytes);
- }
- return err;
-}
-EXPORT_SYMBOL_GPL(glue_cbc_encrypt_req_128bit);
-
-int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx,
- struct skcipher_request *req)
-{
- void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
- const unsigned int bsize = 128 / 8;
- struct skcipher_walk walk;
- bool fpu_enabled = false;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes)) {
- const u128 *src = walk.src.virt.addr;
- u128 *dst = walk.dst.virt.addr;
- unsigned int func_bytes, num_blocks;
- unsigned int i;
- u128 last_iv;
-
- fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- &walk, fpu_enabled, nbytes);
- /* Start of the last block. */
- src += nbytes / bsize - 1;
- dst += nbytes / bsize - 1;
-
- last_iv = *src;
-
- for (i = 0; i < gctx->num_funcs; i++) {
- num_blocks = gctx->funcs[i].num_blocks;
- func_bytes = bsize * num_blocks;
-
- if (nbytes < func_bytes)
- continue;
-
- /* Process multi-block batch */
- do {
- src -= num_blocks - 1;
- dst -= num_blocks - 1;
-
- gctx->funcs[i].fn_u.cbc(ctx, (u8 *)dst,
- (const u8 *)src);
-
- nbytes -= func_bytes;
- if (nbytes < bsize)
- goto done;
-
- u128_xor(dst, dst, --src);
- dst--;
- } while (nbytes >= func_bytes);
- }
-done:
- u128_xor(dst, dst, (u128 *)walk.iv);
- *(u128 *)walk.iv = last_iv;
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- glue_fpu_end(fpu_enabled);
- return err;
-}
-EXPORT_SYMBOL_GPL(glue_cbc_decrypt_req_128bit);
-
-int glue_ctr_req_128bit(const struct common_glue_ctx *gctx,
- struct skcipher_request *req)
-{
- void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
- const unsigned int bsize = 128 / 8;
- struct skcipher_walk walk;
- bool fpu_enabled = false;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes) >= bsize) {
- const u128 *src = walk.src.virt.addr;
- u128 *dst = walk.dst.virt.addr;
- unsigned int func_bytes, num_blocks;
- unsigned int i;
- le128 ctrblk;
-
- fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- &walk, fpu_enabled, nbytes);
-
- be128_to_le128(&ctrblk, (be128 *)walk.iv);
-
- for (i = 0; i < gctx->num_funcs; i++) {
- num_blocks = gctx->funcs[i].num_blocks;
- func_bytes = bsize * num_blocks;
-
- if (nbytes < func_bytes)
- continue;
-
- /* Process multi-block batch */
- do {
- gctx->funcs[i].fn_u.ctr(ctx, (u8 *)dst,
- (const u8 *)src,
- &ctrblk);
- src += num_blocks;
- dst += num_blocks;
- nbytes -= func_bytes;
- } while (nbytes >= func_bytes);
-
- if (nbytes < bsize)
- break;
- }
-
- le128_to_be128((be128 *)walk.iv, &ctrblk);
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- glue_fpu_end(fpu_enabled);
-
- if (nbytes) {
- le128 ctrblk;
- u128 tmp;
-
- be128_to_le128(&ctrblk, (be128 *)walk.iv);
- memcpy(&tmp, walk.src.virt.addr, nbytes);
- gctx->funcs[gctx->num_funcs - 1].fn_u.ctr(ctx, (u8 *)&tmp,
- (const u8 *)&tmp,
- &ctrblk);
- memcpy(walk.dst.virt.addr, &tmp, nbytes);
- le128_to_be128((be128 *)walk.iv, &ctrblk);
-
- err = skcipher_walk_done(&walk, 0);
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(glue_ctr_req_128bit);
-
-static unsigned int __glue_xts_req_128bit(const struct common_glue_ctx *gctx,
- void *ctx,
- struct skcipher_walk *walk)
-{
- const unsigned int bsize = 128 / 8;
- unsigned int nbytes = walk->nbytes;
- u128 *src = walk->src.virt.addr;
- u128 *dst = walk->dst.virt.addr;
- unsigned int num_blocks, func_bytes;
- unsigned int i;
-
- /* Process multi-block batch */
- for (i = 0; i < gctx->num_funcs; i++) {
- num_blocks = gctx->funcs[i].num_blocks;
- func_bytes = bsize * num_blocks;
-
- if (nbytes >= func_bytes) {
- do {
- gctx->funcs[i].fn_u.xts(ctx, (u8 *)dst,
- (const u8 *)src,
- walk->iv);
-
- src += num_blocks;
- dst += num_blocks;
- nbytes -= func_bytes;
- } while (nbytes >= func_bytes);
-
- if (nbytes < bsize)
- goto done;
- }
- }
-
-done:
- return nbytes;
-}
-
-int glue_xts_req_128bit(const struct common_glue_ctx *gctx,
- struct skcipher_request *req,
- common_glue_func_t tweak_fn, void *tweak_ctx,
- void *crypt_ctx, bool decrypt)
-{
- const bool cts = (req->cryptlen % XTS_BLOCK_SIZE);
- const unsigned int bsize = 128 / 8;
- struct skcipher_request subreq;
- struct skcipher_walk walk;
- bool fpu_enabled = false;
- unsigned int nbytes, tail;
- int err;
-
- if (req->cryptlen < XTS_BLOCK_SIZE)
- return -EINVAL;
-
- if (unlikely(cts)) {
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-
- tail = req->cryptlen % XTS_BLOCK_SIZE + XTS_BLOCK_SIZE;
-
- skcipher_request_set_tfm(&subreq, tfm);
- skcipher_request_set_callback(&subreq,
- crypto_skcipher_get_flags(tfm),
- NULL, NULL);
- skcipher_request_set_crypt(&subreq, req->src, req->dst,
- req->cryptlen - tail, req->iv);
- req = &subreq;
- }
-
- err = skcipher_walk_virt(&walk, req, false);
- nbytes = walk.nbytes;
- if (err)
- return err;
-
- /* set minimum length to bsize, for tweak_fn */
- fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- &walk, fpu_enabled,
- nbytes < bsize ? bsize : nbytes);
-
- /* calculate first value of T */
- tweak_fn(tweak_ctx, walk.iv, walk.iv);
-
- while (nbytes) {
- nbytes = __glue_xts_req_128bit(gctx, crypt_ctx, &walk);
-
- err = skcipher_walk_done(&walk, nbytes);
- nbytes = walk.nbytes;
- }
-
- if (unlikely(cts)) {
- u8 *next_tweak, *final_tweak = req->iv;
- struct scatterlist *src, *dst;
- struct scatterlist s[2], d[2];
- le128 b[2];
-
- dst = src = scatterwalk_ffwd(s, req->src, req->cryptlen);
- if (req->dst != req->src)
- dst = scatterwalk_ffwd(d, req->dst, req->cryptlen);
-
- if (decrypt) {
- next_tweak = memcpy(b, req->iv, XTS_BLOCK_SIZE);
- gf128mul_x_ble(b, b);
- } else {
- next_tweak = req->iv;
- }
-
- skcipher_request_set_crypt(&subreq, src, dst, XTS_BLOCK_SIZE,
- next_tweak);
-
- err = skcipher_walk_virt(&walk, req, false) ?:
- skcipher_walk_done(&walk,
- __glue_xts_req_128bit(gctx, crypt_ctx, &walk));
- if (err)
- goto out;
-
- scatterwalk_map_and_copy(b, dst, 0, XTS_BLOCK_SIZE, 0);
- memcpy(b + 1, b, tail - XTS_BLOCK_SIZE);
- scatterwalk_map_and_copy(b, src, XTS_BLOCK_SIZE,
- tail - XTS_BLOCK_SIZE, 0);
- scatterwalk_map_and_copy(b, dst, 0, tail, 1);
-
- skcipher_request_set_crypt(&subreq, dst, dst, XTS_BLOCK_SIZE,
- final_tweak);
-
- err = skcipher_walk_virt(&walk, req, false) ?:
- skcipher_walk_done(&walk,
- __glue_xts_req_128bit(gctx, crypt_ctx, &walk));
- }
-
-out:
- glue_fpu_end(fpu_enabled);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(glue_xts_req_128bit);
-
-void glue_xts_crypt_128bit_one(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv, common_glue_func_t fn)
-{
- le128 ivblk = *iv;
-
- /* generate next IV */
- gf128mul_x_ble(iv, &ivblk);
-
- /* CC <- T xor C */
- u128_xor((u128 *)dst, (const u128 *)src, (u128 *)&ivblk);
-
- /* PP <- D(Key2,CC) */
- fn(ctx, dst, dst);
-
- /* P <- T xor PP */
- u128_xor((u128 *)dst, (u128 *)dst, (u128 *)&ivblk);
-}
-EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one);
-
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S
index b22c7b936272..6a0b15e7196a 100644
--- a/arch/x86/crypto/nh-avx2-x86_64.S
+++ b/arch/x86/crypto/nh-avx2-x86_64.S
@@ -153,5 +153,5 @@ SYM_FUNC_START(nh_avx2)
vpaddq T1, T0, T0
vpaddq T4, T0, T0
vmovdqu T0, (HASH)
- ret
+ RET
SYM_FUNC_END(nh_avx2)
diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S
index d7ae22dd6683..34c567bbcb4f 100644
--- a/arch/x86/crypto/nh-sse2-x86_64.S
+++ b/arch/x86/crypto/nh-sse2-x86_64.S
@@ -119,5 +119,5 @@ SYM_FUNC_START(nh_sse2)
paddq PASS2_SUMS, T1
movdqu T0, 0x00(HASH)
movdqu T1, 0x10(HASH)
- ret
+ RET
SYM_FUNC_END(nh_sse2)
diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c
index f7567cbd35b6..8ea5ab0f1ca7 100644
--- a/arch/x86/crypto/nhpoly1305-avx2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c
@@ -10,6 +10,7 @@
#include <crypto/internal/simd.h>
#include <crypto/nhpoly1305.h>
#include <linux/module.h>
+#include <linux/sizes.h>
#include <asm/simd.h>
asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
@@ -29,7 +30,7 @@ static int nhpoly1305_avx2_update(struct shash_desc *desc,
return crypto_nhpoly1305_update(desc, src, srclen);
do {
- unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+ unsigned int n = min_t(unsigned int, srclen, SZ_4K);
kernel_fpu_begin();
crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2);
diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c
index a661ede3b5cf..2b353d42ed13 100644
--- a/arch/x86/crypto/nhpoly1305-sse2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c
@@ -10,6 +10,7 @@
#include <crypto/internal/simd.h>
#include <crypto/nhpoly1305.h>
#include <linux/module.h>
+#include <linux/sizes.h>
#include <asm/simd.h>
asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
@@ -29,7 +30,7 @@ static int nhpoly1305_sse2_update(struct shash_desc *desc,
return crypto_nhpoly1305_update(desc, src, srclen);
do {
- unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+ unsigned int n = min_t(unsigned int, srclen, SZ_4K);
kernel_fpu_begin();
crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2);
diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
index 7a6b5380a46f..2077ce7a5647 100644
--- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
+++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
@@ -246,12 +246,12 @@ $code.=<<___ if (!$kernel);
___
&declare_function("poly1305_init_x86_64", 32, 3);
$code.=<<___;
- xor %rax,%rax
+ xor %eax,%eax
mov %rax,0($ctx) # initialize hash value
mov %rax,8($ctx)
mov %rax,16($ctx)
- cmp \$0,$inp
+ test $inp,$inp
je .Lno_key
___
$code.=<<___ if (!$kernel);
@@ -297,7 +297,7 @@ ___
$code.=<<___;
mov \$1,%eax
.Lno_key:
- ret
+ RET
___
&end_function("poly1305_init_x86_64");
@@ -373,7 +373,7 @@ $code.=<<___;
.cfi_adjust_cfa_offset -48
.Lno_data:
.Lblocks_epilogue:
- ret
+ RET
.cfi_endproc
___
&end_function("poly1305_blocks_x86_64");
@@ -399,15 +399,11 @@ $code.=<<___;
mov %rax,0($mac) # write result
mov %rcx,8($mac)
- ret
+ RET
___
&end_function("poly1305_emit_x86_64");
if ($avx) {
-if($kernel) {
- $code .= "#ifdef CONFIG_AS_AVX\n";
-}
-
########################################################################
# Layout of opaque area is following.
#
@@ -433,7 +429,7 @@ ___
&poly1305_iteration();
$code.=<<___;
pop $ctx
- ret
+ RET
.size __poly1305_block,.-__poly1305_block
.type __poly1305_init_avx,\@abi-omnipotent
@@ -598,7 +594,7 @@ __poly1305_init_avx:
lea -48-64($ctx),$ctx # size [de-]optimization
pop %rbp
- ret
+ RET
.size __poly1305_init_avx,.-__poly1305_init_avx
___
@@ -751,7 +747,7 @@ $code.=<<___;
.cfi_restore %rbp
.Lno_data_avx:
.Lblocks_avx_epilogue:
- ret
+ RET
.cfi_endproc
.align 32
@@ -1456,7 +1452,7 @@ $code.=<<___ if (!$win64);
___
$code.=<<___;
vzeroupper
- ret
+ RET
.cfi_endproc
___
&end_function("poly1305_blocks_avx");
@@ -1512,20 +1508,12 @@ $code.=<<___;
mov %rax,0($mac) # write result
mov %rcx,8($mac)
- ret
+ RET
___
&end_function("poly1305_emit_avx");
-if ($kernel) {
- $code .= "#endif\n";
-}
-
if ($avx>1) {
-if ($kernel) {
- $code .= "#ifdef CONFIG_AS_AVX2\n";
-}
-
my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
map("%ymm$_",(0..15));
my $S4=$MASK;
@@ -1687,7 +1675,7 @@ $code.=<<___;
.cfi_restore %rbp
.Lno_data_avx2$suffix:
.Lblocks_avx2_epilogue$suffix:
- ret
+ RET
.cfi_endproc
.align 32
@@ -2213,7 +2201,7 @@ $code.=<<___ if (!$win64);
___
$code.=<<___;
vzeroupper
- ret
+ RET
.cfi_endproc
___
if($avx > 2 && $avx512) {
@@ -2804,7 +2792,7 @@ $code.=<<___ if (!$win64);
.cfi_def_cfa_register %rsp
___
$code.=<<___;
- ret
+ RET
.cfi_endproc
___
@@ -2816,10 +2804,6 @@ ___
poly1305_blocks_avxN(0);
&end_function("poly1305_blocks_avx2");
-if($kernel) {
- $code .= "#endif\n";
-}
-
#######################################################################
if ($avx>2) {
# On entry we have input length divisible by 64. But since inner loop
@@ -2869,7 +2853,7 @@ $code.=<<___;
.type poly1305_init_base2_44,\@function,3
.align 32
poly1305_init_base2_44:
- xor %rax,%rax
+ xor %eax,%eax
mov %rax,0($ctx) # initialize hash value
mov %rax,8($ctx)
mov %rax,16($ctx)
@@ -2909,7 +2893,7 @@ $code.=<<___ if ($flavour =~ /elf32/);
___
$code.=<<___;
mov \$1,%eax
- ret
+ RET
.size poly1305_init_base2_44,.-poly1305_init_base2_44
___
{
@@ -3026,7 +3010,7 @@ poly1305_blocks_vpmadd52:
jnz .Lblocks_vpmadd52_4x
.Lno_data_vpmadd52:
- ret
+ RET
.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
___
}
@@ -3467,7 +3451,7 @@ poly1305_blocks_vpmadd52_4x:
vzeroall
.Lno_data_vpmadd52_4x:
- ret
+ RET
.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
___
}
@@ -3840,7 +3824,7 @@ $code.=<<___;
vzeroall
.Lno_data_vpmadd52_8x:
- ret
+ RET
.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
___
}
@@ -3877,7 +3861,7 @@ poly1305_emit_base2_44:
mov %rax,0($mac) # write result
mov %rcx,8($mac)
- ret
+ RET
.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
___
} } }
@@ -3932,7 +3916,7 @@ xor128_encrypt_n_pad:
.Ldone_enc:
mov $otp,%rax
- ret
+ RET
.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
.globl xor128_decrypt_n_pad
@@ -3963,7 +3947,7 @@ xor128_decrypt_n_pad:
mov \$16,$len
sub %r10,$len
xor %eax,%eax
- xor %r11,%r11
+ xor %r11d,%r11d
.Loop_dec_byte:
mov ($inp,$otp),%r11b
mov ($otp),%al
@@ -3983,7 +3967,7 @@ xor128_decrypt_n_pad:
.Ldone_dec:
mov $otp,%rax
- ret
+ RET
.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
___
}
@@ -4101,7 +4085,7 @@ avx_handler:
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
- xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
@@ -4125,7 +4109,7 @@ avx_handler:
pop %rbx
pop %rdi
pop %rsi
- ret
+ RET
.size avx_handler,.-avx_handler
.section .pdata
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index 79bb58737d52..1dfb8af48a3c 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -11,11 +11,12 @@
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/sizes.h>
#include <asm/intel-family.h>
#include <asm/simd.h>
asmlinkage void poly1305_init_x86_64(void *ctx,
- const u8 key[POLY1305_KEY_SIZE]);
+ const u8 key[POLY1305_BLOCK_SIZE]);
asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
const size_t len, const u32 padbit);
asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
@@ -80,7 +81,7 @@ static void convert_to_base2_64(void *ctx)
state->is_base2_26 = 0;
}
-static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE])
+static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE])
{
poly1305_init_x86_64(ctx, key);
}
@@ -91,10 +92,10 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
struct poly1305_arch_internal *state = ctx;
/* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
- PAGE_SIZE % POLY1305_BLOCK_SIZE);
+ BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
+ SZ_4K % POLY1305_BLOCK_SIZE);
- if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
+ if (!static_branch_likely(&poly1305_use_avx) ||
(len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
!crypto_simd_usable()) {
convert_to_base2_64(ctx);
@@ -102,34 +103,33 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
return;
}
- for (;;) {
- const size_t bytes = min_t(size_t, len, PAGE_SIZE);
+ do {
+ const size_t bytes = min_t(size_t, len, SZ_4K);
kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
poly1305_blocks_avx512(ctx, inp, bytes, padbit);
- else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2))
+ else if (static_branch_likely(&poly1305_use_avx2))
poly1305_blocks_avx2(ctx, inp, bytes, padbit);
else
poly1305_blocks_avx(ctx, inp, bytes, padbit);
kernel_fpu_end();
+
len -= bytes;
- if (!len)
- break;
inp += bytes;
- }
+ } while (len);
}
static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
const u32 nonce[4])
{
- if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx))
+ if (!static_branch_likely(&poly1305_use_avx))
poly1305_emit_x86_64(ctx, mac, nonce);
else
poly1305_emit_avx(ctx, mac, nonce);
}
-void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
{
poly1305_simd_init(&dctx->h, key);
dctx->s[0] = get_unaligned_le32(&key[16]);
@@ -158,8 +158,6 @@ static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
dctx->s[1] = get_unaligned_le32(&inp[4]);
dctx->s[2] = get_unaligned_le32(&inp[8]);
dctx->s[3] = get_unaligned_le32(&inp[12]);
- inp += POLY1305_BLOCK_SIZE;
- len -= POLY1305_BLOCK_SIZE;
acc += POLY1305_BLOCK_SIZE;
dctx->sset = true;
}
@@ -212,7 +210,7 @@ void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
}
poly1305_simd_emit(&dctx->h, dst, dctx->s);
- *dctx = (struct poly1305_desc_ctx){};
+ memzero_explicit(dctx, sizeof(*dctx));
}
EXPORT_SYMBOL(poly1305_final_arch);
@@ -261,11 +259,10 @@ static struct shash_alg alg = {
static int __init poly1305_simd_mod_init(void)
{
- if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) &&
+ if (boot_cpu_has(X86_FEATURE_AVX) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
static_branch_enable(&poly1305_use_avx);
- if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
+ if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
static_branch_enable(&poly1305_use_avx2);
if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) &&
diff --git a/arch/x86/crypto/polyval-clmulni_asm.S b/arch/x86/crypto/polyval-clmulni_asm.S
new file mode 100644
index 000000000000..a6ebe4e7dd2b
--- /dev/null
+++ b/arch/x86/crypto/polyval-clmulni_asm.S
@@ -0,0 +1,321 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2021 Google LLC
+ */
+/*
+ * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI
+ * instructions. It works on 8 blocks at a time, by precomputing the first 8
+ * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
+ * allows us to split finite field multiplication into two steps.
+ *
+ * In the first step, we consider h^i, m_i as normal polynomials of degree less
+ * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
+ * is simply polynomial multiplication.
+ *
+ * In the second step, we compute the reduction of p(x) modulo the finite field
+ * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
+ * multiplication is finite field multiplication. The advantage is that the
+ * two-step process only requires 1 finite field reduction for every 8
+ * polynomial multiplications. Further parallelism is gained by interleaving the
+ * multiplications and polynomial reductions.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STRIDE_BLOCKS 8
+
+#define GSTAR %xmm7
+#define PL %xmm8
+#define PH %xmm9
+#define TMP_XMM %xmm11
+#define LO %xmm12
+#define HI %xmm13
+#define MI %xmm14
+#define SUM %xmm15
+
+#define KEY_POWERS %rdi
+#define MSG %rsi
+#define BLOCKS_LEFT %rdx
+#define ACCUMULATOR %rcx
+#define TMP %rax
+
+.section .rodata.cst16.gstar, "aM", @progbits, 16
+.align 16
+
+.Lgstar:
+ .quad 0xc200000000000000, 0xc200000000000000
+
+.text
+
+/*
+ * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
+ * count pointed to by MSG and KEY_POWERS.
+ */
+.macro schoolbook1 count
+ .set i, 0
+ .rept (\count)
+ schoolbook1_iteration i 0
+ .set i, (i +1)
+ .endr
+.endm
+
+/*
+ * Computes the product of two 128-bit polynomials at the memory locations
+ * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
+ * the 256-bit product into LO, MI, HI.
+ *
+ * Given:
+ * X = [X_1 : X_0]
+ * Y = [Y_1 : Y_0]
+ *
+ * We compute:
+ * LO += X_0 * Y_0
+ * MI += X_0 * Y_1 + X_1 * Y_0
+ * HI += X_1 * Y_1
+ *
+ * Later, the 256-bit result can be extracted as:
+ * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ * This step is done when computing the polynomial reduction for efficiency
+ * reasons.
+ *
+ * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an
+ * extra multiplication of SUM and h^8.
+ */
+.macro schoolbook1_iteration i xor_sum
+ movups (16*\i)(MSG), %xmm0
+ .if (\i == 0 && \xor_sum == 1)
+ pxor SUM, %xmm0
+ .endif
+ vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
+ vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
+ vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
+ vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
+ vpxor %xmm2, MI, MI
+ vpxor %xmm1, LO, LO
+ vpxor %xmm4, HI, HI
+ vpxor %xmm3, MI, MI
+.endm
+
+/*
+ * Performs the same computation as schoolbook1_iteration, except we expect the
+ * arguments to already be loaded into xmm0 and xmm1 and we set the result
+ * registers LO, MI, and HI directly rather than XOR'ing into them.
+ */
+.macro schoolbook1_noload
+ vpclmulqdq $0x01, %xmm0, %xmm1, MI
+ vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
+ vpclmulqdq $0x00, %xmm0, %xmm1, LO
+ vpclmulqdq $0x11, %xmm0, %xmm1, HI
+ vpxor %xmm2, MI, MI
+.endm
+
+/*
+ * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
+ * the result in PL, PH.
+ * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ */
+.macro schoolbook2
+ vpslldq $8, MI, PL
+ vpsrldq $8, MI, PH
+ pxor LO, PL
+ pxor HI, PH
+.endm
+
+/*
+ * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
+ *
+ * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
+ * x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
+ * product of two 128-bit polynomials in Montgomery form. We need to reduce it
+ * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
+ * of x^128, this product has two extra factors of x^128. To get it back into
+ * Montgomery form, we need to remove one of these factors by dividing by x^128.
+ *
+ * To accomplish both of these goals, we add multiples of g(x) that cancel out
+ * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
+ * bits are zero, the polynomial division by x^128 can be done by right shifting.
+ *
+ * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
+ * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
+ * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
+ * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
+ * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
+ * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
+ *
+ * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
+ * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
+ * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
+ * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
+ * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
+ *
+ * So our final computation is:
+ * T = T_1 : T_0 = g*(x) * P_0
+ * V = V_1 : V_0 = g*(x) * (P_1 + T_0)
+ * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
+ *
+ * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
+ * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
+ * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
+ */
+.macro montgomery_reduction dest
+ vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x)
+ pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1
+ pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1
+ pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
+ pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
+ vpxor TMP_XMM, PH, \dest
+.endm
+
+/*
+ * Compute schoolbook multiplication for 8 blocks
+ * m_0h^8 + ... + m_7h^1
+ *
+ * If reduce is set, also computes the montgomery reduction of the
+ * previous full_stride call and XORs with the first message block.
+ * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
+ * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
+ */
+.macro full_stride reduce
+ pxor LO, LO
+ pxor HI, HI
+ pxor MI, MI
+
+ schoolbook1_iteration 7 0
+ .if \reduce
+ vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 6 0
+ .if \reduce
+ pshufd $0b01001110, TMP_XMM, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 5 0
+ .if \reduce
+ pxor PL, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 4 0
+ .if \reduce
+ pxor TMP_XMM, PH
+ .endif
+
+ schoolbook1_iteration 3 0
+ .if \reduce
+ pclmulqdq $0x11, GSTAR, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 2 0
+ .if \reduce
+ vpxor TMP_XMM, PH, SUM
+ .endif
+
+ schoolbook1_iteration 1 0
+
+ schoolbook1_iteration 0 1
+
+ addq $(8*16), MSG
+ schoolbook2
+.endm
+
+/*
+ * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
+ */
+.macro partial_stride
+ mov BLOCKS_LEFT, TMP
+ shlq $4, TMP
+ addq $(16*STRIDE_BLOCKS), KEY_POWERS
+ subq TMP, KEY_POWERS
+
+ movups (MSG), %xmm0
+ pxor SUM, %xmm0
+ movaps (KEY_POWERS), %xmm1
+ schoolbook1_noload
+ dec BLOCKS_LEFT
+ addq $16, MSG
+ addq $16, KEY_POWERS
+
+ test $4, BLOCKS_LEFT
+ jz .Lpartial4BlocksDone
+ schoolbook1 4
+ addq $(4*16), MSG
+ addq $(4*16), KEY_POWERS
+.Lpartial4BlocksDone:
+ test $2, BLOCKS_LEFT
+ jz .Lpartial2BlocksDone
+ schoolbook1 2
+ addq $(2*16), MSG
+ addq $(2*16), KEY_POWERS
+.Lpartial2BlocksDone:
+ test $1, BLOCKS_LEFT
+ jz .LpartialDone
+ schoolbook1 1
+.LpartialDone:
+ schoolbook2
+ montgomery_reduction SUM
+.endm
+
+/*
+ * Perform montgomery multiplication in GF(2^128) and store result in op1.
+ *
+ * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
+ * If op1, op2 are in montgomery form, this computes the montgomery
+ * form of op1*op2.
+ *
+ * void clmul_polyval_mul(u8 *op1, const u8 *op2);
+ */
+SYM_FUNC_START(clmul_polyval_mul)
+ FRAME_BEGIN
+ vmovdqa .Lgstar(%rip), GSTAR
+ movups (%rdi), %xmm0
+ movups (%rsi), %xmm1
+ schoolbook1_noload
+ schoolbook2
+ montgomery_reduction SUM
+ movups SUM, (%rdi)
+ FRAME_END
+ RET
+SYM_FUNC_END(clmul_polyval_mul)
+
+/*
+ * Perform polynomial evaluation as specified by POLYVAL. This computes:
+ * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
+ * where n=nblocks, h is the hash key, and m_i are the message blocks.
+ *
+ * rdi - pointer to precomputed key powers h^8 ... h^1
+ * rsi - pointer to message blocks
+ * rdx - number of blocks to hash
+ * rcx - pointer to the accumulator
+ *
+ * void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
+ * const u8 *in, size_t nblocks, u8 *accumulator);
+ */
+SYM_FUNC_START(clmul_polyval_update)
+ FRAME_BEGIN
+ vmovdqa .Lgstar(%rip), GSTAR
+ movups (ACCUMULATOR), SUM
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ js .LstrideLoopExit
+ full_stride 0
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ js .LstrideLoopExitReduce
+.LstrideLoop:
+ full_stride 1
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ jns .LstrideLoop
+.LstrideLoopExitReduce:
+ montgomery_reduction SUM
+.LstrideLoopExit:
+ add $STRIDE_BLOCKS, BLOCKS_LEFT
+ jz .LskipPartial
+ partial_stride
+.LskipPartial:
+ movups SUM, (ACCUMULATOR)
+ FRAME_END
+ RET
+SYM_FUNC_END(clmul_polyval_update)
diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c
new file mode 100644
index 000000000000..8fa58b0f3cb3
--- /dev/null
+++ b/arch/x86/crypto/polyval-clmulni_glue.c
@@ -0,0 +1,212 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Glue code for POLYVAL using PCMULQDQ-NI
+ *
+ * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
+ * Copyright (c) 2009 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ * Copyright 2021 Google LLC
+ */
+
+/*
+ * Glue code based on ghash-clmulni-intel_glue.c.
+ *
+ * This implementation of POLYVAL uses montgomery multiplication
+ * accelerated by PCLMULQDQ-NI to implement the finite field
+ * operations.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/polyval.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
+
+#define POLYVAL_ALIGN 16
+#define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN)
+#define POLYVAL_ALIGN_EXTRA ((POLYVAL_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
+#define POLYVAL_CTX_SIZE (sizeof(struct polyval_tfm_ctx) + POLYVAL_ALIGN_EXTRA)
+#define NUM_KEY_POWERS 8
+
+struct polyval_tfm_ctx {
+ /*
+ * These powers must be in the order h^8, ..., h^1.
+ */
+ u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE] POLYVAL_ALIGN_ATTR;
+};
+
+struct polyval_desc_ctx {
+ u8 buffer[POLYVAL_BLOCK_SIZE];
+ u32 bytes;
+};
+
+asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
+ const u8 *in, size_t nblocks, u8 *accumulator);
+asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2);
+
+static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm)
+{
+ return PTR_ALIGN(crypto_shash_ctx(tfm), POLYVAL_ALIGN);
+}
+
+static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
+ const u8 *in, size_t nblocks, u8 *accumulator)
+{
+ if (likely(crypto_simd_usable())) {
+ kernel_fpu_begin();
+ clmul_polyval_update(keys, in, nblocks, accumulator);
+ kernel_fpu_end();
+ } else {
+ polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in,
+ nblocks, accumulator);
+ }
+}
+
+static void internal_polyval_mul(u8 *op1, const u8 *op2)
+{
+ if (likely(crypto_simd_usable())) {
+ kernel_fpu_begin();
+ clmul_polyval_mul(op1, op2);
+ kernel_fpu_end();
+ } else {
+ polyval_mul_non4k(op1, op2);
+ }
+}
+
+static int polyval_x86_setkey(struct crypto_shash *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(tfm);
+ int i;
+
+ if (keylen != POLYVAL_BLOCK_SIZE)
+ return -EINVAL;
+
+ memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE);
+
+ for (i = NUM_KEY_POWERS-2; i >= 0; i--) {
+ memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE);
+ internal_polyval_mul(tctx->key_powers[i],
+ tctx->key_powers[i+1]);
+ }
+
+ return 0;
+}
+
+static int polyval_x86_init(struct shash_desc *desc)
+{
+ struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ memset(dctx, 0, sizeof(*dctx));
+
+ return 0;
+}
+
+static int polyval_x86_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+ const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
+ u8 *pos;
+ unsigned int nblocks;
+ unsigned int n;
+
+ if (dctx->bytes) {
+ n = min(srclen, dctx->bytes);
+ pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes;
+
+ dctx->bytes -= n;
+ srclen -= n;
+
+ while (n--)
+ *pos++ ^= *src++;
+
+ if (!dctx->bytes)
+ internal_polyval_mul(dctx->buffer,
+ tctx->key_powers[NUM_KEY_POWERS-1]);
+ }
+
+ while (srclen >= POLYVAL_BLOCK_SIZE) {
+ /* Allow rescheduling every 4K bytes. */
+ nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
+ internal_polyval_update(tctx, src, nblocks, dctx->buffer);
+ srclen -= nblocks * POLYVAL_BLOCK_SIZE;
+ src += nblocks * POLYVAL_BLOCK_SIZE;
+ }
+
+ if (srclen) {
+ dctx->bytes = POLYVAL_BLOCK_SIZE - srclen;
+ pos = dctx->buffer;
+ while (srclen--)
+ *pos++ ^= *src++;
+ }
+
+ return 0;
+}
+
+static int polyval_x86_final(struct shash_desc *desc, u8 *dst)
+{
+ struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+ const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
+
+ if (dctx->bytes) {
+ internal_polyval_mul(dctx->buffer,
+ tctx->key_powers[NUM_KEY_POWERS-1]);
+ }
+
+ memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE);
+
+ return 0;
+}
+
+static struct shash_alg polyval_alg = {
+ .digestsize = POLYVAL_DIGEST_SIZE,
+ .init = polyval_x86_init,
+ .update = polyval_x86_update,
+ .final = polyval_x86_final,
+ .setkey = polyval_x86_setkey,
+ .descsize = sizeof(struct polyval_desc_ctx),
+ .base = {
+ .cra_name = "polyval",
+ .cra_driver_name = "polyval-clmulni",
+ .cra_priority = 200,
+ .cra_blocksize = POLYVAL_BLOCK_SIZE,
+ .cra_ctxsize = POLYVAL_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+};
+
+__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
+
+static int __init polyval_clmulni_mod_init(void)
+{
+ if (!x86_match_cpu(pcmul_cpu_id))
+ return -ENODEV;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX))
+ return -ENODEV;
+
+ return crypto_register_shash(&polyval_alg);
+}
+
+static void __exit polyval_clmulni_mod_exit(void)
+{
+ crypto_unregister_shash(&polyval_alg);
+}
+
+module_init(polyval_clmulni_mod_init);
+module_exit(polyval_clmulni_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI");
+MODULE_ALIAS_CRYPTO("polyval");
+MODULE_ALIAS_CRYPTO("polyval-clmulni");
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index ba9e4c1e7f5c..82f2313f512b 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -18,10 +18,6 @@
.align 16
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
-.align 16
-.Lxts_gf128mul_and_shl1_mask:
- .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.text
@@ -605,7 +601,7 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx)
write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_enc_blk8_avx)
.align 8
@@ -659,7 +655,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx)
write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_dec_blk8_avx)
SYM_FUNC_START(serpent_ecb_enc_8way_avx)
@@ -677,7 +673,7 @@ SYM_FUNC_START(serpent_ecb_enc_8way_avx)
store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_ecb_enc_8way_avx)
SYM_FUNC_START(serpent_ecb_dec_8way_avx)
@@ -695,7 +691,7 @@ SYM_FUNC_START(serpent_ecb_dec_8way_avx)
store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_ecb_dec_8way_avx)
SYM_FUNC_START(serpent_cbc_dec_8way_avx)
@@ -713,69 +709,5 @@ SYM_FUNC_START(serpent_cbc_dec_8way_avx)
store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_cbc_dec_8way_avx)
-
-SYM_FUNC_START(serpent_ctr_8way_avx)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (little endian, 128bit)
- */
- FRAME_BEGIN
-
- load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
- RD2, RK0, RK1, RK2);
-
- call __serpent_enc_blk8_avx;
-
- store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
-
- FRAME_END
- ret;
-SYM_FUNC_END(serpent_ctr_8way_avx)
-
-SYM_FUNC_START(serpent_xts_enc_8way_avx)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- FRAME_BEGIN
-
- /* regs <= src, dst <= IVs, regs <= regs xor IVs */
- load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
- RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
-
- call __serpent_enc_blk8_avx;
-
- /* dst <= regs xor IVs(in dst) */
- store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
-
- FRAME_END
- ret;
-SYM_FUNC_END(serpent_xts_enc_8way_avx)
-
-SYM_FUNC_START(serpent_xts_dec_8way_avx)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- FRAME_BEGIN
-
- /* regs <= src, dst <= IVs, regs <= regs xor IVs */
- load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
- RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
-
- call __serpent_dec_blk8_avx;
-
- /* dst <= regs xor IVs(in dst) */
- store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
-
- FRAME_END
- ret;
-SYM_FUNC_END(serpent_xts_dec_8way_avx)
diff --git a/arch/x86/crypto/serpent-avx.h b/arch/x86/crypto/serpent-avx.h
new file mode 100644
index 000000000000..23f3361a0e72
--- /dev/null
+++ b/arch/x86/crypto/serpent-avx.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_SERPENT_AVX_H
+#define ASM_X86_SERPENT_AVX_H
+
+#include <crypto/b128ops.h>
+#include <crypto/serpent.h>
+#include <linux/types.h>
+
+struct crypto_skcipher;
+
+#define SERPENT_PARALLEL_BLOCKS 8
+
+asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst,
+ const u8 *src);
+
+asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst,
+ const u8 *src);
+
+#endif
diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S
index c9648aeae705..8ea34c9b9316 100644
--- a/arch/x86/crypto/serpent-avx2-asm_64.S
+++ b/arch/x86/crypto/serpent-avx2-asm_64.S
@@ -20,16 +20,6 @@
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-.section .rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16
-.align 16
-.Lxts_gf128mul_and_shl1_mask_0:
- .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
-
-.section .rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16
-.align 16
-.Lxts_gf128mul_and_shl1_mask_1:
- .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
-
.text
#define CTX %rdi
@@ -611,7 +601,7 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk16)
write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_enc_blk16)
.align 8
@@ -665,7 +655,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk16)
write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_dec_blk16)
SYM_FUNC_START(serpent_ecb_enc_16way)
@@ -687,7 +677,7 @@ SYM_FUNC_START(serpent_ecb_enc_16way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_ecb_enc_16way)
SYM_FUNC_START(serpent_ecb_dec_16way)
@@ -709,7 +699,7 @@ SYM_FUNC_START(serpent_ecb_dec_16way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_ecb_dec_16way)
SYM_FUNC_START(serpent_cbc_dec_16way)
@@ -732,82 +722,5 @@ SYM_FUNC_START(serpent_cbc_dec_16way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_cbc_dec_16way)
-
-SYM_FUNC_START(serpent_ctr_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv (little endian, 128bit)
- */
- FRAME_BEGIN
-
- vzeroupper;
-
- load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
- RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
- tp);
-
- call __serpent_enc_blk16;
-
- store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
-
- vzeroupper;
-
- FRAME_END
- ret;
-SYM_FUNC_END(serpent_ctr_16way)
-
-SYM_FUNC_START(serpent_xts_enc_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- FRAME_BEGIN
-
- vzeroupper;
-
- load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
- RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
- .Lxts_gf128mul_and_shl1_mask_0,
- .Lxts_gf128mul_and_shl1_mask_1);
-
- call __serpent_enc_blk16;
-
- store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
-
- vzeroupper;
-
- FRAME_END
- ret;
-SYM_FUNC_END(serpent_xts_enc_16way)
-
-SYM_FUNC_START(serpent_xts_dec_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- FRAME_BEGIN
-
- vzeroupper;
-
- load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
- RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
- .Lxts_gf128mul_and_shl1_mask_0,
- .Lxts_gf128mul_and_shl1_mask_1);
-
- call __serpent_dec_blk16;
-
- store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
-
- vzeroupper;
-
- FRAME_END
- ret;
-SYM_FUNC_END(serpent_xts_dec_16way)
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
index 6379b99cb722..8ccb03ad7cef 100644
--- a/arch/x86/crypto/serpent-sse2-i586-asm_32.S
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -553,12 +553,12 @@ SYM_FUNC_START(__serpent_enc_blk_4way)
write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
- ret;
+ RET;
.L__enc_xor4:
xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
- ret;
+ RET;
SYM_FUNC_END(__serpent_enc_blk_4way)
SYM_FUNC_START(serpent_dec_blk_4way)
@@ -612,5 +612,5 @@ SYM_FUNC_START(serpent_dec_blk_4way)
movl arg_dst(%esp), %eax;
write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
- ret;
+ RET;
SYM_FUNC_END(serpent_dec_blk_4way)
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
index efb6dc17dc90..e0998a011d1d 100644
--- a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -675,13 +675,13 @@ SYM_FUNC_START(__serpent_enc_blk_8way)
write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
- ret;
+ RET;
.L__enc_xor8:
xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_enc_blk_8way)
SYM_FUNC_START(serpent_dec_blk_8way)
@@ -735,5 +735,5 @@ SYM_FUNC_START(serpent_dec_blk_8way)
write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(serpent_dec_blk_8way)
diff --git a/arch/x86/crypto/serpent-sse2.h b/arch/x86/crypto/serpent-sse2.h
new file mode 100644
index 000000000000..860ca248914b
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_SERPENT_SSE2_H
+#define ASM_X86_SERPENT_SSE2_H
+
+#include <linux/crypto.h>
+#include <crypto/serpent.h>
+
+#ifdef CONFIG_X86_32
+
+#define SERPENT_PARALLEL_BLOCKS 4
+
+asmlinkage void __serpent_enc_blk_4way(const struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, bool xor);
+asmlinkage void serpent_dec_blk_4way(const struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+static inline void serpent_enc_blk_xway(const void *ctx, u8 *dst, const u8 *src)
+{
+ __serpent_enc_blk_4way(ctx, dst, src, false);
+}
+
+static inline void serpent_enc_blk_xway_xor(const struct serpent_ctx *ctx,
+ u8 *dst, const u8 *src)
+{
+ __serpent_enc_blk_4way(ctx, dst, src, true);
+}
+
+static inline void serpent_dec_blk_xway(const void *ctx, u8 *dst, const u8 *src)
+{
+ serpent_dec_blk_4way(ctx, dst, src);
+}
+
+#else
+
+#define SERPENT_PARALLEL_BLOCKS 8
+
+asmlinkage void __serpent_enc_blk_8way(const struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, bool xor);
+asmlinkage void serpent_dec_blk_8way(const struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+static inline void serpent_enc_blk_xway(const void *ctx, u8 *dst, const u8 *src)
+{
+ __serpent_enc_blk_8way(ctx, dst, src, false);
+}
+
+static inline void serpent_enc_blk_xway_xor(const struct serpent_ctx *ctx,
+ u8 *dst, const u8 *src)
+{
+ __serpent_enc_blk_8way(ctx, dst, src, true);
+}
+
+static inline void serpent_dec_blk_xway(const void *ctx, u8 *dst, const u8 *src)
+{
+ serpent_dec_blk_8way(ctx, dst, src);
+}
+
+#endif
+
+#endif
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index f973ace44ad3..347e97f4b713 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -12,9 +12,9 @@
#include <crypto/algapi.h>
#include <crypto/internal/simd.h>
#include <crypto/serpent.h>
-#include <crypto/xts.h>
-#include <asm/crypto/glue_helper.h>
-#include <asm/crypto/serpent-avx.h>
+
+#include "serpent-avx.h"
+#include "ecb_cbc_helpers.h"
#define SERPENT_AVX2_PARALLEL_BLOCKS 16
@@ -23,158 +23,44 @@ asmlinkage void serpent_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void serpent_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void serpent_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src);
-asmlinkage void serpent_ctr_16way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-asmlinkage void serpent_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-asmlinkage void serpent_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-
static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
const u8 *key, unsigned int keylen)
{
return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen);
}
-static const struct common_glue_ctx serpent_enc = {
- .num_funcs = 3,
- .fpu_blocks_limit = 8,
-
- .funcs = { {
- .num_blocks = 16,
- .fn_u = { .ecb = serpent_ecb_enc_16way }
- }, {
- .num_blocks = 8,
- .fn_u = { .ecb = serpent_ecb_enc_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = __serpent_encrypt }
- } }
-};
-
-static const struct common_glue_ctx serpent_ctr = {
- .num_funcs = 3,
- .fpu_blocks_limit = 8,
-
- .funcs = { {
- .num_blocks = 16,
- .fn_u = { .ctr = serpent_ctr_16way }
- }, {
- .num_blocks = 8,
- .fn_u = { .ctr = serpent_ctr_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = __serpent_crypt_ctr }
- } }
-};
-
-static const struct common_glue_ctx serpent_enc_xts = {
- .num_funcs = 3,
- .fpu_blocks_limit = 8,
-
- .funcs = { {
- .num_blocks = 16,
- .fn_u = { .xts = serpent_xts_enc_16way }
- }, {
- .num_blocks = 8,
- .fn_u = { .xts = serpent_xts_enc_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = serpent_xts_enc }
- } }
-};
-
-static const struct common_glue_ctx serpent_dec = {
- .num_funcs = 3,
- .fpu_blocks_limit = 8,
-
- .funcs = { {
- .num_blocks = 16,
- .fn_u = { .ecb = serpent_ecb_dec_16way }
- }, {
- .num_blocks = 8,
- .fn_u = { .ecb = serpent_ecb_dec_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = __serpent_decrypt }
- } }
-};
-
-static const struct common_glue_ctx serpent_dec_cbc = {
- .num_funcs = 3,
- .fpu_blocks_limit = 8,
-
- .funcs = { {
- .num_blocks = 16,
- .fn_u = { .cbc = serpent_cbc_dec_16way }
- }, {
- .num_blocks = 8,
- .fn_u = { .cbc = serpent_cbc_dec_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = __serpent_decrypt }
- } }
-};
-
-static const struct common_glue_ctx serpent_dec_xts = {
- .num_funcs = 3,
- .fpu_blocks_limit = 8,
-
- .funcs = { {
- .num_blocks = 16,
- .fn_u = { .xts = serpent_xts_dec_16way }
- }, {
- .num_blocks = 8,
- .fn_u = { .xts = serpent_xts_dec_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = serpent_xts_dec }
- } }
-};
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&serpent_enc, req);
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_enc_16way);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_enc_8way_avx);
+ ECB_BLOCK(1, __serpent_encrypt);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&serpent_dec, req);
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_dec_16way);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_dec_8way_avx);
+ ECB_BLOCK(1, __serpent_decrypt);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req);
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(__serpent_encrypt);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&serpent_ctr, req);
-}
-
-static int xts_encrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&serpent_enc_xts, req,
- __serpent_encrypt, &ctx->tweak_ctx,
- &ctx->crypt_ctx, false);
-}
-
-static int xts_decrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&serpent_dec_xts, req,
- __serpent_encrypt, &ctx->tweak_ctx,
- &ctx->crypt_ctx, true);
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_cbc_dec_16way);
+ CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_cbc_dec_8way_avx);
+ CBC_DEC_BLOCK(1, __serpent_decrypt);
+ CBC_WALK_END();
}
static struct skcipher_alg serpent_algs[] = {
@@ -205,41 +91,12 @@ static struct skcipher_alg serpent_algs[] = {
.setkey = serpent_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "__ctr(serpent)",
- .base.cra_driver_name = "__ctr-serpent-avx2",
- .base.cra_priority = 600,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct serpent_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = SERPENT_MIN_KEY_SIZE,
- .max_keysize = SERPENT_MAX_KEY_SIZE,
- .ivsize = SERPENT_BLOCK_SIZE,
- .chunksize = SERPENT_BLOCK_SIZE,
- .setkey = serpent_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
- }, {
- .base.cra_name = "__xts(serpent)",
- .base.cra_driver_name = "__xts-serpent-avx2",
- .base.cra_priority = 600,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = SERPENT_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct serpent_xts_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * SERPENT_MIN_KEY_SIZE,
- .max_keysize = 2 * SERPENT_MAX_KEY_SIZE,
- .ivsize = SERPENT_BLOCK_SIZE,
- .setkey = xts_serpent_setkey,
- .encrypt = xts_encrypt,
- .decrypt = xts_decrypt,
},
};
static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)];
-static int __init init(void)
+static int __init serpent_avx2_init(void)
{
const char *feature_name;
@@ -258,14 +115,14 @@ static int __init init(void)
serpent_simd_algs);
}
-static void __exit fini(void)
+static void __exit serpent_avx2_fini(void)
{
simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs),
serpent_simd_algs);
}
-module_init(init);
-module_exit(fini);
+module_init(serpent_avx2_init);
+module_exit(serpent_avx2_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized");
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 7806d1cbe854..6c248e1ea4ef 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -15,9 +15,9 @@
#include <crypto/algapi.h>
#include <crypto/internal/simd.h>
#include <crypto/serpent.h>
-#include <crypto/xts.h>
-#include <asm/crypto/glue_helper.h>
-#include <asm/crypto/serpent-avx.h>
+
+#include "serpent-avx.h"
+#include "ecb_cbc_helpers.h"
/* 8-way parallel cipher functions */
asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst,
@@ -32,191 +32,41 @@ asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst,
const u8 *src);
EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx);
-asmlinkage void serpent_ctr_8way_avx(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx);
-
-asmlinkage void serpent_xts_enc_8way_avx(const void *ctx, u8 *dst,
- const u8 *src, le128 *iv);
-EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx);
-
-asmlinkage void serpent_xts_dec_8way_avx(const void *ctx, u8 *dst,
- const u8 *src, le128 *iv);
-EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx);
-
-void __serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
-{
- be128 ctrblk;
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
-
- le128_to_be128(&ctrblk, iv);
- le128_inc(iv);
-
- __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
- u128_xor(dst, src, (u128 *)&ctrblk);
-}
-EXPORT_SYMBOL_GPL(__serpent_crypt_ctr);
-
-void serpent_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_encrypt);
-}
-EXPORT_SYMBOL_GPL(serpent_xts_enc);
-
-void serpent_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_decrypt);
-}
-EXPORT_SYMBOL_GPL(serpent_xts_dec);
-
static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
const u8 *key, unsigned int keylen)
{
return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen);
}
-int xts_serpent_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = xts_verify_key(tfm, key, keylen);
- if (err)
- return err;
-
- /* first half of xts-key is for crypt */
- err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
- if (err)
- return err;
-
- /* second half of xts-key is for tweak */
- return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
-}
-EXPORT_SYMBOL_GPL(xts_serpent_setkey);
-
-static const struct common_glue_ctx serpent_enc = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ecb = serpent_ecb_enc_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = __serpent_encrypt }
- } }
-};
-
-static const struct common_glue_ctx serpent_ctr = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ctr = serpent_ctr_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = __serpent_crypt_ctr }
- } }
-};
-
-static const struct common_glue_ctx serpent_enc_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .xts = serpent_xts_enc_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = serpent_xts_enc }
- } }
-};
-
-static const struct common_glue_ctx serpent_dec = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ecb = serpent_ecb_dec_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = __serpent_decrypt }
- } }
-};
-
-static const struct common_glue_ctx serpent_dec_cbc = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .cbc = serpent_cbc_dec_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = __serpent_decrypt }
- } }
-};
-
-static const struct common_glue_ctx serpent_dec_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .xts = serpent_xts_dec_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = serpent_xts_dec }
- } }
-};
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&serpent_enc, req);
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_enc_8way_avx);
+ ECB_BLOCK(1, __serpent_encrypt);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&serpent_dec, req);
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_dec_8way_avx);
+ ECB_BLOCK(1, __serpent_decrypt);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req);
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(__serpent_encrypt);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&serpent_ctr, req);
-}
-
-static int xts_encrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&serpent_enc_xts, req,
- __serpent_encrypt, &ctx->tweak_ctx,
- &ctx->crypt_ctx, false);
-}
-
-static int xts_decrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&serpent_dec_xts, req,
- __serpent_encrypt, &ctx->tweak_ctx,
- &ctx->crypt_ctx, true);
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_cbc_dec_8way_avx);
+ CBC_DEC_BLOCK(1, __serpent_decrypt);
+ CBC_WALK_END();
}
static struct skcipher_alg serpent_algs[] = {
@@ -247,35 +97,6 @@ static struct skcipher_alg serpent_algs[] = {
.setkey = serpent_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "__ctr(serpent)",
- .base.cra_driver_name = "__ctr-serpent-avx",
- .base.cra_priority = 500,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct serpent_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = SERPENT_MIN_KEY_SIZE,
- .max_keysize = SERPENT_MAX_KEY_SIZE,
- .ivsize = SERPENT_BLOCK_SIZE,
- .chunksize = SERPENT_BLOCK_SIZE,
- .setkey = serpent_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
- }, {
- .base.cra_name = "__xts(serpent)",
- .base.cra_driver_name = "__xts-serpent-avx",
- .base.cra_priority = 500,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = SERPENT_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct serpent_xts_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * SERPENT_MIN_KEY_SIZE,
- .max_keysize = 2 * SERPENT_MAX_KEY_SIZE,
- .ivsize = SERPENT_BLOCK_SIZE,
- .setkey = xts_serpent_setkey,
- .encrypt = xts_encrypt,
- .decrypt = xts_decrypt,
},
};
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 4fed8d26b91a..d78f37e9b2cf 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -10,8 +10,6 @@
*
* CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
* Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- * CTR part based on code (crypto/ctr.c) by:
- * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
*/
#include <linux/module.h>
@@ -22,8 +20,9 @@
#include <crypto/b128ops.h>
#include <crypto/internal/simd.h>
#include <crypto/serpent.h>
-#include <asm/crypto/serpent-sse2.h>
-#include <asm/crypto/glue_helper.h>
+
+#include "serpent-sse2.h"
+#include "ecb_cbc_helpers.h"
static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
const u8 *key, unsigned int keylen)
@@ -31,130 +30,46 @@ static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen);
}
-static void serpent_decrypt_cbc_xway(const void *ctx, u8 *d, const u8 *s)
-{
- u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
- unsigned int j;
-
- for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
- ivs[j] = src[j];
-
- serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
- for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
- u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
-}
-
-static void serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
-{
- be128 ctrblk;
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
-
- le128_to_be128(&ctrblk, iv);
- le128_inc(iv);
-
- __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
- u128_xor(dst, src, (u128 *)&ctrblk);
-}
-
-static void serpent_crypt_ctr_xway(const void *ctx, u8 *d, const u8 *s,
- le128 *iv)
+static void serpent_decrypt_cbc_xway(const void *ctx, u8 *dst, const u8 *src)
{
- be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
- unsigned int i;
-
- for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
- if (dst != src)
- dst[i] = src[i];
-
- le128_to_be128(&ctrblks[i], iv);
- le128_inc(iv);
- }
+ u8 buf[SERPENT_PARALLEL_BLOCKS - 1][SERPENT_BLOCK_SIZE];
+ const u8 *s = src;
- serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
+ if (dst == src)
+ s = memcpy(buf, src, sizeof(buf));
+ serpent_dec_blk_xway(ctx, dst, src);
+ crypto_xor(dst + SERPENT_BLOCK_SIZE, s, sizeof(buf));
}
-static const struct common_glue_ctx serpent_enc = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ecb = serpent_enc_blk_xway }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = __serpent_encrypt }
- } }
-};
-
-static const struct common_glue_ctx serpent_ctr = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ctr = serpent_crypt_ctr_xway }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = serpent_crypt_ctr }
- } }
-};
-
-static const struct common_glue_ctx serpent_dec = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ecb = serpent_dec_blk_xway }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = __serpent_decrypt }
- } }
-};
-
-static const struct common_glue_ctx serpent_dec_cbc = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .cbc = serpent_decrypt_cbc_xway }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = __serpent_decrypt }
- } }
-};
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&serpent_enc, req);
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_enc_blk_xway);
+ ECB_BLOCK(1, __serpent_encrypt);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&serpent_dec, req);
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_dec_blk_xway);
+ ECB_BLOCK(1, __serpent_decrypt);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(__serpent_encrypt,
- req);
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(__serpent_encrypt);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&serpent_ctr, req);
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_decrypt_cbc_xway);
+ CBC_DEC_BLOCK(1, __serpent_decrypt);
+ CBC_WALK_END();
}
static struct skcipher_alg serpent_algs[] = {
@@ -185,21 +100,6 @@ static struct skcipher_alg serpent_algs[] = {
.setkey = serpent_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "__ctr(serpent)",
- .base.cra_driver_name = "__ctr-serpent-sse2",
- .base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct serpent_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = SERPENT_MIN_KEY_SIZE,
- .max_keysize = SERPENT_MAX_KEY_SIZE,
- .ivsize = SERPENT_BLOCK_SIZE,
- .chunksize = SERPENT_BLOCK_SIZE,
- .setkey = serpent_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
},
};
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 1e594d60afa5..a96b2fd26dab 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -645,9 +645,9 @@ _loop3:
RESERVE_STACK = (W_SIZE*4 + 8+24)
/* Align stack */
- mov %rsp, %rbx
+ push %rbp
+ mov %rsp, %rbp
and $~(0x20-1), %rsp
- push %rbx
sub $RESERVE_STACK, %rsp
avx2_zeroupper
@@ -665,8 +665,8 @@ _loop3:
avx2_zeroupper
- add $RESERVE_STACK, %rsp
- pop %rsp
+ mov %rbp, %rsp
+ pop %rbp
pop %r15
pop %r14
@@ -674,7 +674,7 @@ _loop3:
pop %r12
pop %rbx
- ret
+ RET
SYM_FUNC_END(\name)
.endm
diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
index 11efe3a45a1f..2f94ec0e763b 100644
--- a/arch/x86/crypto/sha1_ni_asm.S
+++ b/arch/x86/crypto/sha1_ni_asm.S
@@ -59,8 +59,6 @@
#define DATA_PTR %rsi /* 2nd arg */
#define NUM_BLKS %rdx /* 3rd arg */
-#define RSPSAVE %rax
-
/* gcc conversion */
#define FRAME_SIZE 32 /* space for 2x16 bytes */
@@ -96,7 +94,8 @@
.text
.align 32
SYM_FUNC_START(sha1_ni_transform)
- mov %rsp, RSPSAVE
+ push %rbp
+ mov %rsp, %rbp
sub $FRAME_SIZE, %rsp
and $~0xF, %rsp
@@ -288,9 +287,10 @@ SYM_FUNC_START(sha1_ni_transform)
pextrd $3, E0, 1*16(DIGEST_PTR)
.Ldone_hash:
- mov RSPSAVE, %rsp
+ mov %rbp, %rsp
+ pop %rbp
- ret
+ RET
SYM_FUNC_END(sha1_ni_transform)
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index 12e2d19d7402..263f916362e0 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -99,7 +99,7 @@
pop %rbp
pop %r12
pop %rbx
- ret
+ RET
SYM_FUNC_END(\name)
.endm
@@ -467,8 +467,6 @@ W_PRECALC_SSSE3
*/
SHA1_VECTOR_ASM sha1_transform_ssse3
-#ifdef CONFIG_AS_AVX
-
.macro W_PRECALC_AVX
.purgem W_PRECALC_00_15
@@ -553,5 +551,3 @@ W_PRECALC_AVX
* const u8 *data, int blocks);
*/
SHA1_VECTOR_ASM sha1_transform_avx
-
-#endif
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index d70b40ad594c..44340a1139e0 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -21,9 +21,8 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
-#include <linux/cryptohash.h>
#include <linux/types.h>
-#include <crypto/sha.h>
+#include <crypto/sha1.h>
#include <crypto/sha1_base.h>
#include <asm/simd.h>
@@ -114,7 +113,6 @@ static void unregister_sha1_ssse3(void)
crypto_unregister_shash(&sha1_ssse3_alg);
}
-#ifdef CONFIG_AS_AVX
asmlinkage void sha1_transform_avx(struct sha1_state *state,
const u8 *data, int blocks);
@@ -175,13 +173,6 @@ static void unregister_sha1_avx(void)
crypto_unregister_shash(&sha1_avx_alg);
}
-#else /* CONFIG_AS_AVX */
-static inline int register_sha1_avx(void) { return 0; }
-static inline void unregister_sha1_avx(void) { }
-#endif /* CONFIG_AS_AVX */
-
-
-#if defined(CONFIG_AS_AVX2) && (CONFIG_AS_AVX)
#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
asmlinkage void sha1_transform_avx2(struct sha1_state *state,
@@ -253,11 +244,6 @@ static void unregister_sha1_avx2(void)
crypto_unregister_shash(&sha1_avx2_alg);
}
-#else
-static inline int register_sha1_avx2(void) { return 0; }
-static inline void unregister_sha1_avx2(void) { }
-#endif
-
#ifdef CONFIG_AS_SHA1_NI
asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data,
int rounds);
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index fcbc30f58c38..3baa1ec39097 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -47,7 +47,6 @@
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################
-#ifdef CONFIG_AS_AVX
#include <linux/linkage.h>
## assume buffers not aligned
@@ -459,7 +458,7 @@ done_hash:
popq %r13
popq %r12
popq %rbx
- ret
+ RET
SYM_FUNC_END(sha256_transform_avx)
.section .rodata.cst256.K256, "aM", @progbits, 256
@@ -498,5 +497,3 @@ _SHUF_00BA:
# shuffle xDxC -> DC00
_SHUF_DC00:
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
-
-#endif
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 499d9ec129de..9bcdbc47b8b4 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -48,7 +48,6 @@
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
-#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>
## assume buffers not aligned
@@ -118,15 +117,13 @@ _XMM_SAVE_SIZE = 0
_INP_END_SIZE = 8
_INP_SIZE = 8
_CTX_SIZE = 8
-_RSP_SIZE = 8
_XFER = 0
_XMM_SAVE = _XFER + _XFER_SIZE
_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
_INP = _INP_END + _INP_END_SIZE
_CTX = _INP + _INP_SIZE
-_RSP = _CTX + _CTX_SIZE
-STACK_SIZE = _RSP + _RSP_SIZE
+STACK_SIZE = _CTX + _CTX_SIZE
# rotate_Xs
# Rotate values of symbols X0...X3
@@ -534,11 +531,11 @@ SYM_FUNC_START(sha256_transform_rorx)
pushq %r14
pushq %r15
- mov %rsp, %rax
+ push %rbp
+ mov %rsp, %rbp
+
subq $STACK_SIZE, %rsp
and $-32, %rsp # align rsp to 32 byte boundary
- mov %rax, _RSP(%rsp)
-
shl $6, NUM_BLKS # convert to bytes
jz done_hash
@@ -705,14 +702,15 @@ only_one_block:
done_hash:
- mov _RSP(%rsp), %rsp
+ mov %rbp, %rsp
+ pop %rbp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
- ret
+ RET
SYM_FUNC_END(sha256_transform_rorx)
.section .rodata.cst512.K256, "aM", @progbits, 512
@@ -767,5 +765,3 @@ _SHUF_00BA:
.align 32
_SHUF_DC00:
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
-
-#endif
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index ddfa863b4ee3..c4a5db612c32 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -472,7 +472,7 @@ done_hash:
popq %r12
popq %rbx
- ret
+ RET
SYM_FUNC_END(sha256_transform_ssse3)
.section .rodata.cst256.K256, "aM", @progbits, 256
diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index 7abade04a3a3..94d50dd27cb5 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -326,7 +326,7 @@ SYM_FUNC_START(sha256_ni_transform)
.Ldone_hash:
- ret
+ RET
SYM_FUNC_END(sha256_ni_transform)
.section .rodata.cst256.K256, "aM", @progbits, 256
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 03ad657c04bd..3a5f6be7dbba 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -34,9 +34,8 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
-#include <linux/cryptohash.h>
#include <linux/types.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <crypto/sha256_base.h>
#include <linux/string.h>
#include <asm/simd.h>
@@ -144,7 +143,6 @@ static void unregister_sha256_ssse3(void)
ARRAY_SIZE(sha256_ssse3_algs));
}
-#ifdef CONFIG_AS_AVX
asmlinkage void sha256_transform_avx(struct sha256_state *state,
const u8 *data, int blocks);
@@ -221,12 +219,6 @@ static void unregister_sha256_avx(void)
ARRAY_SIZE(sha256_avx_algs));
}
-#else
-static inline int register_sha256_avx(void) { return 0; }
-static inline void unregister_sha256_avx(void) { }
-#endif
-
-#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX)
asmlinkage void sha256_transform_rorx(struct sha256_state *state,
const u8 *data, int blocks);
@@ -301,11 +293,6 @@ static void unregister_sha256_avx2(void)
ARRAY_SIZE(sha256_avx2_algs));
}
-#else
-static inline int register_sha256_avx2(void) { return 0; }
-static inline void unregister_sha256_avx2(void) { }
-#endif
-
#ifdef CONFIG_AS_SHA256_NI
asmlinkage void sha256_ni_transform(struct sha256_state *digest,
const u8 *data, int rounds);
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index 90ea945ba5e6..1fefe6dd3a9e 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -47,7 +47,6 @@
#
########################################################################
-#ifdef CONFIG_AS_AVX
#include <linux/linkage.h>
.text
@@ -77,14 +76,10 @@ tmp0 = %rax
W_SIZE = 80*8
# W[t] + K[t] | W[t+1] + K[t+1]
WK_SIZE = 2*8
-RSPSAVE_SIZE = 1*8
-GPRSAVE_SIZE = 5*8
frame_W = 0
frame_WK = frame_W + W_SIZE
-frame_RSPSAVE = frame_WK + WK_SIZE
-frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
-frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+frame_size = frame_WK + WK_SIZE
# Useful QWORD "arrays" for simpler memory references
# MSG, DIGEST, K_t, W_t are arrays
@@ -279,21 +274,21 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
# "blocks" is the message length in SHA512 blocks
########################################################################
SYM_FUNC_START(sha512_transform_avx)
- cmp $0, msglen
+ test msglen, msglen
je nowork
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
# Allocate Stack Space
- mov %rsp, %rax
+ push %rbp
+ mov %rsp, %rbp
sub $frame_size, %rsp
and $~(0x20 - 1), %rsp
- mov %rax, frame_RSPSAVE(%rsp)
-
- # Save GPRs
- mov %rbx, frame_GPRSAVE(%rsp)
- mov %r12, frame_GPRSAVE +8*1(%rsp)
- mov %r13, frame_GPRSAVE +8*2(%rsp)
- mov %r14, frame_GPRSAVE +8*3(%rsp)
- mov %r15, frame_GPRSAVE +8*4(%rsp)
updateblock:
@@ -354,18 +349,19 @@ updateblock:
dec msglen
jnz updateblock
- # Restore GPRs
- mov frame_GPRSAVE(%rsp), %rbx
- mov frame_GPRSAVE +8*1(%rsp), %r12
- mov frame_GPRSAVE +8*2(%rsp), %r13
- mov frame_GPRSAVE +8*3(%rsp), %r14
- mov frame_GPRSAVE +8*4(%rsp), %r15
-
# Restore Stack Pointer
- mov frame_RSPSAVE(%rsp), %rsp
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
nowork:
- ret
+ RET
SYM_FUNC_END(sha512_transform_avx)
########################################################################
@@ -424,4 +420,3 @@ K512:
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
-#endif
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 3dd886b14e7d..5cdaab7d6901 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -49,7 +49,6 @@
# This code schedules 1 blocks at a time, with 4 lanes per block
########################################################################
-#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>
.text
@@ -103,17 +102,13 @@ SRND_SIZE = 1*8
INP_SIZE = 1*8
INPEND_SIZE = 1*8
CTX_SIZE = 1*8
-RSPSAVE_SIZE = 1*8
-GPRSAVE_SIZE = 5*8
frame_XFER = 0
frame_SRND = frame_XFER + XFER_SIZE
frame_INP = frame_SRND + SRND_SIZE
frame_INPEND = frame_INP + INP_SIZE
frame_CTX = frame_INPEND + INPEND_SIZE
-frame_RSPSAVE = frame_CTX + CTX_SIZE
-frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
-frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+frame_size = frame_CTX + CTX_SIZE
## assume buffers not aligned
#define VMOVDQ vmovdqu
@@ -571,18 +566,18 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
# "blocks" is the message length in SHA512 blocks
########################################################################
SYM_FUNC_START(sha512_transform_rorx)
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
# Allocate Stack Space
- mov %rsp, %rax
+ push %rbp
+ mov %rsp, %rbp
sub $frame_size, %rsp
and $~(0x20 - 1), %rsp
- mov %rax, frame_RSPSAVE(%rsp)
-
- # Save GPRs
- mov %rbx, 8*0+frame_GPRSAVE(%rsp)
- mov %r12, 8*1+frame_GPRSAVE(%rsp)
- mov %r13, 8*2+frame_GPRSAVE(%rsp)
- mov %r14, 8*3+frame_GPRSAVE(%rsp)
- mov %r15, 8*4+frame_GPRSAVE(%rsp)
shl $7, NUM_BLKS # convert to bytes
jz done_hash
@@ -673,16 +668,18 @@ loop2:
done_hash:
-# Restore GPRs
- mov 8*0+frame_GPRSAVE(%rsp), %rbx
- mov 8*1+frame_GPRSAVE(%rsp), %r12
- mov 8*2+frame_GPRSAVE(%rsp), %r13
- mov 8*3+frame_GPRSAVE(%rsp), %r14
- mov 8*4+frame_GPRSAVE(%rsp), %r15
-
# Restore Stack Pointer
- mov frame_RSPSAVE(%rsp), %rsp
- ret
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+
+ RET
SYM_FUNC_END(sha512_transform_rorx)
########################################################################
@@ -749,5 +746,3 @@ PSHUFFLE_BYTE_FLIP_MASK:
MASK_YMM_LO:
.octa 0x00000000000000000000000000000000
.octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
-
-#endif
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index 7946a1bee85b..b84c22e06c5f 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -74,14 +74,10 @@ tmp0 = %rax
W_SIZE = 80*8
WK_SIZE = 2*8
-RSPSAVE_SIZE = 1*8
-GPRSAVE_SIZE = 5*8
frame_W = 0
frame_WK = frame_W + W_SIZE
-frame_RSPSAVE = frame_WK + WK_SIZE
-frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
-frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+frame_size = frame_WK + WK_SIZE
# Useful QWORD "arrays" for simpler memory references
# MSG, DIGEST, K_t, W_t are arrays
@@ -280,21 +276,21 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
########################################################################
SYM_FUNC_START(sha512_transform_ssse3)
- cmp $0, msglen
+ test msglen, msglen
je nowork
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
# Allocate Stack Space
- mov %rsp, %rax
+ push %rbp
+ mov %rsp, %rbp
sub $frame_size, %rsp
and $~(0x20 - 1), %rsp
- mov %rax, frame_RSPSAVE(%rsp)
-
- # Save GPRs
- mov %rbx, frame_GPRSAVE(%rsp)
- mov %r12, frame_GPRSAVE +8*1(%rsp)
- mov %r13, frame_GPRSAVE +8*2(%rsp)
- mov %r14, frame_GPRSAVE +8*3(%rsp)
- mov %r15, frame_GPRSAVE +8*4(%rsp)
updateblock:
@@ -355,18 +351,19 @@ updateblock:
dec msglen
jnz updateblock
- # Restore GPRs
- mov frame_GPRSAVE(%rsp), %rbx
- mov frame_GPRSAVE +8*1(%rsp), %r12
- mov frame_GPRSAVE +8*2(%rsp), %r13
- mov frame_GPRSAVE +8*3(%rsp), %r14
- mov frame_GPRSAVE +8*4(%rsp), %r15
-
# Restore Stack Pointer
- mov frame_RSPSAVE(%rsp), %rsp
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
nowork:
- ret
+ RET
SYM_FUNC_END(sha512_transform_ssse3)
########################################################################
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 1c444f41037c..6d3b85e53d0e 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -32,11 +32,11 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
-#include <linux/cryptohash.h>
#include <linux/string.h>
#include <linux/types.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <crypto/sha512_base.h>
+#include <asm/cpu_device_id.h>
#include <asm/simd.h>
asmlinkage void sha512_transform_ssse3(struct sha512_state *state,
@@ -142,7 +142,6 @@ static void unregister_sha512_ssse3(void)
ARRAY_SIZE(sha512_ssse3_algs));
}
-#ifdef CONFIG_AS_AVX
asmlinkage void sha512_transform_avx(struct sha512_state *state,
const u8 *data, int blocks);
static bool avx_usable(void)
@@ -218,12 +217,7 @@ static void unregister_sha512_avx(void)
crypto_unregister_shashes(sha512_avx_algs,
ARRAY_SIZE(sha512_avx_algs));
}
-#else
-static inline int register_sha512_avx(void) { return 0; }
-static inline void unregister_sha512_avx(void) { }
-#endif
-#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX)
asmlinkage void sha512_transform_rorx(struct sha512_state *state,
const u8 *data, int blocks);
@@ -291,6 +285,13 @@ static int register_sha512_avx2(void)
ARRAY_SIZE(sha512_avx2_algs));
return 0;
}
+static const struct x86_cpu_id module_cpu_ids[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
+ X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
+ X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids);
static void unregister_sha512_avx2(void)
{
@@ -298,13 +299,11 @@ static void unregister_sha512_avx2(void)
crypto_unregister_shashes(sha512_avx2_algs,
ARRAY_SIZE(sha512_avx2_algs));
}
-#else
-static inline int register_sha512_avx2(void) { return 0; }
-static inline void unregister_sha512_avx2(void) { }
-#endif
static int __init sha512_ssse3_mod_init(void)
{
+ if (!x86_match_cpu(module_cpu_ids))
+ return -ENODEV;
if (register_sha512_ssse3())
goto fail;
diff --git a/arch/x86/crypto/sm3-avx-asm_64.S b/arch/x86/crypto/sm3-avx-asm_64.S
new file mode 100644
index 000000000000..b12b9efb5ec5
--- /dev/null
+++ b/arch/x86/crypto/sm3-avx-asm_64.S
@@ -0,0 +1,517 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM3 AVX accelerated transform.
+ * specified in: https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02
+ *
+ * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+/* Based on SM3 AES/BMI2 accelerated work by libgcrypt at:
+ * https://gnupg.org/software/libgcrypt/index.html
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+#define state_h5 20
+#define state_h6 24
+#define state_h7 28
+
+/* Constants */
+
+/* Round constant macros */
+
+#define K0 2043430169 /* 0x79cc4519 */
+#define K1 -208106958 /* 0xf3988a32 */
+#define K2 -416213915 /* 0xe7311465 */
+#define K3 -832427829 /* 0xce6228cb */
+#define K4 -1664855657 /* 0x9cc45197 */
+#define K5 965255983 /* 0x3988a32f */
+#define K6 1930511966 /* 0x7311465e */
+#define K7 -433943364 /* 0xe6228cbc */
+#define K8 -867886727 /* 0xcc451979 */
+#define K9 -1735773453 /* 0x988a32f3 */
+#define K10 823420391 /* 0x311465e7 */
+#define K11 1646840782 /* 0x6228cbce */
+#define K12 -1001285732 /* 0xc451979c */
+#define K13 -2002571463 /* 0x88a32f39 */
+#define K14 289824371 /* 0x11465e73 */
+#define K15 579648742 /* 0x228cbce6 */
+#define K16 -1651869049 /* 0x9d8a7a87 */
+#define K17 991229199 /* 0x3b14f50f */
+#define K18 1982458398 /* 0x7629ea1e */
+#define K19 -330050500 /* 0xec53d43c */
+#define K20 -660100999 /* 0xd8a7a879 */
+#define K21 -1320201997 /* 0xb14f50f3 */
+#define K22 1654563303 /* 0x629ea1e7 */
+#define K23 -985840690 /* 0xc53d43ce */
+#define K24 -1971681379 /* 0x8a7a879d */
+#define K25 351604539 /* 0x14f50f3b */
+#define K26 703209078 /* 0x29ea1e76 */
+#define K27 1406418156 /* 0x53d43cec */
+#define K28 -1482130984 /* 0xa7a879d8 */
+#define K29 1330705329 /* 0x4f50f3b1 */
+#define K30 -1633556638 /* 0x9ea1e762 */
+#define K31 1027854021 /* 0x3d43cec5 */
+#define K32 2055708042 /* 0x7a879d8a */
+#define K33 -183551212 /* 0xf50f3b14 */
+#define K34 -367102423 /* 0xea1e7629 */
+#define K35 -734204845 /* 0xd43cec53 */
+#define K36 -1468409689 /* 0xa879d8a7 */
+#define K37 1358147919 /* 0x50f3b14f */
+#define K38 -1578671458 /* 0xa1e7629e */
+#define K39 1137624381 /* 0x43cec53d */
+#define K40 -2019718534 /* 0x879d8a7a */
+#define K41 255530229 /* 0x0f3b14f5 */
+#define K42 511060458 /* 0x1e7629ea */
+#define K43 1022120916 /* 0x3cec53d4 */
+#define K44 2044241832 /* 0x79d8a7a8 */
+#define K45 -206483632 /* 0xf3b14f50 */
+#define K46 -412967263 /* 0xe7629ea1 */
+#define K47 -825934525 /* 0xcec53d43 */
+#define K48 -1651869049 /* 0x9d8a7a87 */
+#define K49 991229199 /* 0x3b14f50f */
+#define K50 1982458398 /* 0x7629ea1e */
+#define K51 -330050500 /* 0xec53d43c */
+#define K52 -660100999 /* 0xd8a7a879 */
+#define K53 -1320201997 /* 0xb14f50f3 */
+#define K54 1654563303 /* 0x629ea1e7 */
+#define K55 -985840690 /* 0xc53d43ce */
+#define K56 -1971681379 /* 0x8a7a879d */
+#define K57 351604539 /* 0x14f50f3b */
+#define K58 703209078 /* 0x29ea1e76 */
+#define K59 1406418156 /* 0x53d43cec */
+#define K60 -1482130984 /* 0xa7a879d8 */
+#define K61 1330705329 /* 0x4f50f3b1 */
+#define K62 -1633556638 /* 0x9ea1e762 */
+#define K63 1027854021 /* 0x3d43cec5 */
+
+/* Register macros */
+
+#define RSTATE %rdi
+#define RDATA %rsi
+#define RNBLKS %rdx
+
+#define t0 %eax
+#define t1 %ebx
+#define t2 %ecx
+
+#define a %r8d
+#define b %r9d
+#define c %r10d
+#define d %r11d
+#define e %r12d
+#define f %r13d
+#define g %r14d
+#define h %r15d
+
+#define W0 %xmm0
+#define W1 %xmm1
+#define W2 %xmm2
+#define W3 %xmm3
+#define W4 %xmm4
+#define W5 %xmm5
+
+#define XTMP0 %xmm6
+#define XTMP1 %xmm7
+#define XTMP2 %xmm8
+#define XTMP3 %xmm9
+#define XTMP4 %xmm10
+#define XTMP5 %xmm11
+#define XTMP6 %xmm12
+
+#define BSWAP_REG %xmm15
+
+/* Stack structure */
+
+#define STACK_W_SIZE (32 * 2 * 3)
+#define STACK_REG_SAVE_SIZE (64)
+
+#define STACK_W (0)
+#define STACK_REG_SAVE (STACK_W + STACK_W_SIZE)
+#define STACK_SIZE (STACK_REG_SAVE + STACK_REG_SAVE_SIZE)
+
+/* Instruction helpers. */
+
+#define roll2(v, reg) \
+ roll $(v), reg;
+
+#define roll3mov(v, src, dst) \
+ movl src, dst; \
+ roll $(v), dst;
+
+#define roll3(v, src, dst) \
+ rorxl $(32-(v)), src, dst;
+
+#define addl2(a, out) \
+ leal (a, out), out;
+
+/* Round function macros. */
+
+#define GG1(x, y, z, o, t) \
+ movl x, o; \
+ xorl y, o; \
+ xorl z, o;
+
+#define FF1(x, y, z, o, t) GG1(x, y, z, o, t)
+
+#define GG2(x, y, z, o, t) \
+ andnl z, x, o; \
+ movl y, t; \
+ andl x, t; \
+ addl2(t, o);
+
+#define FF2(x, y, z, o, t) \
+ movl y, o; \
+ xorl x, o; \
+ movl y, t; \
+ andl x, t; \
+ andl z, o; \
+ xorl t, o;
+
+#define R(i, a, b, c, d, e, f, g, h, round, widx, wtype) \
+ /* rol(a, 12) => t0 */ \
+ roll3mov(12, a, t0); /* rorxl here would reduce perf by 6% on zen3 */ \
+ /* rol (t0 + e + t), 7) => t1 */ \
+ leal K##round(t0, e, 1), t1; \
+ roll2(7, t1); \
+ /* h + w1 => h */ \
+ addl wtype##_W1_ADDR(round, widx), h; \
+ /* h + t1 => h */ \
+ addl2(t1, h); \
+ /* t1 ^ t0 => t0 */ \
+ xorl t1, t0; \
+ /* w1w2 + d => d */ \
+ addl wtype##_W1W2_ADDR(round, widx), d; \
+ /* FF##i(a,b,c) => t1 */ \
+ FF##i(a, b, c, t1, t2); \
+ /* d + t1 => d */ \
+ addl2(t1, d); \
+ /* GG#i(e,f,g) => t2 */ \
+ GG##i(e, f, g, t2, t1); \
+ /* h + t2 => h */ \
+ addl2(t2, h); \
+ /* rol (f, 19) => f */ \
+ roll2(19, f); \
+ /* d + t0 => d */ \
+ addl2(t0, d); \
+ /* rol (b, 9) => b */ \
+ roll2(9, b); \
+ /* P0(h) => h */ \
+ roll3(9, h, t2); \
+ roll3(17, h, t1); \
+ xorl t2, h; \
+ xorl t1, h;
+
+#define R1(a, b, c, d, e, f, g, h, round, widx, wtype) \
+ R(1, a, b, c, d, e, f, g, h, round, widx, wtype)
+
+#define R2(a, b, c, d, e, f, g, h, round, widx, wtype) \
+ R(2, a, b, c, d, e, f, g, h, round, widx, wtype)
+
+/* Input expansion macros. */
+
+/* Byte-swapped input address. */
+#define IW_W_ADDR(round, widx, offs) \
+ (STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))(%rsp)
+
+/* Expanded input address. */
+#define XW_W_ADDR(round, widx, offs) \
+ (STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))(%rsp)
+
+/* Rounds 1-12, byte-swapped input block addresses. */
+#define IW_W1_ADDR(round, widx) IW_W_ADDR(round, widx, 0)
+#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 32)
+
+/* Rounds 1-12, expanded input block addresses. */
+#define XW_W1_ADDR(round, widx) XW_W_ADDR(round, widx, 0)
+#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 32)
+
+/* Input block loading. */
+#define LOAD_W_XMM_1() \
+ vmovdqu 0*16(RDATA), XTMP0; /* XTMP0: w3, w2, w1, w0 */ \
+ vmovdqu 1*16(RDATA), XTMP1; /* XTMP1: w7, w6, w5, w4 */ \
+ vmovdqu 2*16(RDATA), XTMP2; /* XTMP2: w11, w10, w9, w8 */ \
+ vmovdqu 3*16(RDATA), XTMP3; /* XTMP3: w15, w14, w13, w12 */ \
+ vpshufb BSWAP_REG, XTMP0, XTMP0; \
+ vpshufb BSWAP_REG, XTMP1, XTMP1; \
+ vpshufb BSWAP_REG, XTMP2, XTMP2; \
+ vpshufb BSWAP_REG, XTMP3, XTMP3; \
+ vpxor XTMP0, XTMP1, XTMP4; \
+ vpxor XTMP1, XTMP2, XTMP5; \
+ vpxor XTMP2, XTMP3, XTMP6; \
+ leaq 64(RDATA), RDATA; \
+ vmovdqa XTMP0, IW_W1_ADDR(0, 0); \
+ vmovdqa XTMP4, IW_W1W2_ADDR(0, 0); \
+ vmovdqa XTMP1, IW_W1_ADDR(4, 0); \
+ vmovdqa XTMP5, IW_W1W2_ADDR(4, 0);
+
+#define LOAD_W_XMM_2() \
+ vmovdqa XTMP2, IW_W1_ADDR(8, 0); \
+ vmovdqa XTMP6, IW_W1W2_ADDR(8, 0);
+
+#define LOAD_W_XMM_3() \
+ vpshufd $0b00000000, XTMP0, W0; /* W0: xx, w0, xx, xx */ \
+ vpshufd $0b11111001, XTMP0, W1; /* W1: xx, w3, w2, w1 */ \
+ vmovdqa XTMP1, W2; /* W2: xx, w6, w5, w4 */ \
+ vpalignr $12, XTMP1, XTMP2, W3; /* W3: xx, w9, w8, w7 */ \
+ vpalignr $8, XTMP2, XTMP3, W4; /* W4: xx, w12, w11, w10 */ \
+ vpshufd $0b11111001, XTMP3, W5; /* W5: xx, w15, w14, w13 */
+
+/* Message scheduling. Note: 3 words per XMM register. */
+#define SCHED_W_0(round, w0, w1, w2, w3, w4, w5) \
+ /* Load (w[i - 16]) => XTMP0 */ \
+ vpshufd $0b10111111, w0, XTMP0; \
+ vpalignr $12, XTMP0, w1, XTMP0; /* XTMP0: xx, w2, w1, w0 */ \
+ /* Load (w[i - 13]) => XTMP1 */ \
+ vpshufd $0b10111111, w1, XTMP1; \
+ vpalignr $12, XTMP1, w2, XTMP1; \
+ /* w[i - 9] == w3 */ \
+ /* XMM3 ^ XTMP0 => XTMP0 */ \
+ vpxor w3, XTMP0, XTMP0;
+
+#define SCHED_W_1(round, w0, w1, w2, w3, w4, w5) \
+ /* w[i - 3] == w5 */ \
+ /* rol(XMM5, 15) ^ XTMP0 => XTMP0 */ \
+ vpslld $15, w5, XTMP2; \
+ vpsrld $(32-15), w5, XTMP3; \
+ vpxor XTMP2, XTMP3, XTMP3; \
+ vpxor XTMP3, XTMP0, XTMP0; \
+ /* rol(XTMP1, 7) => XTMP1 */ \
+ vpslld $7, XTMP1, XTMP5; \
+ vpsrld $(32-7), XTMP1, XTMP1; \
+ vpxor XTMP5, XTMP1, XTMP1; \
+ /* XMM4 ^ XTMP1 => XTMP1 */ \
+ vpxor w4, XTMP1, XTMP1; \
+ /* w[i - 6] == XMM4 */ \
+ /* P1(XTMP0) ^ XTMP1 => XMM0 */ \
+ vpslld $15, XTMP0, XTMP5; \
+ vpsrld $(32-15), XTMP0, XTMP6; \
+ vpslld $23, XTMP0, XTMP2; \
+ vpsrld $(32-23), XTMP0, XTMP3; \
+ vpxor XTMP0, XTMP1, XTMP1; \
+ vpxor XTMP6, XTMP5, XTMP5; \
+ vpxor XTMP3, XTMP2, XTMP2; \
+ vpxor XTMP2, XTMP5, XTMP5; \
+ vpxor XTMP5, XTMP1, w0;
+
+#define SCHED_W_2(round, w0, w1, w2, w3, w4, w5) \
+ /* W1 in XMM12 */ \
+ vpshufd $0b10111111, w4, XTMP4; \
+ vpalignr $12, XTMP4, w5, XTMP4; \
+ vmovdqa XTMP4, XW_W1_ADDR((round), 0); \
+ /* W1 ^ W2 => XTMP1 */ \
+ vpxor w0, XTMP4, XTMP1; \
+ vmovdqa XTMP1, XW_W1W2_ADDR((round), 0);
+
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+.Lbe32mask:
+ .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
+
+.text
+
+/*
+ * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA.
+ *
+ * void sm3_transform_avx(struct sm3_state *state,
+ * const u8 *data, int nblocks);
+ */
+.align 16
+SYM_FUNC_START(sm3_transform_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: data (64*nblks bytes)
+ * %rdx: nblocks
+ */
+ vzeroupper;
+
+ pushq %rbp;
+ movq %rsp, %rbp;
+
+ movq %rdx, RNBLKS;
+
+ subq $STACK_SIZE, %rsp;
+ andq $(~63), %rsp;
+
+ movq %rbx, (STACK_REG_SAVE + 0 * 8)(%rsp);
+ movq %r15, (STACK_REG_SAVE + 1 * 8)(%rsp);
+ movq %r14, (STACK_REG_SAVE + 2 * 8)(%rsp);
+ movq %r13, (STACK_REG_SAVE + 3 * 8)(%rsp);
+ movq %r12, (STACK_REG_SAVE + 4 * 8)(%rsp);
+
+ vmovdqa .Lbe32mask (%rip), BSWAP_REG;
+
+ /* Get the values of the chaining variables. */
+ movl state_h0(RSTATE), a;
+ movl state_h1(RSTATE), b;
+ movl state_h2(RSTATE), c;
+ movl state_h3(RSTATE), d;
+ movl state_h4(RSTATE), e;
+ movl state_h5(RSTATE), f;
+ movl state_h6(RSTATE), g;
+ movl state_h7(RSTATE), h;
+
+.align 16
+.Loop:
+ /* Load data part1. */
+ LOAD_W_XMM_1();
+
+ leaq -1(RNBLKS), RNBLKS;
+
+ /* Transform 0-3 + Load data part2. */
+ R1(a, b, c, d, e, f, g, h, 0, 0, IW); LOAD_W_XMM_2();
+ R1(d, a, b, c, h, e, f, g, 1, 1, IW);
+ R1(c, d, a, b, g, h, e, f, 2, 2, IW);
+ R1(b, c, d, a, f, g, h, e, 3, 3, IW); LOAD_W_XMM_3();
+
+ /* Transform 4-7 + Precalc 12-14. */
+ R1(a, b, c, d, e, f, g, h, 4, 0, IW);
+ R1(d, a, b, c, h, e, f, g, 5, 1, IW);
+ R1(c, d, a, b, g, h, e, f, 6, 2, IW); SCHED_W_0(12, W0, W1, W2, W3, W4, W5);
+ R1(b, c, d, a, f, g, h, e, 7, 3, IW); SCHED_W_1(12, W0, W1, W2, W3, W4, W5);
+
+ /* Transform 8-11 + Precalc 12-17. */
+ R1(a, b, c, d, e, f, g, h, 8, 0, IW); SCHED_W_2(12, W0, W1, W2, W3, W4, W5);
+ R1(d, a, b, c, h, e, f, g, 9, 1, IW); SCHED_W_0(15, W1, W2, W3, W4, W5, W0);
+ R1(c, d, a, b, g, h, e, f, 10, 2, IW); SCHED_W_1(15, W1, W2, W3, W4, W5, W0);
+ R1(b, c, d, a, f, g, h, e, 11, 3, IW); SCHED_W_2(15, W1, W2, W3, W4, W5, W0);
+
+ /* Transform 12-14 + Precalc 18-20 */
+ R1(a, b, c, d, e, f, g, h, 12, 0, XW); SCHED_W_0(18, W2, W3, W4, W5, W0, W1);
+ R1(d, a, b, c, h, e, f, g, 13, 1, XW); SCHED_W_1(18, W2, W3, W4, W5, W0, W1);
+ R1(c, d, a, b, g, h, e, f, 14, 2, XW); SCHED_W_2(18, W2, W3, W4, W5, W0, W1);
+
+ /* Transform 15-17 + Precalc 21-23 */
+ R1(b, c, d, a, f, g, h, e, 15, 0, XW); SCHED_W_0(21, W3, W4, W5, W0, W1, W2);
+ R2(a, b, c, d, e, f, g, h, 16, 1, XW); SCHED_W_1(21, W3, W4, W5, W0, W1, W2);
+ R2(d, a, b, c, h, e, f, g, 17, 2, XW); SCHED_W_2(21, W3, W4, W5, W0, W1, W2);
+
+ /* Transform 18-20 + Precalc 24-26 */
+ R2(c, d, a, b, g, h, e, f, 18, 0, XW); SCHED_W_0(24, W4, W5, W0, W1, W2, W3);
+ R2(b, c, d, a, f, g, h, e, 19, 1, XW); SCHED_W_1(24, W4, W5, W0, W1, W2, W3);
+ R2(a, b, c, d, e, f, g, h, 20, 2, XW); SCHED_W_2(24, W4, W5, W0, W1, W2, W3);
+
+ /* Transform 21-23 + Precalc 27-29 */
+ R2(d, a, b, c, h, e, f, g, 21, 0, XW); SCHED_W_0(27, W5, W0, W1, W2, W3, W4);
+ R2(c, d, a, b, g, h, e, f, 22, 1, XW); SCHED_W_1(27, W5, W0, W1, W2, W3, W4);
+ R2(b, c, d, a, f, g, h, e, 23, 2, XW); SCHED_W_2(27, W5, W0, W1, W2, W3, W4);
+
+ /* Transform 24-26 + Precalc 30-32 */
+ R2(a, b, c, d, e, f, g, h, 24, 0, XW); SCHED_W_0(30, W0, W1, W2, W3, W4, W5);
+ R2(d, a, b, c, h, e, f, g, 25, 1, XW); SCHED_W_1(30, W0, W1, W2, W3, W4, W5);
+ R2(c, d, a, b, g, h, e, f, 26, 2, XW); SCHED_W_2(30, W0, W1, W2, W3, W4, W5);
+
+ /* Transform 27-29 + Precalc 33-35 */
+ R2(b, c, d, a, f, g, h, e, 27, 0, XW); SCHED_W_0(33, W1, W2, W3, W4, W5, W0);
+ R2(a, b, c, d, e, f, g, h, 28, 1, XW); SCHED_W_1(33, W1, W2, W3, W4, W5, W0);
+ R2(d, a, b, c, h, e, f, g, 29, 2, XW); SCHED_W_2(33, W1, W2, W3, W4, W5, W0);
+
+ /* Transform 30-32 + Precalc 36-38 */
+ R2(c, d, a, b, g, h, e, f, 30, 0, XW); SCHED_W_0(36, W2, W3, W4, W5, W0, W1);
+ R2(b, c, d, a, f, g, h, e, 31, 1, XW); SCHED_W_1(36, W2, W3, W4, W5, W0, W1);
+ R2(a, b, c, d, e, f, g, h, 32, 2, XW); SCHED_W_2(36, W2, W3, W4, W5, W0, W1);
+
+ /* Transform 33-35 + Precalc 39-41 */
+ R2(d, a, b, c, h, e, f, g, 33, 0, XW); SCHED_W_0(39, W3, W4, W5, W0, W1, W2);
+ R2(c, d, a, b, g, h, e, f, 34, 1, XW); SCHED_W_1(39, W3, W4, W5, W0, W1, W2);
+ R2(b, c, d, a, f, g, h, e, 35, 2, XW); SCHED_W_2(39, W3, W4, W5, W0, W1, W2);
+
+ /* Transform 36-38 + Precalc 42-44 */
+ R2(a, b, c, d, e, f, g, h, 36, 0, XW); SCHED_W_0(42, W4, W5, W0, W1, W2, W3);
+ R2(d, a, b, c, h, e, f, g, 37, 1, XW); SCHED_W_1(42, W4, W5, W0, W1, W2, W3);
+ R2(c, d, a, b, g, h, e, f, 38, 2, XW); SCHED_W_2(42, W4, W5, W0, W1, W2, W3);
+
+ /* Transform 39-41 + Precalc 45-47 */
+ R2(b, c, d, a, f, g, h, e, 39, 0, XW); SCHED_W_0(45, W5, W0, W1, W2, W3, W4);
+ R2(a, b, c, d, e, f, g, h, 40, 1, XW); SCHED_W_1(45, W5, W0, W1, W2, W3, W4);
+ R2(d, a, b, c, h, e, f, g, 41, 2, XW); SCHED_W_2(45, W5, W0, W1, W2, W3, W4);
+
+ /* Transform 42-44 + Precalc 48-50 */
+ R2(c, d, a, b, g, h, e, f, 42, 0, XW); SCHED_W_0(48, W0, W1, W2, W3, W4, W5);
+ R2(b, c, d, a, f, g, h, e, 43, 1, XW); SCHED_W_1(48, W0, W1, W2, W3, W4, W5);
+ R2(a, b, c, d, e, f, g, h, 44, 2, XW); SCHED_W_2(48, W0, W1, W2, W3, W4, W5);
+
+ /* Transform 45-47 + Precalc 51-53 */
+ R2(d, a, b, c, h, e, f, g, 45, 0, XW); SCHED_W_0(51, W1, W2, W3, W4, W5, W0);
+ R2(c, d, a, b, g, h, e, f, 46, 1, XW); SCHED_W_1(51, W1, W2, W3, W4, W5, W0);
+ R2(b, c, d, a, f, g, h, e, 47, 2, XW); SCHED_W_2(51, W1, W2, W3, W4, W5, W0);
+
+ /* Transform 48-50 + Precalc 54-56 */
+ R2(a, b, c, d, e, f, g, h, 48, 0, XW); SCHED_W_0(54, W2, W3, W4, W5, W0, W1);
+ R2(d, a, b, c, h, e, f, g, 49, 1, XW); SCHED_W_1(54, W2, W3, W4, W5, W0, W1);
+ R2(c, d, a, b, g, h, e, f, 50, 2, XW); SCHED_W_2(54, W2, W3, W4, W5, W0, W1);
+
+ /* Transform 51-53 + Precalc 57-59 */
+ R2(b, c, d, a, f, g, h, e, 51, 0, XW); SCHED_W_0(57, W3, W4, W5, W0, W1, W2);
+ R2(a, b, c, d, e, f, g, h, 52, 1, XW); SCHED_W_1(57, W3, W4, W5, W0, W1, W2);
+ R2(d, a, b, c, h, e, f, g, 53, 2, XW); SCHED_W_2(57, W3, W4, W5, W0, W1, W2);
+
+ /* Transform 54-56 + Precalc 60-62 */
+ R2(c, d, a, b, g, h, e, f, 54, 0, XW); SCHED_W_0(60, W4, W5, W0, W1, W2, W3);
+ R2(b, c, d, a, f, g, h, e, 55, 1, XW); SCHED_W_1(60, W4, W5, W0, W1, W2, W3);
+ R2(a, b, c, d, e, f, g, h, 56, 2, XW); SCHED_W_2(60, W4, W5, W0, W1, W2, W3);
+
+ /* Transform 57-59 + Precalc 63 */
+ R2(d, a, b, c, h, e, f, g, 57, 0, XW); SCHED_W_0(63, W5, W0, W1, W2, W3, W4);
+ R2(c, d, a, b, g, h, e, f, 58, 1, XW);
+ R2(b, c, d, a, f, g, h, e, 59, 2, XW); SCHED_W_1(63, W5, W0, W1, W2, W3, W4);
+
+ /* Transform 60-62 + Precalc 63 */
+ R2(a, b, c, d, e, f, g, h, 60, 0, XW);
+ R2(d, a, b, c, h, e, f, g, 61, 1, XW); SCHED_W_2(63, W5, W0, W1, W2, W3, W4);
+ R2(c, d, a, b, g, h, e, f, 62, 2, XW);
+
+ /* Transform 63 */
+ R2(b, c, d, a, f, g, h, e, 63, 0, XW);
+
+ /* Update the chaining variables. */
+ xorl state_h0(RSTATE), a;
+ xorl state_h1(RSTATE), b;
+ xorl state_h2(RSTATE), c;
+ xorl state_h3(RSTATE), d;
+ movl a, state_h0(RSTATE);
+ movl b, state_h1(RSTATE);
+ movl c, state_h2(RSTATE);
+ movl d, state_h3(RSTATE);
+ xorl state_h4(RSTATE), e;
+ xorl state_h5(RSTATE), f;
+ xorl state_h6(RSTATE), g;
+ xorl state_h7(RSTATE), h;
+ movl e, state_h4(RSTATE);
+ movl f, state_h5(RSTATE);
+ movl g, state_h6(RSTATE);
+ movl h, state_h7(RSTATE);
+
+ cmpq $0, RNBLKS;
+ jne .Loop;
+
+ vzeroall;
+
+ movq (STACK_REG_SAVE + 0 * 8)(%rsp), %rbx;
+ movq (STACK_REG_SAVE + 1 * 8)(%rsp), %r15;
+ movq (STACK_REG_SAVE + 2 * 8)(%rsp), %r14;
+ movq (STACK_REG_SAVE + 3 * 8)(%rsp), %r13;
+ movq (STACK_REG_SAVE + 4 * 8)(%rsp), %r12;
+
+ vmovdqa %xmm0, IW_W1_ADDR(0, 0);
+ vmovdqa %xmm0, IW_W1W2_ADDR(0, 0);
+ vmovdqa %xmm0, IW_W1_ADDR(4, 0);
+ vmovdqa %xmm0, IW_W1W2_ADDR(4, 0);
+ vmovdqa %xmm0, IW_W1_ADDR(8, 0);
+ vmovdqa %xmm0, IW_W1W2_ADDR(8, 0);
+
+ movq %rbp, %rsp;
+ popq %rbp;
+ RET;
+SYM_FUNC_END(sm3_transform_avx)
diff --git a/arch/x86/crypto/sm3_avx_glue.c b/arch/x86/crypto/sm3_avx_glue.c
new file mode 100644
index 000000000000..661b6f22ffcd
--- /dev/null
+++ b/arch/x86/crypto/sm3_avx_glue.c
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM3 Secure Hash Algorithm, AVX assembler accelerated.
+ * specified in: https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02
+ *
+ * Copyright (C) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <crypto/sm3.h>
+#include <crypto/sm3_base.h>
+#include <asm/simd.h>
+
+asmlinkage void sm3_transform_avx(struct sm3_state *state,
+ const u8 *data, int nblocks);
+
+static int sm3_avx_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct sm3_state *sctx = shash_desc_ctx(desc);
+
+ if (!crypto_simd_usable() ||
+ (sctx->count % SM3_BLOCK_SIZE) + len < SM3_BLOCK_SIZE) {
+ sm3_update(sctx, data, len);
+ return 0;
+ }
+
+ /*
+ * Make sure struct sm3_state begins directly with the SM3
+ * 256-bit internal state, as this is what the asm functions expect.
+ */
+ BUILD_BUG_ON(offsetof(struct sm3_state, state) != 0);
+
+ kernel_fpu_begin();
+ sm3_base_do_update(desc, data, len, sm3_transform_avx);
+ kernel_fpu_end();
+
+ return 0;
+}
+
+static int sm3_avx_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ if (!crypto_simd_usable()) {
+ struct sm3_state *sctx = shash_desc_ctx(desc);
+
+ if (len)
+ sm3_update(sctx, data, len);
+
+ sm3_final(sctx, out);
+ return 0;
+ }
+
+ kernel_fpu_begin();
+ if (len)
+ sm3_base_do_update(desc, data, len, sm3_transform_avx);
+ sm3_base_do_finalize(desc, sm3_transform_avx);
+ kernel_fpu_end();
+
+ return sm3_base_finish(desc, out);
+}
+
+static int sm3_avx_final(struct shash_desc *desc, u8 *out)
+{
+ if (!crypto_simd_usable()) {
+ sm3_final(shash_desc_ctx(desc), out);
+ return 0;
+ }
+
+ kernel_fpu_begin();
+ sm3_base_do_finalize(desc, sm3_transform_avx);
+ kernel_fpu_end();
+
+ return sm3_base_finish(desc, out);
+}
+
+static struct shash_alg sm3_avx_alg = {
+ .digestsize = SM3_DIGEST_SIZE,
+ .init = sm3_base_init,
+ .update = sm3_avx_update,
+ .final = sm3_avx_final,
+ .finup = sm3_avx_finup,
+ .descsize = sizeof(struct sm3_state),
+ .base = {
+ .cra_name = "sm3",
+ .cra_driver_name = "sm3-avx",
+ .cra_priority = 300,
+ .cra_blocksize = SM3_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int __init sm3_avx_mod_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX)) {
+ pr_info("AVX instruction are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_BMI2)) {
+ pr_info("BMI2 instruction are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return crypto_register_shash(&sm3_avx_alg);
+}
+
+static void __exit sm3_avx_mod_exit(void)
+{
+ crypto_unregister_shash(&sm3_avx_alg);
+}
+
+module_init(sm3_avx_mod_init);
+module_exit(sm3_avx_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_DESCRIPTION("SM3 Secure Hash Algorithm, AVX assembler accelerated");
+MODULE_ALIAS_CRYPTO("sm3");
+MODULE_ALIAS_CRYPTO("sm3-avx");
diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
new file mode 100644
index 000000000000..4767ab61ff48
--- /dev/null
+++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
@@ -0,0 +1,594 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, AES-NI/AVX optimized.
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
+ * https://github.com/mjosaarinen/sm4ni
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define rRIP (%rip)
+
+#define RX0 %xmm0
+#define RX1 %xmm1
+#define MASK_4BIT %xmm2
+#define RTMP0 %xmm3
+#define RTMP1 %xmm4
+#define RTMP2 %xmm5
+#define RTMP3 %xmm6
+#define RTMP4 %xmm7
+
+#define RA0 %xmm8
+#define RA1 %xmm9
+#define RA2 %xmm10
+#define RA3 %xmm11
+
+#define RB0 %xmm12
+#define RB1 %xmm13
+#define RB2 %xmm14
+#define RB3 %xmm15
+
+#define RNOT %xmm0
+#define RBSWAP %xmm1
+
+
+/* Transpose four 32-bit words between 128-bit vectors. */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+/* pre-SubByte transform. */
+#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
+ * 'vaeslastenc' instruction.
+ */
+#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpandn mask4bit, x, tmp0; \
+ vpsrld $4, x, x; \
+ vpand x, mask4bit, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+/*
+ * Following four affine transform look-up tables are from work by
+ * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
+ *
+ * These allow exposing SM4 S-Box from AES SubByte.
+ */
+
+/* pre-SubByte affine transform, from SM4 field to AES field. */
+.Lpre_tf_lo_s:
+ .quad 0x9197E2E474720701, 0xC7C1B4B222245157
+.Lpre_tf_hi_s:
+ .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012
+
+/* post-SubByte affine transform, from AES field to SM4 field. */
+.Lpost_tf_lo_s:
+ .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
+.Lpost_tf_hi_s:
+ .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_8:
+ .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
+ .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06
+
+/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_16:
+ .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
+ .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09
+
+/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_24:
+ .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
+ .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For input word byte-swap */
+.Lbswap32_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+/* 12 bytes, only for padding */
+.Lpadding_deadbeef:
+ .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
+
+
+.text
+.align 16
+
+/*
+ * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
+ * const u8 *src, int nblocks)
+ */
+.align 8
+SYM_FUNC_START(sm4_aesni_avx_crypt4)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (1..4 blocks)
+ * %rdx: src (1..4 blocks)
+ * %rcx: num blocks (1..4)
+ */
+ FRAME_BEGIN
+
+ vmovdqu 0*16(%rdx), RA0;
+ vmovdqa RA0, RA1;
+ vmovdqa RA0, RA2;
+ vmovdqa RA0, RA3;
+ cmpq $2, %rcx;
+ jb .Lblk4_load_input_done;
+ vmovdqu 1*16(%rdx), RA1;
+ je .Lblk4_load_input_done;
+ vmovdqu 2*16(%rdx), RA2;
+ cmpq $3, %rcx;
+ je .Lblk4_load_input_done;
+ vmovdqu 3*16(%rdx), RA3;
+
+.Lblk4_load_input_done:
+
+ vmovdqa .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+
+ vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
+ vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
+ vmovdqa .Lpre_tf_hi_s rRIP, RB0;
+ vmovdqa .Lpost_tf_lo_s rRIP, RB1;
+ vmovdqa .Lpost_tf_hi_s rRIP, RB2;
+ vmovdqa .Linv_shift_row rRIP, RB3;
+ vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2;
+ vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3;
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+
+#define ROUND(round, s0, s1, s2, s3) \
+ vbroadcastss (4*(round))(%rdi), RX0; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0); \
+ vaesenclast MASK_4BIT, RX0, RX0; \
+ transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0); \
+ \
+ /* linear part */ \
+ vpshufb RB3, RX0, RTMP0; \
+ vpxor RTMP0, s0, s0; /* s0 ^ x */ \
+ vpshufb RTMP2, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
+ vpshufb RTMP3, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP0, RTMP1; \
+ vpsrld $30, RTMP0, RTMP0; \
+ vpxor RTMP0, s0, s0; \
+ /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpxor RTMP1, s0, s0;
+
+ leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk4:
+ ROUND(0, RA0, RA1, RA2, RA3);
+ ROUND(1, RA1, RA2, RA3, RA0);
+ ROUND(2, RA2, RA3, RA0, RA1);
+ ROUND(3, RA3, RA0, RA1, RA2);
+ leaq (4*4)(%rdi), %rdi;
+ cmpq %rax, %rdi;
+ jne .Lroundloop_blk4;
+
+#undef ROUND
+
+ vmovdqa .Lbswap128_mask rRIP, RTMP2;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+
+ vmovdqu RA0, 0*16(%rsi);
+ cmpq $2, %rcx;
+ jb .Lblk4_store_output_done;
+ vmovdqu RA1, 1*16(%rsi);
+ je .Lblk4_store_output_done;
+ vmovdqu RA2, 2*16(%rsi);
+ cmpq $3, %rcx;
+ je .Lblk4_store_output_done;
+ vmovdqu RA3, 3*16(%rsi);
+
+.Lblk4_store_output_done:
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx_crypt4)
+
+.align 8
+SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
+ /* input:
+ * %rdi: round key array, CTX
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+ * plaintext blocks
+ * output:
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+ * ciphertext blocks
+ */
+ FRAME_BEGIN
+
+ vmovdqa .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+
+#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
+ vbroadcastss (4*(round))(%rdi), RX0; \
+ vmovdqa .Lpre_tf_lo_s rRIP, RTMP4; \
+ vmovdqa .Lpre_tf_hi_s rRIP, RTMP1; \
+ vmovdqa RX0, RX1; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ vmovdqa .Lpost_tf_lo_s rRIP, RTMP2; \
+ vmovdqa .Lpost_tf_hi_s rRIP, RTMP3; \
+ vpxor r1, RX1, RX1; \
+ vpxor r2, RX1, RX1; \
+ vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
+ transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
+ vmovdqa .Linv_shift_row rRIP, RTMP4; \
+ vaesenclast MASK_4BIT, RX0, RX0; \
+ vaesenclast MASK_4BIT, RX1, RX1; \
+ transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
+ transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
+ \
+ /* linear part */ \
+ vpshufb RTMP4, RX0, RTMP0; \
+ vpxor RTMP0, s0, s0; /* s0 ^ x */ \
+ vpshufb RTMP4, RX1, RTMP2; \
+ vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4; \
+ vpxor RTMP2, r0, r0; /* r0 ^ x */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4; \
+ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4; \
+ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+ /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpslld $2, RTMP0, RTMP1; \
+ vpsrld $30, RTMP0, RTMP0; \
+ vpxor RTMP0, s0, s0; \
+ vpxor RTMP1, s0, s0; \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \
+ /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpslld $2, RTMP2, RTMP3; \
+ vpsrld $30, RTMP2, RTMP2; \
+ vpxor RTMP2, r0, r0; \
+ vpxor RTMP3, r0, r0;
+
+ leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk8:
+ ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
+ ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
+ ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
+ ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
+ leaq (4*4)(%rdi), %rdi;
+ cmpq %rax, %rdi;
+ jne .Lroundloop_blk8;
+
+#undef ROUND
+
+ vmovdqa .Lbswap128_mask rRIP, RTMP2;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__sm4_crypt_blk8)
+
+/*
+ * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
+ * const u8 *src, int nblocks)
+ */
+.align 8
+SYM_FUNC_START(sm4_aesni_avx_crypt8)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (1..8 blocks)
+ * %rdx: src (1..8 blocks)
+ * %rcx: num blocks (1..8)
+ */
+ cmpq $5, %rcx;
+ jb sm4_aesni_avx_crypt4;
+
+ FRAME_BEGIN
+
+ vmovdqu (0 * 16)(%rdx), RA0;
+ vmovdqu (1 * 16)(%rdx), RA1;
+ vmovdqu (2 * 16)(%rdx), RA2;
+ vmovdqu (3 * 16)(%rdx), RA3;
+ vmovdqu (4 * 16)(%rdx), RB0;
+ vmovdqa RB0, RB1;
+ vmovdqa RB0, RB2;
+ vmovdqa RB0, RB3;
+ je .Lblk8_load_input_done;
+ vmovdqu (5 * 16)(%rdx), RB1;
+ cmpq $7, %rcx;
+ jb .Lblk8_load_input_done;
+ vmovdqu (6 * 16)(%rdx), RB2;
+ je .Lblk8_load_input_done;
+ vmovdqu (7 * 16)(%rdx), RB3;
+
+.Lblk8_load_input_done:
+ call __sm4_crypt_blk8;
+
+ cmpq $6, %rcx;
+ vmovdqu RA0, (0 * 16)(%rsi);
+ vmovdqu RA1, (1 * 16)(%rsi);
+ vmovdqu RA2, (2 * 16)(%rsi);
+ vmovdqu RA3, (3 * 16)(%rsi);
+ vmovdqu RB0, (4 * 16)(%rsi);
+ jb .Lblk8_store_output_done;
+ vmovdqu RB1, (5 * 16)(%rsi);
+ je .Lblk8_store_output_done;
+ vmovdqu RB2, (6 * 16)(%rsi);
+ cmpq $7, %rcx;
+ je .Lblk8_store_output_done;
+ vmovdqu RB3, (7 * 16)(%rsi);
+
+.Lblk8_store_output_done:
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx_crypt8)
+
+/*
+ * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
+ * const u8 *src, u8 *iv)
+ */
+.align 8
+SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), RA0;
+
+ vmovdqa .Lbswap128_mask rRIP, RBSWAP;
+ vpshufb RBSWAP, RA0, RTMP0; /* be => le */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+ vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+ /* construct IVs */
+ inc_le128(RTMP0, RNOT, RTMP2); /* +1 */
+ vpshufb RBSWAP, RTMP0, RA1;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +2 */
+ vpshufb RBSWAP, RTMP0, RA2;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +3 */
+ vpshufb RBSWAP, RTMP0, RA3;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +4 */
+ vpshufb RBSWAP, RTMP0, RB0;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +5 */
+ vpshufb RBSWAP, RTMP0, RB1;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +6 */
+ vpshufb RBSWAP, RTMP0, RB2;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +7 */
+ vpshufb RBSWAP, RTMP0, RB3;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +8 */
+ vpshufb RBSWAP, RTMP0, RTMP1;
+
+ /* store new IV */
+ vmovdqu RTMP1, (%rcx);
+
+ call __sm4_crypt_blk8;
+
+ vpxor (0 * 16)(%rdx), RA0, RA0;
+ vpxor (1 * 16)(%rdx), RA1, RA1;
+ vpxor (2 * 16)(%rdx), RA2, RA2;
+ vpxor (3 * 16)(%rdx), RA3, RA3;
+ vpxor (4 * 16)(%rdx), RB0, RB0;
+ vpxor (5 * 16)(%rdx), RB1, RB1;
+ vpxor (6 * 16)(%rdx), RB2, RB2;
+ vpxor (7 * 16)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 16)(%rsi);
+ vmovdqu RA1, (1 * 16)(%rsi);
+ vmovdqu RA2, (2 * 16)(%rsi);
+ vmovdqu RA3, (3 * 16)(%rsi);
+ vmovdqu RB0, (4 * 16)(%rsi);
+ vmovdqu RB1, (5 * 16)(%rsi);
+ vmovdqu RB2, (6 * 16)(%rsi);
+ vmovdqu RB3, (7 * 16)(%rsi);
+
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)
+
+/*
+ * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
+ * const u8 *src, u8 *iv)
+ */
+.align 8
+SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: iv
+ */
+ FRAME_BEGIN
+
+ vmovdqu (0 * 16)(%rdx), RA0;
+ vmovdqu (1 * 16)(%rdx), RA1;
+ vmovdqu (2 * 16)(%rdx), RA2;
+ vmovdqu (3 * 16)(%rdx), RA3;
+ vmovdqu (4 * 16)(%rdx), RB0;
+ vmovdqu (5 * 16)(%rdx), RB1;
+ vmovdqu (6 * 16)(%rdx), RB2;
+ vmovdqu (7 * 16)(%rdx), RB3;
+
+ call __sm4_crypt_blk8;
+
+ vmovdqu (7 * 16)(%rdx), RNOT;
+ vpxor (%rcx), RA0, RA0;
+ vpxor (0 * 16)(%rdx), RA1, RA1;
+ vpxor (1 * 16)(%rdx), RA2, RA2;
+ vpxor (2 * 16)(%rdx), RA3, RA3;
+ vpxor (3 * 16)(%rdx), RB0, RB0;
+ vpxor (4 * 16)(%rdx), RB1, RB1;
+ vpxor (5 * 16)(%rdx), RB2, RB2;
+ vpxor (6 * 16)(%rdx), RB3, RB3;
+ vmovdqu RNOT, (%rcx); /* store new IV */
+
+ vmovdqu RA0, (0 * 16)(%rsi);
+ vmovdqu RA1, (1 * 16)(%rsi);
+ vmovdqu RA2, (2 * 16)(%rsi);
+ vmovdqu RA3, (3 * 16)(%rsi);
+ vmovdqu RB0, (4 * 16)(%rsi);
+ vmovdqu RB1, (5 * 16)(%rsi);
+ vmovdqu RB2, (6 * 16)(%rsi);
+ vmovdqu RB3, (7 * 16)(%rsi);
+
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)
+
+/*
+ * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
+ * const u8 *src, u8 *iv)
+ */
+.align 8
+SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: iv
+ */
+ FRAME_BEGIN
+
+ /* Load input */
+ vmovdqu (%rcx), RA0;
+ vmovdqu 0 * 16(%rdx), RA1;
+ vmovdqu 1 * 16(%rdx), RA2;
+ vmovdqu 2 * 16(%rdx), RA3;
+ vmovdqu 3 * 16(%rdx), RB0;
+ vmovdqu 4 * 16(%rdx), RB1;
+ vmovdqu 5 * 16(%rdx), RB2;
+ vmovdqu 6 * 16(%rdx), RB3;
+
+ /* Update IV */
+ vmovdqu 7 * 16(%rdx), RNOT;
+ vmovdqu RNOT, (%rcx);
+
+ call __sm4_crypt_blk8;
+
+ vpxor (0 * 16)(%rdx), RA0, RA0;
+ vpxor (1 * 16)(%rdx), RA1, RA1;
+ vpxor (2 * 16)(%rdx), RA2, RA2;
+ vpxor (3 * 16)(%rdx), RA3, RA3;
+ vpxor (4 * 16)(%rdx), RB0, RB0;
+ vpxor (5 * 16)(%rdx), RB1, RB1;
+ vpxor (6 * 16)(%rdx), RB2, RB2;
+ vpxor (7 * 16)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 16)(%rsi);
+ vmovdqu RA1, (1 * 16)(%rsi);
+ vmovdqu RA2, (2 * 16)(%rsi);
+ vmovdqu RA3, (3 * 16)(%rsi);
+ vmovdqu RB0, (4 * 16)(%rsi);
+ vmovdqu RB1, (5 * 16)(%rsi);
+ vmovdqu RB2, (6 * 16)(%rsi);
+ vmovdqu RB3, (7 * 16)(%rsi);
+
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8)
diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
new file mode 100644
index 000000000000..4732fe8bb65b
--- /dev/null
+++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
@@ -0,0 +1,501 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SM4 Cipher Algorithm, AES-NI/AVX2 optimized.
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
+ * https://github.com/mjosaarinen/sm4ni
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define rRIP (%rip)
+
+/* vector registers */
+#define RX0 %ymm0
+#define RX1 %ymm1
+#define MASK_4BIT %ymm2
+#define RTMP0 %ymm3
+#define RTMP1 %ymm4
+#define RTMP2 %ymm5
+#define RTMP3 %ymm6
+#define RTMP4 %ymm7
+
+#define RA0 %ymm8
+#define RA1 %ymm9
+#define RA2 %ymm10
+#define RA3 %ymm11
+
+#define RB0 %ymm12
+#define RB1 %ymm13
+#define RB2 %ymm14
+#define RB3 %ymm15
+
+#define RNOT %ymm0
+#define RBSWAP %ymm1
+
+#define RX0x %xmm0
+#define RX1x %xmm1
+#define MASK_4BITx %xmm2
+
+#define RNOTx %xmm0
+#define RBSWAPx %xmm1
+
+#define RTMP0x %xmm3
+#define RTMP1x %xmm4
+#define RTMP2x %xmm5
+#define RTMP3x %xmm6
+#define RTMP4x %xmm7
+
+
+/* helper macros */
+
+/* Transpose four 32-bit words between 128-bit vector lanes. */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+/* post-SubByte transform. */
+#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
+ * 'vaeslastenc' instruction. */
+#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpandn mask4bit, x, tmp0; \
+ vpsrld $4, x, x; \
+ vpand x, mask4bit, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+/*
+ * Following four affine transform look-up tables are from work by
+ * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
+ *
+ * These allow exposing SM4 S-Box from AES SubByte.
+ */
+
+/* pre-SubByte affine transform, from SM4 field to AES field. */
+.Lpre_tf_lo_s:
+ .quad 0x9197E2E474720701, 0xC7C1B4B222245157
+.Lpre_tf_hi_s:
+ .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012
+
+/* post-SubByte affine transform, from AES field to SM4 field. */
+.Lpost_tf_lo_s:
+ .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
+.Lpost_tf_hi_s:
+ .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_8:
+ .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
+ .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06
+
+/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_16:
+ .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
+ .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09
+
+/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_24:
+ .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
+ .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For input word byte-swap */
+.Lbswap32_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+/* 12 bytes, only for padding */
+.Lpadding_deadbeef:
+ .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
+
+.text
+.align 16
+
+.align 8
+SYM_FUNC_START_LOCAL(__sm4_crypt_blk16)
+ /* input:
+ * %rdi: round key array, CTX
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+ * plaintext blocks
+ * output:
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+ * ciphertext blocks
+ */
+ FRAME_BEGIN
+
+ vbroadcasti128 .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ vpbroadcastd .L0f0f0f0f rRIP, MASK_4BIT;
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+
+#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
+ vpbroadcastd (4*(round))(%rdi), RX0; \
+ vbroadcasti128 .Lpre_tf_lo_s rRIP, RTMP4; \
+ vbroadcasti128 .Lpre_tf_hi_s rRIP, RTMP1; \
+ vmovdqa RX0, RX1; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ vbroadcasti128 .Lpost_tf_lo_s rRIP, RTMP2; \
+ vbroadcasti128 .Lpost_tf_hi_s rRIP, RTMP3; \
+ vpxor r1, RX1, RX1; \
+ vpxor r2, RX1, RX1; \
+ vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
+ transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
+ vextracti128 $1, RX0, RTMP4x; \
+ vextracti128 $1, RX1, RTMP0x; \
+ vaesenclast MASK_4BITx, RX0x, RX0x; \
+ vaesenclast MASK_4BITx, RTMP4x, RTMP4x; \
+ vaesenclast MASK_4BITx, RX1x, RX1x; \
+ vaesenclast MASK_4BITx, RTMP0x, RTMP0x; \
+ vinserti128 $1, RTMP4x, RX0, RX0; \
+ vbroadcasti128 .Linv_shift_row rRIP, RTMP4; \
+ vinserti128 $1, RTMP0x, RX1, RX1; \
+ transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
+ transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
+ \
+ /* linear part */ \
+ vpshufb RTMP4, RX0, RTMP0; \
+ vpxor RTMP0, s0, s0; /* s0 ^ x */ \
+ vpshufb RTMP4, RX1, RTMP2; \
+ vbroadcasti128 .Linv_shift_row_rol_8 rRIP, RTMP4; \
+ vpxor RTMP2, r0, r0; /* r0 ^ x */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vbroadcasti128 .Linv_shift_row_rol_16 rRIP, RTMP4; \
+ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vbroadcasti128 .Linv_shift_row_rol_24 rRIP, RTMP4; \
+ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP0, RTMP1; \
+ vpsrld $30, RTMP0, RTMP0; \
+ vpxor RTMP0, s0, s0; \
+ /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpxor RTMP1, s0, s0; \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP2, RTMP3; \
+ vpsrld $30, RTMP2, RTMP2; \
+ vpxor RTMP2, r0, r0; \
+ /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpxor RTMP3, r0, r0;
+
+ leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk8:
+ ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
+ ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
+ ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
+ ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
+ leaq (4*4)(%rdi), %rdi;
+ cmpq %rax, %rdi;
+ jne .Lroundloop_blk8;
+
+#undef ROUND
+
+ vbroadcasti128 .Lbswap128_mask rRIP, RTMP2;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__sm4_crypt_blk16)
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+/*
+ * void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst,
+ * const u8 *src, u8 *iv)
+ */
+.align 8
+SYM_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ movq 8(%rcx), %rax;
+ bswapq %rax;
+
+ vzeroupper;
+
+ vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
+ vpcmpeqd RNOT, RNOT, RNOT;
+ vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */
+ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), RTMP4x;
+ vpshufb RTMP3x, RTMP4x, RTMP4x;
+ vmovdqa RTMP4x, RTMP0x;
+ inc_le128(RTMP4x, RNOTx, RTMP1x);
+ vinserti128 $1, RTMP4x, RTMP0, RTMP0;
+ vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */
+
+ /* check need for handling 64-bit overflow and carry */
+ cmpq $(0xffffffffffffffff - 16), %rax;
+ ja .Lhandle_ctr_carry;
+
+ /* construct IVs */
+ vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
+ vpshufb RTMP3, RTMP0, RA1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
+ vpshufb RTMP3, RTMP0, RA2;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
+ vpshufb RTMP3, RTMP0, RA3;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
+ vpshufb RTMP3, RTMP0, RB0;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
+ vpshufb RTMP3, RTMP0, RB1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
+ vpshufb RTMP3, RTMP0, RB2;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
+ vpshufb RTMP3, RTMP0, RB3;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
+ vpshufb RTMP3x, RTMP0x, RTMP0x;
+
+ jmp .Lctr_carry_done;
+
+.Lhandle_ctr_carry:
+ /* construct IVs */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vextracti128 $1, RTMP0, RTMP0x;
+ vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */
+
+.align 4
+.Lctr_carry_done:
+ /* store new IV */
+ vmovdqu RTMP0x, (%rcx);
+
+ call __sm4_crypt_blk16;
+
+ vpxor (0 * 32)(%rdx), RA0, RA0;
+ vpxor (1 * 32)(%rdx), RA1, RA1;
+ vpxor (2 * 32)(%rdx), RA2, RA2;
+ vpxor (3 * 32)(%rdx), RA3, RA3;
+ vpxor (4 * 32)(%rdx), RB0, RB0;
+ vpxor (5 * 32)(%rdx), RB1, RB1;
+ vpxor (6 * 32)(%rdx), RB2, RB2;
+ vpxor (7 * 32)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16)
+
+/*
+ * void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst,
+ * const u8 *src, u8 *iv)
+ */
+.align 8
+SYM_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ vmovdqu (0 * 32)(%rdx), RA0;
+ vmovdqu (1 * 32)(%rdx), RA1;
+ vmovdqu (2 * 32)(%rdx), RA2;
+ vmovdqu (3 * 32)(%rdx), RA3;
+ vmovdqu (4 * 32)(%rdx), RB0;
+ vmovdqu (5 * 32)(%rdx), RB1;
+ vmovdqu (6 * 32)(%rdx), RB2;
+ vmovdqu (7 * 32)(%rdx), RB3;
+
+ call __sm4_crypt_blk16;
+
+ vmovdqu (%rcx), RNOTx;
+ vinserti128 $1, (%rdx), RNOT, RNOT;
+ vpxor RNOT, RA0, RA0;
+ vpxor (0 * 32 + 16)(%rdx), RA1, RA1;
+ vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
+ vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
+ vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
+ vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
+ vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
+ vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
+ vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+ vmovdqu RNOTx, (%rcx); /* store new IV */
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16)
+
+/*
+ * void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst,
+ * const u8 *src, u8 *iv)
+ */
+.align 8
+SYM_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ /* Load input */
+ vmovdqu (%rcx), RNOTx;
+ vinserti128 $1, (%rdx), RNOT, RA0;
+ vmovdqu (0 * 32 + 16)(%rdx), RA1;
+ vmovdqu (1 * 32 + 16)(%rdx), RA2;
+ vmovdqu (2 * 32 + 16)(%rdx), RA3;
+ vmovdqu (3 * 32 + 16)(%rdx), RB0;
+ vmovdqu (4 * 32 + 16)(%rdx), RB1;
+ vmovdqu (5 * 32 + 16)(%rdx), RB2;
+ vmovdqu (6 * 32 + 16)(%rdx), RB3;
+
+ /* Update IV */
+ vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+ vmovdqu RNOTx, (%rcx);
+
+ call __sm4_crypt_blk16;
+
+ vpxor (0 * 32)(%rdx), RA0, RA0;
+ vpxor (1 * 32)(%rdx), RA1, RA1;
+ vpxor (2 * 32)(%rdx), RA2, RA2;
+ vpxor (3 * 32)(%rdx), RA3, RA3;
+ vpxor (4 * 32)(%rdx), RB0, RB0;
+ vpxor (5 * 32)(%rdx), RB1, RB1;
+ vpxor (6 * 32)(%rdx), RB2, RB2;
+ vpxor (7 * 32)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx2_cfb_dec_blk16)
diff --git a/arch/x86/crypto/sm4-avx.h b/arch/x86/crypto/sm4-avx.h
new file mode 100644
index 000000000000..1bceab7516aa
--- /dev/null
+++ b/arch/x86/crypto/sm4-avx.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef ASM_X86_SM4_AVX_H
+#define ASM_X86_SM4_AVX_H
+
+#include <linux/types.h>
+#include <crypto/sm4.h>
+
+typedef void (*sm4_crypt_func)(const u32 *rk, u8 *dst, const u8 *src, u8 *iv);
+
+int sm4_avx_ecb_encrypt(struct skcipher_request *req);
+int sm4_avx_ecb_decrypt(struct skcipher_request *req);
+
+int sm4_cbc_encrypt(struct skcipher_request *req);
+int sm4_avx_cbc_decrypt(struct skcipher_request *req,
+ unsigned int bsize, sm4_crypt_func func);
+
+int sm4_cfb_encrypt(struct skcipher_request *req);
+int sm4_avx_cfb_decrypt(struct skcipher_request *req,
+ unsigned int bsize, sm4_crypt_func func);
+
+int sm4_avx_ctr_crypt(struct skcipher_request *req,
+ unsigned int bsize, sm4_crypt_func func);
+
+#endif
diff --git a/arch/x86/crypto/sm4_aesni_avx2_glue.c b/arch/x86/crypto/sm4_aesni_avx2_glue.c
new file mode 100644
index 000000000000..84bc718f49a3
--- /dev/null
+++ b/arch/x86/crypto/sm4_aesni_avx2_glue.c
@@ -0,0 +1,169 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, AES-NI/AVX2 optimized.
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (c) 2021, Alibaba Group.
+ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/sm4.h>
+#include "sm4-avx.h"
+
+#define SM4_CRYPT16_BLOCK_SIZE (SM4_BLOCK_SIZE * 16)
+
+asmlinkage void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst,
+ const u8 *src, u8 *iv);
+asmlinkage void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst,
+ const u8 *src, u8 *iv);
+asmlinkage void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst,
+ const u8 *src, u8 *iv);
+
+static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_expandkey(ctx, key, key_len);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return sm4_avx_cbc_decrypt(req, SM4_CRYPT16_BLOCK_SIZE,
+ sm4_aesni_avx2_cbc_dec_blk16);
+}
+
+
+static int cfb_decrypt(struct skcipher_request *req)
+{
+ return sm4_avx_cfb_decrypt(req, SM4_CRYPT16_BLOCK_SIZE,
+ sm4_aesni_avx2_cfb_dec_blk16);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return sm4_avx_ctr_crypt(req, SM4_CRYPT16_BLOCK_SIZE,
+ sm4_aesni_avx2_ctr_enc_blk16);
+}
+
+static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
+ {
+ .base = {
+ .cra_name = "__ecb(sm4)",
+ .cra_driver_name = "__ecb-sm4-aesni-avx2",
+ .cra_priority = 500,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .walksize = 16 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = sm4_avx_ecb_encrypt,
+ .decrypt = sm4_avx_ecb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "__cbc(sm4)",
+ .cra_driver_name = "__cbc-sm4-aesni-avx2",
+ .cra_priority = 500,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .walksize = 16 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = sm4_cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base = {
+ .cra_name = "__cfb(sm4)",
+ .cra_driver_name = "__cfb-sm4-aesni-avx2",
+ .cra_priority = 500,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .walksize = 16 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = sm4_cfb_encrypt,
+ .decrypt = cfb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "__ctr(sm4)",
+ .cra_driver_name = "__ctr-sm4-aesni-avx2",
+ .cra_priority = 500,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .walksize = 16 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }
+};
+
+static struct simd_skcipher_alg *
+simd_sm4_aesni_avx2_skciphers[ARRAY_SIZE(sm4_aesni_avx2_skciphers)];
+
+static int __init sm4_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX2 or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(sm4_aesni_avx2_skciphers,
+ ARRAY_SIZE(sm4_aesni_avx2_skciphers),
+ simd_sm4_aesni_avx2_skciphers);
+}
+
+static void __exit sm4_exit(void)
+{
+ simd_unregister_skciphers(sm4_aesni_avx2_skciphers,
+ ARRAY_SIZE(sm4_aesni_avx2_skciphers),
+ simd_sm4_aesni_avx2_skciphers);
+}
+
+module_init(sm4_init);
+module_exit(sm4_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX2 optimized");
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("sm4-aesni-avx2");
diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c
new file mode 100644
index 000000000000..7800f77d68ad
--- /dev/null
+++ b/arch/x86/crypto/sm4_aesni_avx_glue.c
@@ -0,0 +1,487 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, AES-NI/AVX optimized.
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (c) 2021, Alibaba Group.
+ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/sm4.h>
+#include "sm4-avx.h"
+
+#define SM4_CRYPT8_BLOCK_SIZE (SM4_BLOCK_SIZE * 8)
+
+asmlinkage void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
+ const u8 *src, int nblocks);
+asmlinkage void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
+ const u8 *src, int nblocks);
+asmlinkage void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
+ const u8 *src, u8 *iv);
+asmlinkage void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
+ const u8 *src, u8 *iv);
+asmlinkage void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
+ const u8 *src, u8 *iv);
+
+static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_expandkey(ctx, key, key_len);
+}
+
+static int ecb_do_crypt(struct skcipher_request *req, const u32 *rkey)
+{
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ kernel_fpu_begin();
+ while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) {
+ sm4_aesni_avx_crypt8(rkey, dst, src, 8);
+ dst += SM4_CRYPT8_BLOCK_SIZE;
+ src += SM4_CRYPT8_BLOCK_SIZE;
+ nbytes -= SM4_CRYPT8_BLOCK_SIZE;
+ }
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ unsigned int nblocks = min(nbytes >> 4, 4u);
+ sm4_aesni_avx_crypt4(rkey, dst, src, nblocks);
+ dst += nblocks * SM4_BLOCK_SIZE;
+ src += nblocks * SM4_BLOCK_SIZE;
+ nbytes -= nblocks * SM4_BLOCK_SIZE;
+ }
+ kernel_fpu_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+int sm4_avx_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_crypt(req, ctx->rkey_enc);
+}
+EXPORT_SYMBOL_GPL(sm4_avx_ecb_encrypt);
+
+int sm4_avx_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_crypt(req, ctx->rkey_dec);
+}
+EXPORT_SYMBOL_GPL(sm4_avx_ecb_decrypt);
+
+int sm4_cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *iv = walk.iv;
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ crypto_xor_cpy(dst, src, iv, SM4_BLOCK_SIZE);
+ sm4_crypt_block(ctx->rkey_enc, dst, dst);
+ iv = dst;
+ src += SM4_BLOCK_SIZE;
+ dst += SM4_BLOCK_SIZE;
+ nbytes -= SM4_BLOCK_SIZE;
+ }
+ if (iv != walk.iv)
+ memcpy(walk.iv, iv, SM4_BLOCK_SIZE);
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sm4_cbc_encrypt);
+
+int sm4_avx_cbc_decrypt(struct skcipher_request *req,
+ unsigned int bsize, sm4_crypt_func func)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ kernel_fpu_begin();
+
+ while (nbytes >= bsize) {
+ func(ctx->rkey_dec, dst, src, walk.iv);
+ dst += bsize;
+ src += bsize;
+ nbytes -= bsize;
+ }
+
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ u8 keystream[SM4_BLOCK_SIZE * 8];
+ u8 iv[SM4_BLOCK_SIZE];
+ unsigned int nblocks = min(nbytes >> 4, 8u);
+ int i;
+
+ sm4_aesni_avx_crypt8(ctx->rkey_dec, keystream,
+ src, nblocks);
+
+ src += ((int)nblocks - 2) * SM4_BLOCK_SIZE;
+ dst += (nblocks - 1) * SM4_BLOCK_SIZE;
+ memcpy(iv, src + SM4_BLOCK_SIZE, SM4_BLOCK_SIZE);
+
+ for (i = nblocks - 1; i > 0; i--) {
+ crypto_xor_cpy(dst, src,
+ &keystream[i * SM4_BLOCK_SIZE],
+ SM4_BLOCK_SIZE);
+ src -= SM4_BLOCK_SIZE;
+ dst -= SM4_BLOCK_SIZE;
+ }
+ crypto_xor_cpy(dst, walk.iv, keystream, SM4_BLOCK_SIZE);
+ memcpy(walk.iv, iv, SM4_BLOCK_SIZE);
+ dst += nblocks * SM4_BLOCK_SIZE;
+ src += (nblocks + 1) * SM4_BLOCK_SIZE;
+ nbytes -= nblocks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_fpu_end();
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sm4_avx_cbc_decrypt);
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return sm4_avx_cbc_decrypt(req, SM4_CRYPT8_BLOCK_SIZE,
+ sm4_aesni_avx_cbc_dec_blk8);
+}
+
+int sm4_cfb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+ const u8 *iv = walk.iv;
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ sm4_crypt_block(ctx->rkey_enc, keystream, iv);
+ crypto_xor_cpy(dst, src, keystream, SM4_BLOCK_SIZE);
+ iv = dst;
+ src += SM4_BLOCK_SIZE;
+ dst += SM4_BLOCK_SIZE;
+ nbytes -= SM4_BLOCK_SIZE;
+ }
+ if (iv != walk.iv)
+ memcpy(walk.iv, iv, SM4_BLOCK_SIZE);
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sm4_cfb_encrypt);
+
+int sm4_avx_cfb_decrypt(struct skcipher_request *req,
+ unsigned int bsize, sm4_crypt_func func)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ kernel_fpu_begin();
+
+ while (nbytes >= bsize) {
+ func(ctx->rkey_enc, dst, src, walk.iv);
+ dst += bsize;
+ src += bsize;
+ nbytes -= bsize;
+ }
+
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ u8 keystream[SM4_BLOCK_SIZE * 8];
+ unsigned int nblocks = min(nbytes >> 4, 8u);
+
+ memcpy(keystream, walk.iv, SM4_BLOCK_SIZE);
+ if (nblocks > 1)
+ memcpy(&keystream[SM4_BLOCK_SIZE], src,
+ (nblocks - 1) * SM4_BLOCK_SIZE);
+ memcpy(walk.iv, src + (nblocks - 1) * SM4_BLOCK_SIZE,
+ SM4_BLOCK_SIZE);
+
+ sm4_aesni_avx_crypt8(ctx->rkey_enc, keystream,
+ keystream, nblocks);
+
+ crypto_xor_cpy(dst, src, keystream,
+ nblocks * SM4_BLOCK_SIZE);
+ dst += nblocks * SM4_BLOCK_SIZE;
+ src += nblocks * SM4_BLOCK_SIZE;
+ nbytes -= nblocks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_fpu_end();
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sm4_avx_cfb_decrypt);
+
+static int cfb_decrypt(struct skcipher_request *req)
+{
+ return sm4_avx_cfb_decrypt(req, SM4_CRYPT8_BLOCK_SIZE,
+ sm4_aesni_avx_cfb_dec_blk8);
+}
+
+int sm4_avx_ctr_crypt(struct skcipher_request *req,
+ unsigned int bsize, sm4_crypt_func func)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ kernel_fpu_begin();
+
+ while (nbytes >= bsize) {
+ func(ctx->rkey_enc, dst, src, walk.iv);
+ dst += bsize;
+ src += bsize;
+ nbytes -= bsize;
+ }
+
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ u8 keystream[SM4_BLOCK_SIZE * 8];
+ unsigned int nblocks = min(nbytes >> 4, 8u);
+ int i;
+
+ for (i = 0; i < nblocks; i++) {
+ memcpy(&keystream[i * SM4_BLOCK_SIZE],
+ walk.iv, SM4_BLOCK_SIZE);
+ crypto_inc(walk.iv, SM4_BLOCK_SIZE);
+ }
+ sm4_aesni_avx_crypt8(ctx->rkey_enc, keystream,
+ keystream, nblocks);
+
+ crypto_xor_cpy(dst, src, keystream,
+ nblocks * SM4_BLOCK_SIZE);
+ dst += nblocks * SM4_BLOCK_SIZE;
+ src += nblocks * SM4_BLOCK_SIZE;
+ nbytes -= nblocks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_fpu_end();
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ memcpy(keystream, walk.iv, SM4_BLOCK_SIZE);
+ crypto_inc(walk.iv, SM4_BLOCK_SIZE);
+
+ sm4_crypt_block(ctx->rkey_enc, keystream, keystream);
+
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ dst += nbytes;
+ src += nbytes;
+ nbytes = 0;
+ }
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sm4_avx_ctr_crypt);
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return sm4_avx_ctr_crypt(req, SM4_CRYPT8_BLOCK_SIZE,
+ sm4_aesni_avx_ctr_enc_blk8);
+}
+
+static struct skcipher_alg sm4_aesni_avx_skciphers[] = {
+ {
+ .base = {
+ .cra_name = "__ecb(sm4)",
+ .cra_driver_name = "__ecb-sm4-aesni-avx",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .walksize = 8 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = sm4_avx_ecb_encrypt,
+ .decrypt = sm4_avx_ecb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "__cbc(sm4)",
+ .cra_driver_name = "__cbc-sm4-aesni-avx",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .walksize = 8 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = sm4_cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base = {
+ .cra_name = "__cfb(sm4)",
+ .cra_driver_name = "__cfb-sm4-aesni-avx",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .walksize = 8 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = sm4_cfb_encrypt,
+ .decrypt = cfb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "__ctr(sm4)",
+ .cra_driver_name = "__ctr-sm4-aesni-avx",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .walksize = 8 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }
+};
+
+static struct simd_skcipher_alg *
+simd_sm4_aesni_avx_skciphers[ARRAY_SIZE(sm4_aesni_avx_skciphers)];
+
+static int __init sm4_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(sm4_aesni_avx_skciphers,
+ ARRAY_SIZE(sm4_aesni_avx_skciphers),
+ simd_sm4_aesni_avx_skciphers);
+}
+
+static void __exit sm4_exit(void)
+{
+ simd_unregister_skciphers(sm4_aesni_avx_skciphers,
+ ARRAY_SIZE(sm4_aesni_avx_skciphers),
+ simd_sm4_aesni_avx_skciphers);
+}
+
+module_init(sm4_init);
+module_exit(sm4_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX optimized");
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("sm4-aesni-avx");
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index a5151393bb2f..31f9b2ec3857 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -19,11 +19,6 @@
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
-.align 16
-.Lxts_gf128mul_and_shl1_mask:
- .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
-
.text
/* structure of crypto context */
@@ -272,7 +267,7 @@ SYM_FUNC_START_LOCAL(__twofish_enc_blk8)
outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
- ret;
+ RET;
SYM_FUNC_END(__twofish_enc_blk8)
.align 8
@@ -312,7 +307,7 @@ SYM_FUNC_START_LOCAL(__twofish_dec_blk8)
outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
- ret;
+ RET;
SYM_FUNC_END(__twofish_dec_blk8)
SYM_FUNC_START(twofish_ecb_enc_8way)
@@ -332,7 +327,7 @@ SYM_FUNC_START(twofish_ecb_enc_8way)
store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(twofish_ecb_enc_8way)
SYM_FUNC_START(twofish_ecb_dec_8way)
@@ -352,7 +347,7 @@ SYM_FUNC_START(twofish_ecb_dec_8way)
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(twofish_ecb_dec_8way)
SYM_FUNC_START(twofish_cbc_dec_8way)
@@ -377,80 +372,5 @@ SYM_FUNC_START(twofish_cbc_dec_8way)
popq %r12;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(twofish_cbc_dec_8way)
-
-SYM_FUNC_START(twofish_ctr_8way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (little endian, 128bit)
- */
- FRAME_BEGIN
-
- pushq %r12;
-
- movq %rsi, %r11;
- movq %rdx, %r12;
-
- load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
- RD2, RX0, RX1, RY0);
-
- call __twofish_enc_blk8;
-
- store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
-
- popq %r12;
-
- FRAME_END
- ret;
-SYM_FUNC_END(twofish_ctr_8way)
-
-SYM_FUNC_START(twofish_xts_enc_8way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- FRAME_BEGIN
-
- movq %rsi, %r11;
-
- /* regs <= src, dst <= IVs, regs <= regs xor IVs */
- load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
- RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
-
- call __twofish_enc_blk8;
-
- /* dst <= regs xor IVs(in dst) */
- store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
-
- FRAME_END
- ret;
-SYM_FUNC_END(twofish_xts_enc_8way)
-
-SYM_FUNC_START(twofish_xts_dec_8way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- FRAME_BEGIN
-
- movq %rsi, %r11;
-
- /* regs <= src, dst <= IVs, regs <= regs xor IVs */
- load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2,
- RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
-
- call __twofish_dec_blk8;
-
- /* dst <= regs xor IVs(in dst) */
- store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
-
- FRAME_END
- ret;
-SYM_FUNC_END(twofish_xts_dec_8way)
diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S
index a6f09e4f2e46..3abcad661884 100644
--- a/arch/x86/crypto/twofish-i586-asm_32.S
+++ b/arch/x86/crypto/twofish-i586-asm_32.S
@@ -260,7 +260,7 @@ SYM_FUNC_START(twofish_enc_blk)
pop %ebx
pop %ebp
mov $1, %eax
- ret
+ RET
SYM_FUNC_END(twofish_enc_blk)
SYM_FUNC_START(twofish_dec_blk)
@@ -317,5 +317,5 @@ SYM_FUNC_START(twofish_dec_blk)
pop %ebx
pop %ebp
mov $1, %eax
- ret
+ RET
SYM_FUNC_END(twofish_dec_blk)
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
index fc23552afe37..d2288bf38a8a 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
@@ -88,7 +88,7 @@
/*
* Combined G1 & G2 function. Reordered with help of rotates to have moves
- * at begining.
+ * at beginning.
*/
#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
/* G1,1 && G2,1 */ \
@@ -258,7 +258,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way)
popq %rbx;
popq %r12;
popq %r13;
- ret;
+ RET;
.L__enc_xor3:
outunpack_enc3(xor);
@@ -266,7 +266,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way)
popq %rbx;
popq %r12;
popq %r13;
- ret;
+ RET;
SYM_FUNC_END(__twofish_enc_blk_3way)
SYM_FUNC_START(twofish_dec_blk_3way)
@@ -301,5 +301,5 @@ SYM_FUNC_START(twofish_dec_blk_3way)
popq %rbx;
popq %r12;
popq %r13;
- ret;
+ RET;
SYM_FUNC_END(twofish_dec_blk_3way)
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index d2e56232494a..775af290cd19 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -252,7 +252,7 @@ SYM_FUNC_START(twofish_enc_blk)
popq R1
movl $1,%eax
- ret
+ RET
SYM_FUNC_END(twofish_enc_blk)
SYM_FUNC_START(twofish_dec_blk)
@@ -304,5 +304,5 @@ SYM_FUNC_START(twofish_dec_blk)
popq R1
movl $1,%eax
- ret
+ RET
SYM_FUNC_END(twofish_dec_blk)
diff --git a/arch/x86/crypto/twofish.h b/arch/x86/crypto/twofish.h
new file mode 100644
index 000000000000..12df400e6d53
--- /dev/null
+++ b/arch/x86/crypto/twofish.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_TWOFISH_H
+#define ASM_X86_TWOFISH_H
+
+#include <linux/crypto.h>
+#include <crypto/twofish.h>
+#include <crypto/b128ops.h>
+
+/* regular block cipher functions from twofish_x86_64 module */
+asmlinkage void twofish_enc_blk(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void twofish_dec_blk(const void *ctx, u8 *dst, const u8 *src);
+
+/* 3-way parallel cipher functions */
+asmlinkage void __twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src,
+ bool xor);
+asmlinkage void twofish_dec_blk_3way(const void *ctx, u8 *dst, const u8 *src);
+
+/* helpers from twofish_x86_64-3way module */
+extern void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src);
+
+#endif /* ASM_X86_TWOFISH_H */
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 2dbc8ce3730e..3eb3440b477a 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -15,9 +15,9 @@
#include <crypto/algapi.h>
#include <crypto/internal/simd.h>
#include <crypto/twofish.h>
-#include <crypto/xts.h>
-#include <asm/crypto/glue_helper.h>
-#include <asm/crypto/twofish.h>
+
+#include "twofish.h"
+#include "ecb_cbc_helpers.h"
#define TWOFISH_PARALLEL_BLOCKS 8
@@ -26,13 +26,6 @@ asmlinkage void twofish_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void twofish_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void twofish_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src);
-asmlinkage void twofish_ctr_8way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-
-asmlinkage void twofish_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-asmlinkage void twofish_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
static int twofish_setkey_skcipher(struct crypto_skcipher *tfm,
const u8 *key, unsigned int keylen)
@@ -45,171 +38,38 @@ static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src)
__twofish_enc_blk_3way(ctx, dst, src, false);
}
-static void twofish_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_enc_blk);
-}
-
-static void twofish_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_dec_blk);
-}
-
-struct twofish_xts_ctx {
- struct twofish_ctx tweak_ctx;
- struct twofish_ctx crypt_ctx;
-};
-
-static int xts_twofish_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = xts_verify_key(tfm, key, keylen);
- if (err)
- return err;
-
- /* first half of xts-key is for crypt */
- err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2);
- if (err)
- return err;
-
- /* second half of xts-key is for tweak */
- return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
-}
-
-static const struct common_glue_ctx twofish_enc = {
- .num_funcs = 3,
- .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .ecb = twofish_ecb_enc_8way }
- }, {
- .num_blocks = 3,
- .fn_u = { .ecb = twofish_enc_blk_3way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = twofish_enc_blk }
- } }
-};
-
-static const struct common_glue_ctx twofish_ctr = {
- .num_funcs = 3,
- .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .ctr = twofish_ctr_8way }
- }, {
- .num_blocks = 3,
- .fn_u = { .ctr = twofish_enc_blk_ctr_3way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = twofish_enc_blk_ctr }
- } }
-};
-
-static const struct common_glue_ctx twofish_enc_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .xts = twofish_xts_enc_8way }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = twofish_xts_enc }
- } }
-};
-
-static const struct common_glue_ctx twofish_dec = {
- .num_funcs = 3,
- .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .ecb = twofish_ecb_dec_8way }
- }, {
- .num_blocks = 3,
- .fn_u = { .ecb = twofish_dec_blk_3way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = twofish_dec_blk }
- } }
-};
-
-static const struct common_glue_ctx twofish_dec_cbc = {
- .num_funcs = 3,
- .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .cbc = twofish_cbc_dec_8way }
- }, {
- .num_blocks = 3,
- .fn_u = { .cbc = twofish_dec_blk_cbc_3way }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = twofish_dec_blk }
- } }
-};
-
-static const struct common_glue_ctx twofish_dec_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .xts = twofish_xts_dec_8way }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = twofish_xts_dec }
- } }
-};
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&twofish_enc, req);
+ ECB_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS);
+ ECB_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_ecb_enc_8way);
+ ECB_BLOCK(3, twofish_enc_blk_3way);
+ ECB_BLOCK(1, twofish_enc_blk);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&twofish_dec, req);
+ ECB_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS);
+ ECB_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_ecb_dec_8way);
+ ECB_BLOCK(3, twofish_dec_blk_3way);
+ ECB_BLOCK(1, twofish_dec_blk);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req);
+ CBC_WALK_START(req, TF_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(twofish_enc_blk);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&twofish_ctr, req);
-}
-
-static int xts_encrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&twofish_enc_xts, req, twofish_enc_blk,
- &ctx->tweak_ctx, &ctx->crypt_ctx, false);
-}
-
-static int xts_decrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&twofish_dec_xts, req, twofish_enc_blk,
- &ctx->tweak_ctx, &ctx->crypt_ctx, true);
+ CBC_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_cbc_dec_8way);
+ CBC_DEC_BLOCK(3, twofish_dec_blk_cbc_3way);
+ CBC_DEC_BLOCK(1, twofish_dec_blk);
+ CBC_WALK_END();
}
static struct skcipher_alg twofish_algs[] = {
@@ -240,35 +100,6 @@ static struct skcipher_alg twofish_algs[] = {
.setkey = twofish_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "__ctr(twofish)",
- .base.cra_driver_name = "__ctr-twofish-avx",
- .base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct twofish_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = TF_MIN_KEY_SIZE,
- .max_keysize = TF_MAX_KEY_SIZE,
- .ivsize = TF_BLOCK_SIZE,
- .chunksize = TF_BLOCK_SIZE,
- .setkey = twofish_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
- }, {
- .base.cra_name = "__xts(twofish)",
- .base.cra_driver_name = "__xts-twofish-avx",
- .base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = TF_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct twofish_xts_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * TF_MIN_KEY_SIZE,
- .max_keysize = 2 * TF_MAX_KEY_SIZE,
- .ivsize = TF_BLOCK_SIZE,
- .setkey = xts_twofish_setkey,
- .encrypt = xts_encrypt,
- .decrypt = xts_decrypt,
},
};
diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c
index 77e06c2da83d..f9c4adc27404 100644
--- a/arch/x86/crypto/twofish_glue.c
+++ b/arch/x86/crypto/twofish_glue.c
@@ -81,18 +81,18 @@ static struct crypto_alg alg = {
}
};
-static int __init init(void)
+static int __init twofish_glue_init(void)
{
return crypto_register_alg(&alg);
}
-static void __exit fini(void)
+static void __exit twofish_glue_fini(void)
{
crypto_unregister_alg(&alg);
}
-module_init(init);
-module_exit(fini);
+module_init(twofish_glue_init);
+module_exit(twofish_glue_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized");
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 768af6075479..90454cf18e0d 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -5,17 +5,16 @@
* Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
*/
-#include <asm/crypto/glue_helper.h>
-#include <asm/crypto/twofish.h>
#include <crypto/algapi.h>
-#include <crypto/b128ops.h>
-#include <crypto/internal/skcipher.h>
#include <crypto/twofish.h>
#include <linux/crypto.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/types.h>
+#include "twofish.h"
+#include "ecb_cbc_helpers.h"
+
EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way);
EXPORT_SYMBOL_GPL(twofish_dec_blk_3way);
@@ -30,143 +29,48 @@ static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src)
__twofish_enc_blk_3way(ctx, dst, src, false);
}
-static inline void twofish_enc_blk_xor_3way(const void *ctx, u8 *dst,
- const u8 *src)
-{
- __twofish_enc_blk_3way(ctx, dst, src, true);
-}
-
-void twofish_dec_blk_cbc_3way(const void *ctx, u8 *d, const u8 *s)
+void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src)
{
- u128 ivs[2];
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
-
- ivs[0] = src[0];
- ivs[1] = src[1];
+ u8 buf[2][TF_BLOCK_SIZE];
+ const u8 *s = src;
- twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
+ if (dst == src)
+ s = memcpy(buf, src, sizeof(buf));
+ twofish_dec_blk_3way(ctx, dst, src);
+ crypto_xor(dst + TF_BLOCK_SIZE, s, sizeof(buf));
- u128_xor(&dst[1], &dst[1], &ivs[0]);
- u128_xor(&dst[2], &dst[2], &ivs[1]);
}
EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);
-void twofish_enc_blk_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
-{
- be128 ctrblk;
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
-
- if (dst != src)
- *dst = *src;
-
- le128_to_be128(&ctrblk, iv);
- le128_inc(iv);
-
- twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
- u128_xor(dst, dst, (u128 *)&ctrblk);
-}
-EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);
-
-void twofish_enc_blk_ctr_3way(const void *ctx, u8 *d, const u8 *s, le128 *iv)
-{
- be128 ctrblks[3];
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
-
- if (dst != src) {
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- }
-
- le128_to_be128(&ctrblks[0], iv);
- le128_inc(iv);
- le128_to_be128(&ctrblks[1], iv);
- le128_inc(iv);
- le128_to_be128(&ctrblks[2], iv);
- le128_inc(iv);
-
- twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);
-}
-EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr_3way);
-
-static const struct common_glue_ctx twofish_enc = {
- .num_funcs = 2,
- .fpu_blocks_limit = -1,
-
- .funcs = { {
- .num_blocks = 3,
- .fn_u = { .ecb = twofish_enc_blk_3way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = twofish_enc_blk }
- } }
-};
-
-static const struct common_glue_ctx twofish_ctr = {
- .num_funcs = 2,
- .fpu_blocks_limit = -1,
-
- .funcs = { {
- .num_blocks = 3,
- .fn_u = { .ctr = twofish_enc_blk_ctr_3way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = twofish_enc_blk_ctr }
- } }
-};
-
-static const struct common_glue_ctx twofish_dec = {
- .num_funcs = 2,
- .fpu_blocks_limit = -1,
-
- .funcs = { {
- .num_blocks = 3,
- .fn_u = { .ecb = twofish_dec_blk_3way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = twofish_dec_blk }
- } }
-};
-
-static const struct common_glue_ctx twofish_dec_cbc = {
- .num_funcs = 2,
- .fpu_blocks_limit = -1,
-
- .funcs = { {
- .num_blocks = 3,
- .fn_u = { .cbc = twofish_dec_blk_cbc_3way }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = twofish_dec_blk }
- } }
-};
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&twofish_enc, req);
+ ECB_WALK_START(req, TF_BLOCK_SIZE, -1);
+ ECB_BLOCK(3, twofish_enc_blk_3way);
+ ECB_BLOCK(1, twofish_enc_blk);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&twofish_dec, req);
+ ECB_WALK_START(req, TF_BLOCK_SIZE, -1);
+ ECB_BLOCK(3, twofish_dec_blk_3way);
+ ECB_BLOCK(1, twofish_dec_blk);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req);
+ CBC_WALK_START(req, TF_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(twofish_enc_blk);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&twofish_ctr, req);
+ CBC_WALK_START(req, TF_BLOCK_SIZE, -1);
+ CBC_DEC_BLOCK(3, twofish_dec_blk_cbc_3way);
+ CBC_DEC_BLOCK(1, twofish_dec_blk);
+ CBC_WALK_END();
}
static struct skcipher_alg tf_skciphers[] = {
@@ -195,20 +99,6 @@ static struct skcipher_alg tf_skciphers[] = {
.setkey = twofish_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "ctr(twofish)",
- .base.cra_driver_name = "ctr-twofish-3way",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct twofish_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = TF_MIN_KEY_SIZE,
- .max_keysize = TF_MAX_KEY_SIZE,
- .ivsize = TF_BLOCK_SIZE,
- .chunksize = TF_BLOCK_SIZE,
- .setkey = twofish_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
},
};
@@ -227,7 +117,7 @@ static bool is_blacklisted_cpu(void)
* storing blocks in 64bit registers to allow three blocks to
* be processed parallel. Parallel operation then allows gaining
* more performance than was trade off, on out-of-order CPUs.
- * However Atom does not benefit from this parallellism and
+ * However Atom does not benefit from this parallelism and
* should be blacklisted.
*/
return true;
@@ -250,7 +140,7 @@ static int force;
module_param(force, int, 0);
MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
-static int __init init(void)
+static int __init twofish_3way_init(void)
{
if (!force && is_blacklisted_cpu()) {
printk(KERN_INFO
@@ -264,13 +154,13 @@ static int __init init(void)
ARRAY_SIZE(tf_skciphers));
}
-static void __exit fini(void)
+static void __exit twofish_3way_fini(void)
{
crypto_unregister_skciphers(tf_skciphers, ARRAY_SIZE(tf_skciphers));
}
-module_init(init);
-module_exit(fini);
+module_init(twofish_3way_init);
+module_exit(twofish_3way_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Twofish Cipher Algorithm, 3-way parallel asm optimized");