Diffstat (limited to 'arch/x86/crypto')
91 files changed, 8939 insertions, 7611 deletions
diff --git a/arch/x86/crypto/.gitignore b/arch/x86/crypto/.gitignore index 30be0400a439..580c839bb177 100644 --- a/arch/x86/crypto/.gitignore +++ b/arch/x86/crypto/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only poly1305-x86_64-cryptogams.S diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig new file mode 100644 index 000000000000..71c4c473d34b --- /dev/null +++ b/arch/x86/crypto/Kconfig @@ -0,0 +1,484 @@ +# SPDX-License-Identifier: GPL-2.0 + +menu "Accelerated Cryptographic Algorithms for CPU (x86)" + +config CRYPTO_CURVE25519_X86 + tristate "Public key crypto: Curve25519 (ADX)" + depends on X86 && 64BIT + select CRYPTO_LIB_CURVE25519_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CURVE25519 + help + Curve25519 algorithm + + Architecture: x86_64 using: + - ADX (large integer arithmetic) + +config CRYPTO_AES_NI_INTEL + tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTR, XTS, GCM (AES-NI)" + depends on X86 + select CRYPTO_AEAD + select CRYPTO_LIB_AES + select CRYPTO_ALGAPI + select CRYPTO_SKCIPHER + select CRYPTO_SIMD + help + Block cipher: AES cipher algorithms + AEAD cipher: AES with GCM + Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XTR, XTS + + Architecture: x86 (32-bit and 64-bit) using: + - AES-NI (AES new instructions) + +config CRYPTO_BLOWFISH_X86_64 + tristate "Ciphers: Blowfish, modes: ECB, CBC" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_BLOWFISH_COMMON + imply CRYPTO_CTR + help + Block cipher: Blowfish cipher algorithm + Length-preserving ciphers: Blowfish with ECB and CBC modes + + Architecture: x86_64 + +config CRYPTO_CAMELLIA_X86_64 + tristate "Ciphers: Camellia with modes: ECB, CBC" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + imply CRYPTO_CTR + help + Block cipher: Camellia cipher algorithms + Length-preserving ciphers: Camellia with ECB and CBC modes + + Architecture: x86_64 + +config CRYPTO_CAMELLIA_AESNI_AVX_X86_64 + tristate "Ciphers: Camellia with modes: ECB, CBC (AES-NI/AVX)" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_CAMELLIA_X86_64 + select CRYPTO_SIMD + imply CRYPTO_XTS + help + Length-preserving ciphers: Camellia with ECB and CBC modes + + Architecture: x86_64 using: + - AES-NI (AES New Instructions) + - AVX (Advanced Vector Extensions) + +config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64 + tristate "Ciphers: Camellia with modes: ECB, CBC (AES-NI/AVX2)" + depends on X86 && 64BIT + select CRYPTO_CAMELLIA_AESNI_AVX_X86_64 + help + Length-preserving ciphers: Camellia with ECB and CBC modes + + Architecture: x86_64 using: + - AES-NI (AES New Instructions) + - AVX2 (Advanced Vector Extensions 2) + +config CRYPTO_CAST5_AVX_X86_64 + tristate "Ciphers: CAST5 with modes: ECB, CBC (AVX)" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_CAST5 + select CRYPTO_CAST_COMMON + select CRYPTO_SIMD + imply CRYPTO_CTR + help + Length-preserving ciphers: CAST5 (CAST-128) cipher algorithm + (RFC2144) with ECB and CBC modes + + Architecture: x86_64 using: + - AVX (Advanced Vector Extensions) + + Processes 16 blocks in parallel. + +config CRYPTO_CAST6_AVX_X86_64 + tristate "Ciphers: CAST6 with modes: ECB, CBC (AVX)" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_CAST6 + select CRYPTO_CAST_COMMON + select CRYPTO_SIMD + imply CRYPTO_XTS + imply CRYPTO_CTR + help + Length-preserving ciphers: CAST6 (CAST-256) cipher algorithm + (RFC2612) with ECB and CBC modes + + Architecture: x86_64 using: + - AVX (Advanced Vector Extensions) + + Processes eight blocks in parallel. 
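The entries above register accelerated skcipher/AEAD drivers that kernel code reaches transparently through the crypto API: a caller asks for an algorithm by name and the core binds the highest-priority implementation available (the AES-NI driver when the CPU supports it, otherwise the generic C code). A minimal sketch of that call pattern, assuming an in-place, physically contiguous buffer and a 16-byte key; the function name, buffer, and key handling are illustrative, not part of this patch:

	#include <crypto/skcipher.h>
	#include <linux/scatterlist.h>
	#include <linux/err.h>

	static int example_cbc_aes_encrypt(u8 *buf, unsigned int len,
					   const u8 *key, u8 *iv)
	{
		struct crypto_skcipher *tfm;
		struct skcipher_request *req;
		struct scatterlist sg;
		DECLARE_CRYPTO_WAIT(wait);
		int err;

		/* "cbc(aes)" resolves to the AES-NI driver when
		 * CRYPTO_AES_NI_INTEL is enabled and usable, otherwise to
		 * the generic implementation. */
		tfm = crypto_alloc_skcipher("cbc(aes)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		err = crypto_skcipher_setkey(tfm, key, 16);
		if (err)
			goto out_free_tfm;

		req = skcipher_request_alloc(tfm, GFP_KERNEL);
		if (!req) {
			err = -ENOMEM;
			goto out_free_tfm;
		}

		sg_init_one(&sg, buf, len);
		skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
					      crypto_req_done, &wait);
		skcipher_request_set_crypt(req, &sg, &sg, len, iv);
		/* Wait for completion; the SIMD drivers may run async. */
		err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);

		skcipher_request_free(req);
	out_free_tfm:
		crypto_free_skcipher(tfm);
		return err;
	}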
+ +config CRYPTO_DES3_EDE_X86_64 + tristate "Ciphers: Triple DES EDE with modes: ECB, CBC" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_LIB_DES + imply CRYPTO_CTR + help + Block cipher: Triple DES EDE (FIPS 46-3) cipher algorithm + Length-preserving ciphers: Triple DES EDE with ECB and CBC modes + + Architecture: x86_64 + + Processes one or three blocks in parallel. + +config CRYPTO_SERPENT_SSE2_X86_64 + tristate "Ciphers: Serpent with modes: ECB, CBC (SSE2)" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_SERPENT + select CRYPTO_SIMD + imply CRYPTO_CTR + help + Length-preserving ciphers: Serpent cipher algorithm + with ECB and CBC modes + + Architecture: x86_64 using: + - SSE2 (Streaming SIMD Extensions 2) + + Processes eight blocks in parallel. + +config CRYPTO_SERPENT_SSE2_586 + tristate "Ciphers: Serpent with modes: ECB, CBC (32-bit with SSE2)" + depends on X86 && !64BIT + select CRYPTO_SKCIPHER + select CRYPTO_SERPENT + select CRYPTO_SIMD + imply CRYPTO_CTR + help + Length-preserving ciphers: Serpent cipher algorithm + with ECB and CBC modes + + Architecture: x86 (32-bit) using: + - SSE2 (Streaming SIMD Extensions 2) + + Processes four blocks in parallel. + +config CRYPTO_SERPENT_AVX_X86_64 + tristate "Ciphers: Serpent with modes: ECB, CBC (AVX)" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_SERPENT + select CRYPTO_SIMD + imply CRYPTO_XTS + imply CRYPTO_CTR + help + Length-preserving ciphers: Serpent cipher algorithm + with ECB and CBC modes + + Architecture: x86_64 using: + - AVX (Advanced Vector Extensions) + + Processes eight blocks in parallel. + +config CRYPTO_SERPENT_AVX2_X86_64 + tristate "Ciphers: Serpent with modes: ECB, CBC (AVX2)" + depends on X86 && 64BIT + select CRYPTO_SERPENT_AVX_X86_64 + help + Length-preserving ciphers: Serpent cipher algorithm + with ECB and CBC modes + + Architecture: x86_64 using: + - AVX2 (Advanced Vector Extensions 2) + + Processes 16 blocks in parallel. + +config CRYPTO_SM4_AESNI_AVX_X86_64 + tristate "Ciphers: SM4 with modes: ECB, CBC, CFB, CTR (AES-NI/AVX)" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_SIMD + select CRYPTO_ALGAPI + select CRYPTO_SM4 + help + Length-preserving ciphers: SM4 cipher algorithms + (OSCCA GB/T 32907-2016) with ECB, CBC, CFB, and CTR modes + + Architecture: x86_64 using: + - AES-NI (AES New Instructions) + - AVX (Advanced Vector Extensions) + + Through two affine transforms, + we can use the AES S-Box to simulate the SM4 S-Box to achieve the + effect of instruction acceleration. + + If unsure, say N. + +config CRYPTO_SM4_AESNI_AVX2_X86_64 + tristate "Ciphers: SM4 with modes: ECB, CBC, CFB, CTR (AES-NI/AVX2)" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_SIMD + select CRYPTO_ALGAPI + select CRYPTO_SM4 + select CRYPTO_SM4_AESNI_AVX_X86_64 + help + Length-preserving ciphers: SM4 cipher algorithms + (OSCCA GB/T 32907-2016) with ECB, CBC, CFB, and CTR modes + + Architecture: x86_64 using: + - AES-NI (AES New Instructions) + - AVX2 (Advanced Vector Extensions 2) + + Through two affine transforms, + we can use the AES S-Box to simulate the SM4 S-Box to achieve the + effect of instruction acceleration. + + If unsure, say N. 
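The Serpent and SM4 entries above add extra implementations of algorithms the generic crypto code already provides; the core chooses between them by driver priority at allocation time. A short sketch, purely for illustration, of how one can confirm which backend actually got bound ("ecb(sm4)" is just one of the algorithms covered above; the function name is made up):

	#include <crypto/skcipher.h>
	#include <linux/printk.h>
	#include <linux/err.h>

	static void example_report_sm4_driver(void)
	{
		struct crypto_skcipher *tfm;

		tfm = crypto_alloc_skcipher("ecb(sm4)", 0, 0);
		if (IS_ERR(tfm))
			return;

		/* The driver name and priority distinguish the generic C
		 * implementation from the AES-NI/AVX one registered by the
		 * modules above. */
		pr_info("ecb(sm4) bound to driver %s (priority %d)\n",
			crypto_tfm_alg_driver_name(crypto_skcipher_tfm(tfm)),
			crypto_tfm_alg_priority(crypto_skcipher_tfm(tfm)));

		crypto_free_skcipher(tfm);
	}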
+ +config CRYPTO_TWOFISH_586 + tristate "Ciphers: Twofish (32-bit)" + depends on (X86 || UML_X86) && !64BIT + select CRYPTO_ALGAPI + select CRYPTO_TWOFISH_COMMON + imply CRYPTO_CTR + help + Block cipher: Twofish cipher algorithm + + Architecture: x86 (32-bit) + +config CRYPTO_TWOFISH_X86_64 + tristate "Ciphers: Twofish" + depends on (X86 || UML_X86) && 64BIT + select CRYPTO_ALGAPI + select CRYPTO_TWOFISH_COMMON + imply CRYPTO_CTR + help + Block cipher: Twofish cipher algorithm + + Architecture: x86_64 + +config CRYPTO_TWOFISH_X86_64_3WAY + tristate "Ciphers: Twofish with modes: ECB, CBC (3-way parallel)" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_TWOFISH_COMMON + select CRYPTO_TWOFISH_X86_64 + help + Length-preserving cipher: Twofish cipher algorithm + with ECB and CBC modes + + Architecture: x86_64 + + Processes three blocks in parallel, better utilizing resources of + out-of-order CPUs. + +config CRYPTO_TWOFISH_AVX_X86_64 + tristate "Ciphers: Twofish with modes: ECB, CBC (AVX)" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_SIMD + select CRYPTO_TWOFISH_COMMON + select CRYPTO_TWOFISH_X86_64 + select CRYPTO_TWOFISH_X86_64_3WAY + imply CRYPTO_XTS + help + Length-preserving cipher: Twofish cipher algorithm + with ECB and CBC modes + + Architecture: x86_64 using: + - AVX (Advanced Vector Extensions) + + Processes eight blocks in parallel. + +config CRYPTO_ARIA_AESNI_AVX_X86_64 + tristate "Ciphers: ARIA with modes: ECB, CTR (AES-NI/AVX/GFNI)" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_SIMD + select CRYPTO_ALGAPI + select CRYPTO_ARIA + help + Length-preserving cipher: ARIA cipher algorithms + (RFC 5794) with ECB and CTR modes + + Architecture: x86_64 using: + - AES-NI (AES New Instructions) + - AVX (Advanced Vector Extensions) + - GFNI (Galois Field New Instructions) + + Processes 16 blocks in parallel. 
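Several of the block-cipher entries above provide only the bare cipher or ECB/CBC and use "imply CRYPTO_CTR": CTR is then built by the generic template wrapping the accelerated primitive, so a composed name still benefits from the assembly code. A hedged sketch of that composition from the caller's side (standard crypto API template syntax; the wrapper function is illustrative):

	#include <crypto/skcipher.h>

	static struct crypto_skcipher *example_alloc_ctr_twofish(void)
	{
		/*
		 * "ctr(twofish)" instantiates the generic CTR template over
		 * whatever "twofish" cipher has the highest priority, e.g.
		 * the x86_64 assembly implementation selected above.
		 */
		return crypto_alloc_skcipher("ctr(twofish)", 0, 0);
	}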
+ +config CRYPTO_CHACHA20_X86_64 + tristate "Ciphers: ChaCha20, XChaCha20, XChaCha12 (SSSE3/AVX2/AVX-512VL)" + depends on X86 && 64BIT + select CRYPTO_SKCIPHER + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + help + Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12 + stream cipher algorithms + + Architecture: x86_64 using: + - SSSE3 (Supplemental SSE3) + - AVX2 (Advanced Vector Extensions 2) + - AVX-512VL (Advanced Vector Extensions-512VL) + +config CRYPTO_AEGIS128_AESNI_SSE2 + tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE2)" + depends on X86 && 64BIT + select CRYPTO_AEAD + select CRYPTO_SIMD + help + AEGIS-128 AEAD algorithm + + Architecture: x86_64 using: + - AES-NI (AES New Instructions) + - SSE2 (Streaming SIMD Extensions 2) + +config CRYPTO_NHPOLY1305_SSE2 + tristate "Hash functions: NHPoly1305 (SSE2)" + depends on X86 && 64BIT + select CRYPTO_NHPOLY1305 + help + NHPoly1305 hash function for Adiantum + + Architecture: x86_64 using: + - SSE2 (Streaming SIMD Extensions 2) + +config CRYPTO_NHPOLY1305_AVX2 + tristate "Hash functions: NHPoly1305 (AVX2)" + depends on X86 && 64BIT + select CRYPTO_NHPOLY1305 + help + NHPoly1305 hash function for Adiantum + + Architecture: x86_64 using: + - AVX2 (Advanced Vector Extensions 2) + +config CRYPTO_BLAKE2S_X86 + bool "Hash functions: BLAKE2s (SSSE3/AVX-512)" + depends on X86 && 64BIT + select CRYPTO_LIB_BLAKE2S_GENERIC + select CRYPTO_ARCH_HAVE_LIB_BLAKE2S + help + BLAKE2s cryptographic hash function (RFC 7693) + + Architecture: x86_64 using: + - SSSE3 (Supplemental SSE3) + - AVX-512 (Advanced Vector Extensions-512) + +config CRYPTO_POLYVAL_CLMUL_NI + tristate "Hash functions: POLYVAL (CLMUL-NI)" + depends on X86 && 64BIT + select CRYPTO_POLYVAL + help + POLYVAL hash function for HCTR2 + + Architecture: x86_64 using: + - CLMUL-NI (carry-less multiplication new instructions) + +config CRYPTO_POLY1305_X86_64 + tristate "Hash functions: Poly1305 (SSE2/AVX2)" + depends on X86 && 64BIT + select CRYPTO_LIB_POLY1305_GENERIC + select CRYPTO_ARCH_HAVE_LIB_POLY1305 + help + Poly1305 authenticator algorithm (RFC7539) + + Architecture: x86_64 using: + - SSE2 (Streaming SIMD Extensions 2) + - AVX2 (Advanced Vector Extensions 2) + +config CRYPTO_SHA1_SSSE3 + tristate "Hash functions: SHA-1 (SSSE3/AVX/AVX2/SHA-NI)" + depends on X86 && 64BIT + select CRYPTO_SHA1 + select CRYPTO_HASH + help + SHA-1 secure hash algorithm (FIPS 180) + + Architecture: x86_64 using: + - SSSE3 (Supplemental SSE3) + - AVX (Advanced Vector Extensions) + - AVX2 (Advanced Vector Extensions 2) + - SHA-NI (SHA Extensions New Instructions) + +config CRYPTO_SHA256_SSSE3 + tristate "Hash functions: SHA-224 and SHA-256 (SSSE3/AVX/AVX2/SHA-NI)" + depends on X86 && 64BIT + select CRYPTO_SHA256 + select CRYPTO_HASH + help + SHA-224 and SHA-256 secure hash algorithms (FIPS 180) + + Architecture: x86_64 using: + - SSSE3 (Supplemental SSE3) + - AVX (Advanced Vector Extensions) + - AVX2 (Advanced Vector Extensions 2) + - SHA-NI (SHA Extensions New Instructions) + +config CRYPTO_SHA512_SSSE3 + tristate "Hash functions: SHA-384 and SHA-512 (SSSE3/AVX/AVX2)" + depends on X86 && 64BIT + select CRYPTO_SHA512 + select CRYPTO_HASH + help + SHA-384 and SHA-512 secure hash algorithms (FIPS 180) + + Architecture: x86_64 using: + - SSSE3 (Supplemental SSE3) + - AVX (Advanced Vector Extensions) + - AVX2 (Advanced Vector Extensions 2) + +config CRYPTO_SM3_AVX_X86_64 + tristate "Hash functions: SM3 (AVX)" + depends on X86 && 64BIT + select CRYPTO_HASH + select CRYPTO_SM3 + help + 
SM3 secure hash function as defined by OSCCA GM/T 0004-2012 SM3 + + Architecture: x86_64 using: + - AVX (Advanced Vector Extensions) + + If unsure, say N. + +config CRYPTO_GHASH_CLMUL_NI_INTEL + tristate "Hash functions: GHASH (CLMUL-NI)" + depends on X86 && 64BIT + select CRYPTO_CRYPTD + help + GCM GHASH hash function (NIST SP800-38D) + + Architecture: x86_64 using: + - CLMUL-NI (carry-less multiplication new instructions) + +config CRYPTO_CRC32C_INTEL + tristate "CRC32c (SSE4.2/PCLMULQDQ)" + depends on X86 + select CRYPTO_HASH + help + CRC32c CRC algorithm with the iSCSI polynomial (RFC 3385 and RFC 3720) + + Architecture: x86 (32-bit and 64-bit) using: + - SSE4.2 (Streaming SIMD Extensions 4.2) CRC32 instruction + - PCLMULQDQ (carry-less multiplication) + +config CRYPTO_CRC32_PCLMUL + tristate "CRC32 (PCLMULQDQ)" + depends on X86 + select CRYPTO_HASH + select CRC32 + help + CRC32 CRC algorithm (IEEE 802.3) + + Architecture: x86 (32-bit and 64-bit) using: + - PCLMULQDQ (carry-less multiplication) + +config CRYPTO_CRCT10DIF_PCLMUL + tristate "CRCT10DIF (PCLMULQDQ)" + depends on X86 && 64BIT && CRC_T10DIF + select CRYPTO_HASH + help + CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF) + + Architecture: x86_64 using: + - PCLMULQDQ (carry-less multiplication) + +endmenu diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 8c2e9eadee8a..3b1d701a4f6c 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -1,130 +1,107 @@ # SPDX-License-Identifier: GPL-2.0 # -# Arch-specific CryptoAPI modules. -# - -OBJECT_FILES_NON_STANDARD := y - -avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) -avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ - $(comma)4)$(comma)%ymm2,yes,no) -avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no) -sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no) -sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no) -adx_supported := $(call as-instr,adox %r10$(comma)%r10,yes,no) - -obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o +# x86 crypto algorithms obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o +twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o +obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o +twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o +obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o +twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o +obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o +twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o + obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o +serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o +obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o +serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o +obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o +serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o +obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o +serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o obj-$(CONFIG_CRYPTO_DES3_EDE_X86_64) += des3_ede-x86_64.o +des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o + obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o +camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o +obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o +camellia-aesni-avx-x86_64-y := 
camellia-aesni-avx-asm_64.o camellia_aesni_avx_glue.o +obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o +camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o + obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o -obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o -obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o +blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o + +obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o +cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o + +obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o +cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o + +obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o +aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o + obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o -obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o +chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha_glue.o +chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o + obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o -obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o +aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o +aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o -obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o -obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o +sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o +sha1-ssse3-$(CONFIG_AS_SHA1_NI) += sha1_ni_asm.o + obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o +sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o +sha256-ssse3-$(CONFIG_AS_SHA256_NI) += sha256_ni_asm.o + obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o -obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o -obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o +sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o -obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o +obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o +libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o -obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o -obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o +obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o +ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o -# These modules require the assembler to support ADX. -ifeq ($(adx_supported),yes) - obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o -endif - -# These modules require assembler to support AVX. -ifeq ($(avx_supported),yes) - obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \ - camellia-aesni-avx-x86_64.o - obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o - obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o - obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o - obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o - obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o -endif - -# These modules require assembler to support AVX2. 
-ifeq ($(avx2_supported),yes) - obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o - obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o -endif +obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o +polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o -twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o -serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o +obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o +crc32c-intel-y := crc32c-intel_glue.o +crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o -des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o -camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o -blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o -twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o -twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o -chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o -serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o +obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o +crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o -aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o +obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o +crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o -nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o -blake2s-x86_64-y := blake2s-core.o blake2s-glue.o +obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o -ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),) targets += poly1305-x86_64-cryptogams.S -endif - -ifeq ($(avx_supported),yes) - camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ - camellia_aesni_avx_glue.o - cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o - cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o - twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o \ - twofish_avx_glue.o - serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o \ - serpent_avx_glue.o -endif - -ifeq ($(avx2_supported),yes) - camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o - chacha-x86_64-y += chacha-avx2-x86_64.o - serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o - - nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o -endif - -ifeq ($(avx512_supported),yes) - chacha-x86_64-y += chacha-avx512vl-x86_64.o -endif -aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o -aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o -ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o -sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o -ifeq ($(avx2_supported),yes) -sha1-ssse3-y += sha1_avx2_x86_64_asm.o -endif -ifeq ($(sha1_ni_supported),yes) -sha1-ssse3-y += sha1_ni_asm.o -endif -crc32c-intel-y := crc32c-intel_glue.o -crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o -crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o -sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o -ifeq ($(sha256_ni_supported),yes) -sha256-ssse3-y += sha256_ni_asm.o -endif -sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o -crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o +obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o +nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o +obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o 
+nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o + +obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o + +obj-$(CONFIG_CRYPTO_SM3_AVX_X86_64) += sm3-avx-x86_64.o +sm3-avx-x86_64-y := sm3-avx-asm_64.o sm3_avx_glue.o + +obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX_X86_64) += sm4-aesni-avx-x86_64.o +sm4-aesni-avx-x86_64-y := sm4-aesni-avx-asm_64.o sm4_aesni_avx_glue.o + +obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX2_X86_64) += sm4-aesni-avx2-x86_64.o +sm4-aesni-avx2-x86_64-y := sm4-aesni-avx2-asm_64.o sm4_aesni_avx2_glue.o + +obj-$(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) += aria-aesni-avx-x86_64.o +aria-aesni-avx-x86_64-y := aria-aesni-avx-asm_64.o aria_aesni_avx_glue.o quiet_cmd_perlasm = PERLASM $@ cmd_perlasm = $(PERL) $< > $@ diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index 51d46d93efbc..b48ddebb4748 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -122,7 +122,7 @@ SYM_FUNC_START_LOCAL(__load_partial) pxor T0, MSG .Lld_partial_8: - ret + RET SYM_FUNC_END(__load_partial) /* @@ -180,7 +180,7 @@ SYM_FUNC_START_LOCAL(__store_partial) mov %r10b, (%r9) .Lst_partial_1: - ret + RET SYM_FUNC_END(__store_partial) /* @@ -225,7 +225,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_init) movdqu STATE4, 0x40(STATEP) FRAME_END - ret + RET SYM_FUNC_END(crypto_aegis128_aesni_init) /* @@ -337,7 +337,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) FRAME_END - ret + RET .Lad_out_1: movdqu STATE4, 0x00(STATEP) @@ -346,7 +346,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) FRAME_END - ret + RET .Lad_out_2: movdqu STATE3, 0x00(STATEP) @@ -355,7 +355,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE1, 0x30(STATEP) movdqu STATE2, 0x40(STATEP) FRAME_END - ret + RET .Lad_out_3: movdqu STATE2, 0x00(STATEP) @@ -364,7 +364,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE0, 0x30(STATEP) movdqu STATE1, 0x40(STATEP) FRAME_END - ret + RET .Lad_out_4: movdqu STATE1, 0x00(STATEP) @@ -373,11 +373,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE4, 0x30(STATEP) movdqu STATE0, 0x40(STATEP) FRAME_END - ret + RET .Lad_out: FRAME_END - ret + RET SYM_FUNC_END(crypto_aegis128_aesni_ad) .macro encrypt_block a s0 s1 s2 s3 s4 i @@ -452,7 +452,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) FRAME_END - ret + RET .Lenc_out_1: movdqu STATE3, 0x00(STATEP) @@ -461,7 +461,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE1, 0x30(STATEP) movdqu STATE2, 0x40(STATEP) FRAME_END - ret + RET .Lenc_out_2: movdqu STATE2, 0x00(STATEP) @@ -470,7 +470,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE0, 0x30(STATEP) movdqu STATE1, 0x40(STATEP) FRAME_END - ret + RET .Lenc_out_3: movdqu STATE1, 0x00(STATEP) @@ -479,7 +479,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE4, 0x30(STATEP) movdqu STATE0, 0x40(STATEP) FRAME_END - ret + RET .Lenc_out_4: movdqu STATE0, 0x00(STATEP) @@ -488,11 +488,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) FRAME_END - ret + RET .Lenc_out: FRAME_END - ret + RET SYM_FUNC_END(crypto_aegis128_aesni_enc) /* @@ -532,7 +532,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc_tail) movdqu STATE3, 0x40(STATEP) FRAME_END - ret + RET SYM_FUNC_END(crypto_aegis128_aesni_enc_tail) .macro decrypt_block a s0 s1 s2 s3 s4 i @@ -606,7 +606,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec) movdqu 
STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) FRAME_END - ret + RET .Ldec_out_1: movdqu STATE3, 0x00(STATEP) @@ -615,7 +615,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE1, 0x30(STATEP) movdqu STATE2, 0x40(STATEP) FRAME_END - ret + RET .Ldec_out_2: movdqu STATE2, 0x00(STATEP) @@ -624,7 +624,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE0, 0x30(STATEP) movdqu STATE1, 0x40(STATEP) FRAME_END - ret + RET .Ldec_out_3: movdqu STATE1, 0x00(STATEP) @@ -633,7 +633,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE4, 0x30(STATEP) movdqu STATE0, 0x40(STATEP) FRAME_END - ret + RET .Ldec_out_4: movdqu STATE0, 0x00(STATEP) @@ -642,11 +642,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) FRAME_END - ret + RET .Ldec_out: FRAME_END - ret + RET SYM_FUNC_END(crypto_aegis128_aesni_dec) /* @@ -696,7 +696,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec_tail) movdqu STATE3, 0x40(STATEP) FRAME_END - ret + RET SYM_FUNC_END(crypto_aegis128_aesni_dec_tail) /* @@ -743,5 +743,5 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) movdqu MSG, (%rsi) FRAME_END - ret + RET SYM_FUNC_END(crypto_aegis128_aesni_final) diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S index ec437db1fa54..2402b9418cd7 100644 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -1,72 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */ /* - * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64) - * - * This is AES128/192/256 CTR mode optimization implementation. It requires - * the support of Intel(R) AESNI and AVX instructions. - * - * This work was inspired by the AES CTR mode optimization published - * in Intel Optimized IPSEC Cryptograhpic library. - * Additional information on it can be found at: - * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972 - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY + * AES CTR mode by8 optimization with AVX instructions. (x86_64) * * Copyright(c) 2014 Intel Corporation. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * * Contact Information: * James Guilford <james.guilford@intel.com> * Sean Gulley <sean.m.gulley@intel.com> * Chandramouli Narayanan <mouli@linux.intel.com> + */ +/* + * This is AES128/192/256 CTR mode optimization implementation. It requires + * the support of Intel(R) AESNI and AVX instructions. * - * BSD LICENSE - * - * Copyright(c) 2014 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * + * This work was inspired by the AES CTR mode optimization published + * in Intel Optimized IPSEC Cryptographic library. + * Additional information on it can be found at: + * https://github.com/intel/intel-ipsec-mb */ #include <linux/linkage.h> -#include <asm/inst.h> #define VMOVDQ vmovdqu +/* + * Note: the "x" prefix in these aliases means "this is an xmm register". The + * alias prefixes have no relation to XCTR where the "X" prefix means "XOR + * counter". + */ #define xdata0 %xmm0 #define xdata1 %xmm1 #define xdata2 %xmm2 @@ -75,8 +36,10 @@ #define xdata5 %xmm5 #define xdata6 %xmm6 #define xdata7 %xmm7 -#define xcounter %xmm8 -#define xbyteswap %xmm9 +#define xcounter %xmm8 // CTR mode only +#define xiv %xmm8 // XCTR mode only +#define xbyteswap %xmm9 // CTR mode only +#define xtmp %xmm9 // XCTR mode only #define xkey0 %xmm10 #define xkey4 %xmm11 #define xkey8 %xmm12 @@ -89,7 +52,7 @@ #define p_keys %rdx #define p_out %rcx #define num_bytes %r8 - +#define counter %r9 // XCTR mode only #define tmp %r10 #define DDQ_DATA 0 #define XDATA 1 @@ -127,10 +90,6 @@ ddq_add_8: /* generate a unique variable for ddq_add_x */ -.macro setddq n - var_ddq_add = ddq_add_\n -.endm - /* generate a unique variable for xmm register */ .macro setxdata n var_xdata = %xmm\n @@ -140,9 +99,7 @@ ddq_add_8: .macro club name, id .altmacro - .if \name == DDQ_DATA - setddq %\id - .elseif \name == XDATA + .if \name == XDATA setxdata %\id .endif .noaltmacro @@ -152,7 +109,7 @@ ddq_add_8: * do_aes num_in_par load_keys key_len * This increments p_in, but not p_out */ -.macro do_aes b, k, key_len +.macro do_aes b, k, key_len, xctr .set by, \b .set load_keys, \k .set klen, \key_len @@ -161,31 +118,48 @@ ddq_add_8: vmovdqa 0*16(p_keys), xkey0 .endif - vpshufb xbyteswap, xcounter, xdata0 - - .set i, 1 - .rept (by - 1) - club DDQ_DATA, i - club XDATA, i - vpaddq var_ddq_add(%rip), xcounter, var_xdata - vptest ddq_low_msk(%rip), var_xdata - jnz 1f - vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata - vpaddq ddq_high_add_1(%rip), xcounter, xcounter - 1: - vpshufb xbyteswap, var_xdata, var_xdata - .set i, (i +1) - .endr + .if \xctr + movq counter, xtmp + .set i, 0 + .rept (by) + club XDATA, i + vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata + .set i, (i +1) + .endr + .set i, 0 + .rept (by) + club XDATA, i + vpxor xiv, 
var_xdata, var_xdata + .set i, (i +1) + .endr + .else + vpshufb xbyteswap, xcounter, xdata0 + .set i, 1 + .rept (by - 1) + club XDATA, i + vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata + vptest ddq_low_msk(%rip), var_xdata + jnz 1f + vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata + vpaddq ddq_high_add_1(%rip), xcounter, xcounter + 1: + vpshufb xbyteswap, var_xdata, var_xdata + .set i, (i +1) + .endr + .endif vmovdqa 1*16(p_keys), xkeyA vpxor xkey0, xdata0, xdata0 - club DDQ_DATA, by - vpaddq var_ddq_add(%rip), xcounter, xcounter - vptest ddq_low_msk(%rip), xcounter - jnz 1f - vpaddq ddq_high_add_1(%rip), xcounter, xcounter - 1: + .if \xctr + add $by, counter + .else + vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter + vptest ddq_low_msk(%rip), xcounter + jnz 1f + vpaddq ddq_high_add_1(%rip), xcounter, xcounter + 1: + .endif .set i, 1 .rept (by - 1) @@ -423,94 +397,99 @@ ddq_add_8: .endr .endm -.macro do_aes_load val, key_len - do_aes \val, 1, \key_len +.macro do_aes_load val, key_len, xctr + do_aes \val, 1, \key_len, \xctr .endm -.macro do_aes_noload val, key_len - do_aes \val, 0, \key_len +.macro do_aes_noload val, key_len, xctr + do_aes \val, 0, \key_len, \xctr .endm /* main body of aes ctr load */ -.macro do_aes_ctrmain key_len +.macro do_aes_ctrmain key_len, xctr cmp $16, num_bytes - jb .Ldo_return2\key_len + jb .Ldo_return2\xctr\key_len - vmovdqa byteswap_const(%rip), xbyteswap - vmovdqu (p_iv), xcounter - vpshufb xbyteswap, xcounter, xcounter + .if \xctr + shr $4, counter + vmovdqu (p_iv), xiv + .else + vmovdqa byteswap_const(%rip), xbyteswap + vmovdqu (p_iv), xcounter + vpshufb xbyteswap, xcounter, xcounter + .endif mov num_bytes, tmp and $(7*16), tmp - jz .Lmult_of_8_blks\key_len + jz .Lmult_of_8_blks\xctr\key_len /* 1 <= tmp <= 7 */ cmp $(4*16), tmp - jg .Lgt4\key_len - je .Leq4\key_len + jg .Lgt4\xctr\key_len + je .Leq4\xctr\key_len -.Llt4\key_len: +.Llt4\xctr\key_len: cmp $(2*16), tmp - jg .Leq3\key_len - je .Leq2\key_len + jg .Leq3\xctr\key_len + je .Leq2\xctr\key_len -.Leq1\key_len: - do_aes_load 1, \key_len +.Leq1\xctr\key_len: + do_aes_load 1, \key_len, \xctr add $(1*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq2\key_len: - do_aes_load 2, \key_len +.Leq2\xctr\key_len: + do_aes_load 2, \key_len, \xctr add $(2*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq3\key_len: - do_aes_load 3, \key_len +.Leq3\xctr\key_len: + do_aes_load 3, \key_len, \xctr add $(3*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq4\key_len: - do_aes_load 4, \key_len +.Leq4\xctr\key_len: + do_aes_load 4, \key_len, \xctr add $(4*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Lgt4\key_len: +.Lgt4\xctr\key_len: cmp $(6*16), tmp - jg .Leq7\key_len - je .Leq6\key_len + jg .Leq7\xctr\key_len + je .Leq6\xctr\key_len -.Leq5\key_len: - do_aes_load 5, \key_len +.Leq5\xctr\key_len: + do_aes_load 5, \key_len, \xctr add $(5*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq6\key_len: - do_aes_load 6, \key_len +.Leq6\xctr\key_len: + do_aes_load 6, \key_len, 
\xctr add $(6*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq7\key_len: - do_aes_load 7, \key_len +.Leq7\xctr\key_len: + do_aes_load 7, \key_len, \xctr add $(7*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Lmult_of_8_blks\key_len: +.Lmult_of_8_blks\xctr\key_len: .if (\key_len != KEY_128) vmovdqa 0*16(p_keys), xkey0 vmovdqa 4*16(p_keys), xkey4 @@ -523,18 +502,20 @@ ddq_add_8: vmovdqa 9*16(p_keys), xkey12 .endif .align 16 -.Lmain_loop2\key_len: +.Lmain_loop2\xctr\key_len: /* num_bytes is a multiple of 8 and >0 */ - do_aes_noload 8, \key_len + do_aes_noload 8, \key_len, \xctr add $(8*16), p_out sub $(8*16), num_bytes - jne .Lmain_loop2\key_len + jne .Lmain_loop2\xctr\key_len -.Ldo_return2\key_len: - /* return updated IV */ - vpshufb xbyteswap, xcounter, xcounter - vmovdqu xcounter, (p_iv) - ret +.Ldo_return2\xctr\key_len: + .if !\xctr + /* return updated IV */ + vpshufb xbyteswap, xcounter, xcounter + vmovdqu xcounter, (p_iv) + .endif + RET .endm /* @@ -546,7 +527,7 @@ ddq_add_8: */ SYM_FUNC_START(aes_ctr_enc_128_avx_by8) /* call the aes main loop */ - do_aes_ctrmain KEY_128 + do_aes_ctrmain KEY_128 0 SYM_FUNC_END(aes_ctr_enc_128_avx_by8) @@ -559,7 +540,7 @@ SYM_FUNC_END(aes_ctr_enc_128_avx_by8) */ SYM_FUNC_START(aes_ctr_enc_192_avx_by8) /* call the aes main loop */ - do_aes_ctrmain KEY_192 + do_aes_ctrmain KEY_192 0 SYM_FUNC_END(aes_ctr_enc_192_avx_by8) @@ -572,6 +553,45 @@ SYM_FUNC_END(aes_ctr_enc_192_avx_by8) */ SYM_FUNC_START(aes_ctr_enc_256_avx_by8) /* call the aes main loop */ - do_aes_ctrmain KEY_256 + do_aes_ctrmain KEY_256 0 SYM_FUNC_END(aes_ctr_enc_256_avx_by8) + +/* + * routine to do AES128 XCTR enc/decrypt "by8" + * XMM registers are clobbered. + * Saving/restoring must be done at a higher level + * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys, + * u8* out, unsigned int num_bytes, unsigned int byte_ctr) + */ +SYM_FUNC_START(aes_xctr_enc_128_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_128 1 + +SYM_FUNC_END(aes_xctr_enc_128_avx_by8) + +/* + * routine to do AES192 XCTR enc/decrypt "by8" + * XMM registers are clobbered. + * Saving/restoring must be done at a higher level + * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys, + * u8* out, unsigned int num_bytes, unsigned int byte_ctr) + */ +SYM_FUNC_START(aes_xctr_enc_192_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_192 1 + +SYM_FUNC_END(aes_xctr_enc_192_avx_by8) + +/* + * routine to do AES256 XCTR enc/decrypt "by8" + * XMM registers are clobbered. 
+ * Saving/restoring must be done at a higher level + * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys, + * u8* out, unsigned int num_bytes, unsigned int byte_ctr) + */ +SYM_FUNC_START(aes_xctr_enc_256_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_256 1 + +SYM_FUNC_END(aes_xctr_enc_256_avx_by8) diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c deleted file mode 100644 index 7b7dc05fa1a4..000000000000 --- a/arch/x86/crypto/aes_glue.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index cad6e1bfa7d5..837c1e0aa021 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -26,7 +26,6 @@ */ #include <linux/linkage.h> -#include <asm/inst.h> #include <asm/frame.h> #include <asm/nospec-branch.h> @@ -44,10 +43,6 @@ #ifdef __x86_64__ # constants in mergeable sections, linker can reorder and merge -.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16 -.align 16 -.Lgf128mul_x_ble_mask: - .octa 0x00000000000000010000000000000087 .section .rodata.cst16.POLY, "aM", @progbits, 16 .align 16 POLY: .octa 0xC2000000000000000000000000000001 @@ -147,7 +142,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff #define CTR %xmm11 #define INC %xmm12 -#define GF128MUL_MASK %xmm10 +#define GF128MUL_MASK %xmm7 #ifdef __x86_64__ #define AREG %rax @@ -201,7 +196,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff mov \SUBKEY, %r12 movdqu (%r12), \TMP3 movdqa SHUF_MASK(%rip), \TMP2 - PSHUFB_XMM \TMP2, \TMP3 + pshufb \TMP2, \TMP3 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) @@ -263,10 +258,10 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv movdqa SHUF_MASK(%rip), %xmm2 - PSHUFB_XMM %xmm2, %xmm0 + pshufb %xmm2, %xmm0 movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv - PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7 movdqu HashKey(%arg2), %xmm13 CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \ @@ -319,7 +314,7 @@ _initial_blocks_\@: # Main loop - Encrypt/Decrypt remaining blocks - cmp $0, %r13 + test %r13, %r13 je _zero_cipher_left_\@ sub $64, %r13 je _four_cipher_left_\@ @@ -347,7 +342,7 @@ _zero_cipher_left_\@: paddd ONE(%rip), %xmm0 # INCR CNT to get Yn movdqu %xmm0, CurCount(%arg2) movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10, %xmm0 + pshufb %xmm10, %xmm0 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) movdqu %xmm0, PBlockEncKey(%arg2) @@ -377,7 +372,7 @@ _large_enough_update_\@: # get the appropriate shuffle mask movdqu (%r12), %xmm2 # shift right 16-r13 bytes - PSHUFB_XMM %xmm2, %xmm1 + pshufb %xmm2, %xmm1 _data_read_\@: lea ALL_F+16(%rip), %r12 @@ -393,12 +388,12 @@ _data_read_\@: .ifc \operation, dec pand %xmm1, %xmm2 movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10 ,%xmm2 + pshufb %xmm10 ,%xmm2 pxor %xmm2, %xmm8 .else movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10,%xmm0 + pshufb %xmm10,%xmm0 pxor %xmm0, %xmm8 .endif @@ -408,17 +403,17 @@ _data_read_\@: # GHASH computation for the last <16 byte block movdqa SHUF_MASK(%rip), %xmm10 # shuffle xmm0 back to output as ciphertext - PSHUFB_XMM %xmm10, %xmm0 + pshufb %xmm10, %xmm0 .endif # Output %r13 bytes - MOVQ_R64_XMM %xmm0, %rax + movq %xmm0, %rax cmp $8, %r13 jle _less_than_8_bytes_left_\@ mov %rax, (%arg3 , %r11, 1) add $8, %r11 psrldq $8, %xmm0 - 
MOVQ_R64_XMM %xmm0, %rax + movq %xmm0, %rax sub $8, %r13 _less_than_8_bytes_left_\@: mov %al, (%arg3, %r11, 1) @@ -438,7 +433,7 @@ _multiple_of_16_bytes_\@: mov PBlockLen(%arg2), %r12 - cmp $0, %r12 + test %r12, %r12 je _partial_done\@ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 @@ -449,7 +444,7 @@ _partial_done\@: movd %r12d, %xmm15 # len(A) in %xmm15 mov InLen(%arg2), %r12 shl $3, %r12 # len(C) in bits (*128) - MOVQ_R64_XMM %r12, %xmm1 + movq %r12, %xmm1 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) @@ -457,7 +452,7 @@ _partial_done\@: GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10, %xmm8 + pshufb %xmm10, %xmm8 movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) @@ -470,12 +465,12 @@ _return_T_\@: cmp $8, %r11 jl _T_4_\@ _T_8_\@: - MOVQ_R64_XMM %xmm0, %rax + movq %xmm0, %rax mov %rax, (%r10) add $8, %r10 sub $8, %r11 psrldq $8, %xmm0 - cmp $0, %r11 + test %r11, %r11 je _return_T_done_\@ _T_4_\@: movd %xmm0, %eax @@ -483,7 +478,7 @@ _T_4_\@: add $4, %r10 sub $4, %r11 psrldq $4, %xmm0 - cmp $0, %r11 + test %r11, %r11 je _return_T_done_\@ _T_123_\@: movd %xmm0, %eax @@ -518,9 +513,9 @@ _return_T_done_\@: pshufd $78, \HK, \TMP3 pxor \GH, \TMP2 # TMP2 = a1+a0 pxor \HK, \TMP3 # TMP3 = b1+b0 - PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1 - PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0 - PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) + pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x00, \HK, \GH # GH = a0*b0 + pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) pxor \GH, \TMP2 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) movdqa \TMP2, \TMP3 @@ -570,7 +565,7 @@ _return_T_done_\@: cmp $8, \DLEN jl _read_lt8_\@ mov (\DPTR), %rax - MOVQ_R64_XMM %rax, \XMMDst + movq %rax, \XMMDst sub $8, \DLEN jz _done_read_partial_block_\@ xor %eax, %eax @@ -579,7 +574,7 @@ _read_next_byte_\@: mov 7(\DPTR, \DLEN, 1), %al dec \DLEN jnz _read_next_byte_\@ - MOVQ_R64_XMM %rax, \XMM1 + movq %rax, \XMM1 pslldq $8, \XMM1 por \XMM1, \XMMDst jmp _done_read_partial_block_\@ @@ -590,7 +585,7 @@ _read_next_byte_lt8_\@: mov -1(\DPTR, \DLEN, 1), %al dec \DLEN jnz _read_next_byte_lt8_\@ - MOVQ_R64_XMM %rax, \XMMDst + movq %rax, \XMMDst _done_read_partial_block_\@: .endm @@ -608,7 +603,7 @@ _done_read_partial_block_\@: jl _get_AAD_rest\@ _get_AAD_blocks\@: movdqu (%r10), \TMP7 - PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data + pshufb %xmm14, \TMP7 # byte-reflect the AAD data pxor \TMP7, \TMP6 GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 add $16, %r10 @@ -620,11 +615,11 @@ _get_AAD_blocks\@: /* read the last <16B of AAD */ _get_AAD_rest\@: - cmp $0, %r11 + test %r11, %r11 je _get_AAD_done\@ READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 - PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data + pshufb %xmm14, \TMP7 # byte-reflect the AAD data pxor \TMP6, \TMP7 GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 movdqu \TMP7, \TMP6 @@ -641,7 +636,7 @@ _get_AAD_done\@: .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ AAD_HASH operation mov PBlockLen(%arg2), %r13 - cmp $0, %r13 + test %r13, %r13 je _partial_block_done_\@ # Leave Macro if no partial blocks # Read in input data without over reading cmp $16, \PLAIN_CYPH_LEN @@ -667,7 +662,7 @@ _data_read_\@: # Finished reading in data # r16-r13 is the number of bytes in plaintext mod 16) add %r13, %r12 movdqu (%r12), %xmm2 # get the 
appropriate shuffle mask - PSHUFB_XMM %xmm2, %xmm9 # shift right r13 bytes + pshufb %xmm2, %xmm9 # shift right r13 bytes .ifc \operation, dec movdqa %xmm1, %xmm3 @@ -689,11 +684,11 @@ _no_extra_mask_1_\@: pand %xmm1, %xmm3 movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10, %xmm3 - PSHUFB_XMM %xmm2, %xmm3 + pshufb %xmm10, %xmm3 + pshufb %xmm2, %xmm3 pxor %xmm3, \AAD_HASH - cmp $0, %r10 + test %r10, %r10 jl _partial_incomplete_1_\@ # GHASH computation for the last <16 Byte block @@ -724,11 +719,11 @@ _no_extra_mask_2_\@: pand %xmm1, %xmm9 movdqa SHUF_MASK(%rip), %xmm1 - PSHUFB_XMM %xmm1, %xmm9 - PSHUFB_XMM %xmm2, %xmm9 + pshufb %xmm1, %xmm9 + pshufb %xmm2, %xmm9 pxor %xmm9, \AAD_HASH - cmp $0, %r10 + test %r10, %r10 jl _partial_incomplete_2_\@ # GHASH computation for the last <16 Byte block @@ -744,11 +739,11 @@ _encode_done_\@: movdqa SHUF_MASK(%rip), %xmm10 # shuffle xmm9 back to output as ciphertext - PSHUFB_XMM %xmm10, %xmm9 - PSHUFB_XMM %xmm2, %xmm9 + pshufb %xmm10, %xmm9 + pshufb %xmm2, %xmm9 .endif # output encrypted Bytes - cmp $0, %r10 + test %r10, %r10 jl _partial_fill_\@ mov %r13, %r12 mov $16, %r13 @@ -759,14 +754,14 @@ _partial_fill_\@: mov \PLAIN_CYPH_LEN, %r13 _count_set_\@: movdqa %xmm9, %xmm0 - MOVQ_R64_XMM %xmm0, %rax + movq %xmm0, %rax cmp $8, %r13 jle _less_than_8_bytes_left_\@ mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) add $8, \DATA_OFFSET psrldq $8, %xmm0 - MOVQ_R64_XMM %xmm0, %rax + movq %xmm0, %rax sub $8, %r13 _less_than_8_bytes_left_\@: movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) @@ -810,7 +805,7 @@ _partial_block_done_\@: .else MOVADQ \XMM0, %xmm\index .endif - PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap + pshufb %xmm14, %xmm\index # perform a 16 byte swap pxor \TMP2, %xmm\index .endr lea 0x10(%arg1),%r10 @@ -821,7 +816,7 @@ _partial_block_done_\@: aes_loop_initial_\@: MOVADQ (%r10),\TMP1 .irpc index, \i_seq - AESENC \TMP1, %xmm\index + aesenc \TMP1, %xmm\index .endr add $16,%r10 sub $1,%eax @@ -829,7 +824,7 @@ aes_loop_initial_\@: MOVADQ (%r10), \TMP1 .irpc index, \i_seq - AESENCLAST \TMP1, %xmm\index # Last Round + aesenclast \TMP1, %xmm\index # Last Round .endr .irpc index, \i_seq movdqu (%arg4 , %r11, 1), \TMP1 @@ -841,7 +836,7 @@ aes_loop_initial_\@: .ifc \operation, dec movdqa \TMP1, %xmm\index .endif - PSHUFB_XMM %xmm14, %xmm\index + pshufb %xmm14, %xmm\index # prepare plaintext/ciphertext for GHASH computation .endr @@ -876,19 +871,19 @@ aes_loop_initial_\@: MOVADQ ONE(%RIP),\TMP1 paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM1 - PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap + pshufb %xmm14, \XMM1 # perform a 16 byte swap paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM2 - PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap + pshufb %xmm14, \XMM2 # perform a 16 byte swap paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM3 - PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap + pshufb %xmm14, \XMM3 # perform a 16 byte swap paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM4 - PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap + pshufb %xmm14, \XMM4 # perform a 16 byte swap MOVADQ 0(%arg1),\TMP1 pxor \TMP1, \XMM1 @@ -897,17 +892,17 @@ aes_loop_initial_\@: pxor \TMP1, \XMM4 .irpc index, 1234 # do 4 rounds movaps 0x10*\index(%arg1), \TMP1 - AESENC \TMP1, \XMM1 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 .endr .irpc index, 56789 # do next 5 rounds movaps 0x10*\index(%arg1), \TMP1 - AESENC \TMP1, \XMM1 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - 
AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 .endr lea 0xa0(%arg1),%r10 mov keysize,%eax @@ -918,7 +913,7 @@ aes_loop_initial_\@: aes_loop_pre_\@: MOVADQ (%r10),\TMP2 .irpc index, 1234 - AESENC \TMP2, %xmm\index + aesenc \TMP2, %xmm\index .endr add $16,%r10 sub $1,%eax @@ -926,10 +921,10 @@ aes_loop_pre_\@: aes_loop_pre_done\@: MOVADQ (%r10), \TMP2 - AESENCLAST \TMP2, \XMM1 - AESENCLAST \TMP2, \XMM2 - AESENCLAST \TMP2, \XMM3 - AESENCLAST \TMP2, \XMM4 + aesenclast \TMP2, \XMM1 + aesenclast \TMP2, \XMM2 + aesenclast \TMP2, \XMM3 + aesenclast \TMP2, \XMM4 movdqu 16*0(%arg4 , %r11 , 1), \TMP1 pxor \TMP1, \XMM1 .ifc \operation, dec @@ -961,12 +956,12 @@ aes_loop_pre_done\@: .endif add $64, %r11 - PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap + pshufb %xmm14, \XMM1 # perform a 16 byte swap pxor \XMMDst, \XMM1 # combine GHASHed value with the corresponding ciphertext - PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap - PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap - PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap + pshufb %xmm14, \XMM2 # perform a 16 byte swap + pshufb %xmm14, \XMM3 # perform a 16 byte swap + pshufb %xmm14, \XMM4 # perform a 16 byte swap _initial_blocks_done\@: @@ -978,7 +973,7 @@ _initial_blocks_done\@: * arg1, %arg3, %arg4 are used as pointers only, not modified * %r11 is the data offset value */ -.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \ +.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM1, \XMM5 @@ -994,7 +989,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pxor \XMM5, \TMP6 paddd ONE(%rip), \XMM0 # INCR CNT movdqu HashKey_4(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 + pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 movdqa \XMM0, \XMM1 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM2 @@ -1002,51 +997,51 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM0, \XMM3 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM4 - PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap - PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 - PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + pshufb %xmm15, \XMM1 # perform a 16 byte swap + pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 + pshufb %xmm15, \XMM2 # perform a 16 byte swap + pshufb %xmm15, \XMM3 # perform a 16 byte swap + pshufb %xmm15, \XMM4 # perform a 16 byte swap pxor (%arg1), \XMM1 pxor (%arg1), \XMM2 pxor (%arg1), \XMM3 pxor (%arg1), \XMM4 movdqu HashKey_4_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) movaps 0x10(%arg1), \TMP1 - AESENC \TMP1, \XMM1 # Round 1 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 # Round 1 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 movaps 0x20(%arg1), \TMP1 - AESENC \TMP1, \XMM1 # Round 2 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 # Round 2 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 movdqa \XMM6, \TMP1 pshufd $78, \XMM6, \TMP2 pxor \XMM6, \TMP2 movdqu HashKey_3(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 movaps 0x30(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 3 - AESENC \TMP3, \XMM2 - AESENC \TMP3, 
\XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 + aesenc \TMP3, \XMM1 # Round 3 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 movaps 0x40(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 4 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 4 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 movdqu HashKey_3_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x50(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 5 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 5 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 pxor \TMP1, \TMP4 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part pxor \XMM6, \XMM5 @@ -1058,25 +1053,25 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation # Multiply TMP5 * HashKey using karatsuba - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x60(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 6 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 + aesenc \TMP3, \XMM1 # Round 6 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 movaps 0x70(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 7 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 7 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 movdqu HashKey_2_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x80(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 8 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 8 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 pxor \TMP1, \TMP4 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part pxor \XMM7, \XMM5 @@ -1089,13 +1084,13 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pshufd $78, \XMM8, \TMP2 pxor \XMM8, \TMP2 movdqu HashKey(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x90(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 9 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 + aesenc \TMP3, \XMM1 # Round 9 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 lea 0xa0(%arg1),%r10 mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 @@ -1105,7 +1100,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation aes_loop_par_enc\@: MOVADQ (%r10),\TMP3 .irpc index, 1234 - AESENC \TMP3, %xmm\index + aesenc \TMP3, %xmm\index .endr add $16,%r10 sub $1,%eax @@ -1113,12 +1108,12 @@ aes_loop_par_enc\@: aes_loop_par_enc_done\@: MOVADQ (%r10), \TMP3 - AESENCLAST \TMP3, \XMM1 # Round 10 - AESENCLAST \TMP3, \XMM2 - AESENCLAST \TMP3, \XMM3 - AESENCLAST \TMP3, \XMM4 + aesenclast \TMP3, \XMM1 # Round 10 + aesenclast \TMP3, \XMM2 + aesenclast \TMP3, \XMM3 + aesenclast \TMP3, \XMM4 movdqu HashKey_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movdqu (%arg4,%r11,1), \TMP3 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK movdqu 
16(%arg4,%r11,1), \TMP3 @@ -1131,10 +1126,10 @@ aes_loop_par_enc_done\@: movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer - PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + pshufb %xmm15, \XMM1 # perform a 16 byte swap + pshufb %xmm15, \XMM2 # perform a 16 byte swap + pshufb %xmm15, \XMM3 # perform a 16 byte swap + pshufb %xmm15, \XMM4 # perform a 16 byte swap pxor \TMP4, \TMP1 pxor \XMM8, \XMM5 @@ -1186,7 +1181,7 @@ aes_loop_par_enc_done\@: * arg1, %arg3, %arg4 are used as pointers only, not modified * %r11 is the data offset value */ -.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \ +.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM1, \XMM5 @@ -1202,7 +1197,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pxor \XMM5, \TMP6 paddd ONE(%rip), \XMM0 # INCR CNT movdqu HashKey_4(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 + pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 movdqa \XMM0, \XMM1 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM2 @@ -1210,51 +1205,51 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM0, \XMM3 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM4 - PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap - PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 - PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + pshufb %xmm15, \XMM1 # perform a 16 byte swap + pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 + pshufb %xmm15, \XMM2 # perform a 16 byte swap + pshufb %xmm15, \XMM3 # perform a 16 byte swap + pshufb %xmm15, \XMM4 # perform a 16 byte swap pxor (%arg1), \XMM1 pxor (%arg1), \XMM2 pxor (%arg1), \XMM3 pxor (%arg1), \XMM4 movdqu HashKey_4_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) movaps 0x10(%arg1), \TMP1 - AESENC \TMP1, \XMM1 # Round 1 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 # Round 1 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 movaps 0x20(%arg1), \TMP1 - AESENC \TMP1, \XMM1 # Round 2 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 # Round 2 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 movdqa \XMM6, \TMP1 pshufd $78, \XMM6, \TMP2 pxor \XMM6, \TMP2 movdqu HashKey_3(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 movaps 0x30(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 3 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 + aesenc \TMP3, \XMM1 # Round 3 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 movaps 0x40(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 4 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 4 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 movdqu HashKey_3_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, 
\TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x50(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 5 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 5 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 pxor \TMP1, \TMP4 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part pxor \XMM6, \XMM5 @@ -1266,25 +1261,25 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation # Multiply TMP5 * HashKey using karatsuba - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x60(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 6 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 + aesenc \TMP3, \XMM1 # Round 6 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 movaps 0x70(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 7 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 7 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 movdqu HashKey_2_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x80(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 8 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 8 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 pxor \TMP1, \TMP4 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part pxor \XMM7, \XMM5 @@ -1297,13 +1292,13 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pshufd $78, \XMM8, \TMP2 pxor \XMM8, \TMP2 movdqu HashKey(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x90(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 9 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 + aesenc \TMP3, \XMM1 # Round 9 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 lea 0xa0(%arg1),%r10 mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 @@ -1313,7 +1308,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation aes_loop_par_dec\@: MOVADQ (%r10),\TMP3 .irpc index, 1234 - AESENC \TMP3, %xmm\index + aesenc \TMP3, %xmm\index .endr add $16,%r10 sub $1,%eax @@ -1321,12 +1316,12 @@ aes_loop_par_dec\@: aes_loop_par_dec_done\@: MOVADQ (%r10), \TMP3 - AESENCLAST \TMP3, \XMM1 # last round - AESENCLAST \TMP3, \XMM2 - AESENCLAST \TMP3, \XMM3 - AESENCLAST \TMP3, \XMM4 + aesenclast \TMP3, \XMM1 # last round + aesenclast \TMP3, \XMM2 + aesenclast \TMP3, \XMM3 + aesenclast \TMP3, \XMM4 movdqu HashKey_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movdqu (%arg4,%r11,1), \TMP3 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer @@ -1343,10 +1338,10 @@ aes_loop_par_dec_done\@: pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer movdqa \TMP3, \XMM4 - PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + pshufb %xmm15, \XMM1 # perform a 16 byte swap + pshufb %xmm15, \XMM2 # perform a 16 byte swap + pshufb %xmm15, \XMM3 # 
perform a 16 byte swap + pshufb %xmm15, \XMM4 # perform a 16 byte swap pxor \TMP4, \TMP1 pxor \XMM8, \XMM5 @@ -1402,10 +1397,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst pshufd $78, \XMM1, \TMP2 pxor \XMM1, \TMP2 movdqu HashKey_4(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 - PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 + pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1 + pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0 movdqu HashKey_4_k(%arg2), \TMP4 - PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movdqa \XMM1, \XMMDst movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 @@ -1415,10 +1410,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst pshufd $78, \XMM2, \TMP2 pxor \XMM2, \TMP2 movdqu HashKey_3(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0 movdqu HashKey_3_k(%arg2), \TMP4 - PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM2, \XMMDst pxor \TMP2, \XMM1 @@ -1430,10 +1425,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst pshufd $78, \XMM3, \TMP2 pxor \XMM3, \TMP2 movdqu HashKey_2(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0 movdqu HashKey_2_k(%arg2), \TMP4 - PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM3, \XMMDst pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 @@ -1443,10 +1438,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst pshufd $78, \XMM4, \TMP2 pxor \XMM4, \TMP2 movdqu HashKey(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0 movdqu HashKey_k(%arg2), \TMP4 - PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM4, \XMMDst pxor \XMM1, \TMP2 @@ -1504,13 +1499,13 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst _esb_loop_\@: MOVADQ (%r10),\TMP1 - AESENC \TMP1,\XMM0 + aesenc \TMP1,\XMM0 add $16,%r10 sub $1,%eax jnz _esb_loop_\@ MOVADQ (%r10),\TMP1 - AESENCLAST \TMP1,\XMM0 + aesenclast \TMP1,\XMM0 .endm /***************************************************************************** * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 
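The hunks above are mechanical conversions: the uppercase AESENC/AESENCLAST/PCLMULQDQ/PSHUFB_XMM wrappers, which emitted these opcodes as raw byte sequences for assemblers that predated the instructions, are replaced by the bare mnemonics that current binutils assembles directly, and the GHASH_4_ENCRYPT_4_PARALLEL_ENC/_DEC macros get lowercase suffixes, apparently so they line up with the lowercase \operation argument. The arithmetic itself is unchanged. For readers following the a1*b1, a0*b0 and (a1+a0)*(b1+b0) comments, the sketch below is a rough, hypothetical C-intrinsics rendering of one Karatsuba carry-less multiply; it is not part of the patch, the names are invented, and it omits the byte reflection and the reduction modulo x^128 + x^7 + x^2 + x + 1 that GHASH also performs.

/*
 * Hypothetical sketch, not kernel code: one Karatsuba 128x128 -> 256-bit
 * carry-less multiply with PCLMULQDQ intrinsics, mirroring the asm's
 * a1*b1 / a0*b0 / (a1+a0)*(b1+b0) steps.  Build with -msse2 -mpclmul.
 */
#include <immintrin.h>

static void clmul128_karatsuba(__m128i a, __m128i b, __m128i *lo, __m128i *hi)
{
	__m128i hh = _mm_clmulepi64_si128(a, b, 0x11);	/* a1*b1 */
	__m128i ll = _mm_clmulepi64_si128(a, b, 0x00);	/* a0*b0 */

	/* the pshufd $78 / pxor pattern: a0^a1 and b0^b1 in the low qwords */
	__m128i ta = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4e));
	__m128i tb = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4e));
	__m128i mid = _mm_clmulepi64_si128(ta, tb, 0x00); /* (a1+a0)*(b1+b0) */

	/* keep only the cross terms a1*b0 ^ a0*b1 (addition is XOR in GF(2)) */
	mid = _mm_xor_si128(mid, _mm_xor_si128(hh, ll));

	/* fold the middle 128 bits into the low and high product halves */
	*lo = _mm_xor_si128(ll, _mm_slli_si128(mid, 8));
	*hi = _mm_xor_si128(hh, _mm_srli_si128(mid, 8));
}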
@@ -1599,7 +1594,7 @@ SYM_FUNC_START(aesni_gcm_dec) GCM_ENC_DEC dec GCM_COMPLETE arg10, arg11 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_dec) @@ -1688,7 +1683,7 @@ SYM_FUNC_START(aesni_gcm_enc) GCM_COMPLETE arg10, arg11 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_enc) /***************************************************************************** @@ -1706,7 +1701,7 @@ SYM_FUNC_START(aesni_gcm_init) FUNC_SAVE GCM_INIT %arg3, %arg4,%arg5, %arg6 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_init) /***************************************************************************** @@ -1721,7 +1716,7 @@ SYM_FUNC_START(aesni_gcm_enc_update) FUNC_SAVE GCM_ENC_DEC enc FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_enc_update) /***************************************************************************** @@ -1736,7 +1731,7 @@ SYM_FUNC_START(aesni_gcm_dec_update) FUNC_SAVE GCM_ENC_DEC dec FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_dec_update) /***************************************************************************** @@ -1751,13 +1746,11 @@ SYM_FUNC_START(aesni_gcm_finalize) FUNC_SAVE GCM_COMPLETE %arg3 %arg4 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_finalize) #endif - -SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128) SYM_FUNC_START_LOCAL(_key_expansion_256a) pshufd $0b11111111, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 @@ -1767,9 +1760,9 @@ SYM_FUNC_START_LOCAL(_key_expansion_256a) pxor %xmm1, %xmm0 movaps %xmm0, (TKEYP) add $0x10, TKEYP - ret + RET SYM_FUNC_END(_key_expansion_256a) -SYM_FUNC_END_ALIAS(_key_expansion_128) +SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a) SYM_FUNC_START_LOCAL(_key_expansion_192a) pshufd $0b01010101, %xmm1, %xmm1 @@ -1792,7 +1785,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_192a) shufps $0b01001110, %xmm2, %xmm1 movaps %xmm1, 0x10(TKEYP) add $0x20, TKEYP - ret + RET SYM_FUNC_END(_key_expansion_192a) SYM_FUNC_START_LOCAL(_key_expansion_192b) @@ -1811,7 +1804,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_192b) movaps %xmm0, (TKEYP) add $0x10, TKEYP - ret + RET SYM_FUNC_END(_key_expansion_192b) SYM_FUNC_START_LOCAL(_key_expansion_256b) @@ -1823,7 +1816,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_256b) pxor %xmm1, %xmm2 movaps %xmm2, (TKEYP) add $0x10, TKEYP - ret + RET SYM_FUNC_END(_key_expansion_256b) /* @@ -1849,72 +1842,72 @@ SYM_FUNC_START(aesni_set_key) movups 0x10(UKEYP), %xmm2 # other user key movaps %xmm2, (TKEYP) add $0x10, TKEYP - AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 + aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 call _key_expansion_256a - AESKEYGENASSIST 0x1 %xmm0 %xmm1 + aeskeygenassist $0x1, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 + aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 call _key_expansion_256a - AESKEYGENASSIST 0x2 %xmm0 %xmm1 + aeskeygenassist $0x2, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 + aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 call _key_expansion_256a - AESKEYGENASSIST 0x4 %xmm0 %xmm1 + aeskeygenassist $0x4, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 + aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 call _key_expansion_256a - AESKEYGENASSIST 0x8 %xmm0 %xmm1 + aeskeygenassist $0x8, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 + aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 call _key_expansion_256a - AESKEYGENASSIST 0x10 %xmm0 %xmm1 + aeskeygenassist $0x10, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 + aeskeygenassist 
$0x20, %xmm2, %xmm1 # round 6 call _key_expansion_256a - AESKEYGENASSIST 0x20 %xmm0 %xmm1 + aeskeygenassist $0x20, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 + aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 call _key_expansion_256a jmp .Ldec_key .Lenc_key192: movq 0x10(UKEYP), %xmm2 # other user key - AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 + aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 call _key_expansion_192a - AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 + aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 call _key_expansion_192b - AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 + aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 call _key_expansion_192a - AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 + aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 call _key_expansion_192b - AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 + aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 call _key_expansion_192a - AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 + aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 call _key_expansion_192b - AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 + aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 call _key_expansion_192a - AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8 + aeskeygenassist $0x80, %xmm2, %xmm1 # round 8 call _key_expansion_192b jmp .Ldec_key .Lenc_key128: - AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1 + aeskeygenassist $0x1, %xmm0, %xmm1 # round 1 call _key_expansion_128 - AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2 + aeskeygenassist $0x2, %xmm0, %xmm1 # round 2 call _key_expansion_128 - AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3 + aeskeygenassist $0x4, %xmm0, %xmm1 # round 3 call _key_expansion_128 - AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4 + aeskeygenassist $0x8, %xmm0, %xmm1 # round 4 call _key_expansion_128 - AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5 + aeskeygenassist $0x10, %xmm0, %xmm1 # round 5 call _key_expansion_128 - AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6 + aeskeygenassist $0x20, %xmm0, %xmm1 # round 6 call _key_expansion_128 - AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7 + aeskeygenassist $0x40, %xmm0, %xmm1 # round 7 call _key_expansion_128 - AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8 + aeskeygenassist $0x80, %xmm0, %xmm1 # round 8 call _key_expansion_128 - AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9 + aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9 call _key_expansion_128 - AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 + aeskeygenassist $0x36, %xmm0, %xmm1 # round 10 call _key_expansion_128 .Ldec_key: sub $0x10, TKEYP @@ -1927,7 +1920,7 @@ SYM_FUNC_START(aesni_set_key) .align 4 .Ldec_key_loop: movaps (KEYP), %xmm0 - AESIMC %xmm0 %xmm1 + aesimc %xmm0, %xmm1 movaps %xmm1, (UKEYP) add $0x10, KEYP sub $0x10, UKEYP @@ -1938,7 +1931,7 @@ SYM_FUNC_START(aesni_set_key) popl KEYP #endif FRAME_END - ret + RET SYM_FUNC_END(aesni_set_key) /* @@ -1962,7 +1955,7 @@ SYM_FUNC_START(aesni_enc) popl KEYP #endif FRAME_END - ret + RET SYM_FUNC_END(aesni_enc) /* @@ -1988,38 +1981,38 @@ SYM_FUNC_START_LOCAL(_aesni_enc1) je .Lenc192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps -0x50(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE .align 4 .Lenc192: movaps -0x40(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps -0x30(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE .align 4 .Lenc128: movaps -0x20(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps -0x10(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps (TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x10(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, 
STATE movaps 0x20(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x30(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x40(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x50(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x60(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x70(TKEYP), KEY - AESENCLAST KEY STATE - ret + aesenclast KEY, STATE + RET SYM_FUNC_END(_aesni_enc1) /* @@ -2054,80 +2047,80 @@ SYM_FUNC_START_LOCAL(_aesni_enc4) je .L4enc192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps -0x50(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 #.align 4 .L4enc192: movaps -0x40(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps -0x30(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 #.align 4 .L4enc128: movaps -0x20(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps -0x10(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps (TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x10(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x20(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x30(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x40(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x50(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x60(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x70(TKEYP), KEY - AESENCLAST KEY STATE1 # last round - AESENCLAST KEY STATE2 - AESENCLAST KEY STATE3 - AESENCLAST KEY STATE4 - ret + aesenclast KEY, STATE1 # last round + aesenclast KEY, STATE2 + aesenclast KEY, STATE3 + aesenclast KEY, STATE4 + RET SYM_FUNC_END(_aesni_enc4) /* @@ -2152,7 +2145,7 @@ SYM_FUNC_START(aesni_dec) popl KEYP #endif FRAME_END - ret + RET SYM_FUNC_END(aesni_dec) /* @@ -2178,38 +2171,38 @@ SYM_FUNC_START_LOCAL(_aesni_dec1) je .Ldec192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, 
STATE movaps -0x50(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE .align 4 .Ldec192: movaps -0x40(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps -0x30(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE .align 4 .Ldec128: movaps -0x20(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps -0x10(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps (TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x10(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x20(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x30(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x40(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x50(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x60(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x70(TKEYP), KEY - AESDECLAST KEY STATE - ret + aesdeclast KEY, STATE + RET SYM_FUNC_END(_aesni_dec1) /* @@ -2244,80 +2237,80 @@ SYM_FUNC_START_LOCAL(_aesni_dec4) je .L4dec192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps -0x50(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 .align 4 .L4dec192: movaps -0x40(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps -0x30(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 .align 4 .L4dec128: movaps -0x20(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps -0x10(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps (TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x10(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x20(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x30(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x40(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x50(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x60(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x70(TKEYP), KEY - AESDECLAST KEY STATE1 # last round - 
AESDECLAST KEY STATE2 - AESDECLAST KEY STATE3 - AESDECLAST KEY STATE4 - ret + aesdeclast KEY, STATE1 # last round + aesdeclast KEY, STATE2 + aesdeclast KEY, STATE3 + aesdeclast KEY, STATE4 + RET SYM_FUNC_END(_aesni_dec4) /* @@ -2377,7 +2370,7 @@ SYM_FUNC_START(aesni_ecb_enc) popl LEN #endif FRAME_END - ret + RET SYM_FUNC_END(aesni_ecb_enc) /* @@ -2438,7 +2431,7 @@ SYM_FUNC_START(aesni_ecb_dec) popl LEN #endif FRAME_END - ret + RET SYM_FUNC_END(aesni_ecb_dec) /* @@ -2482,7 +2475,7 @@ SYM_FUNC_START(aesni_cbc_enc) popl IVP #endif FRAME_END - ret + RET SYM_FUNC_END(aesni_cbc_enc) /* @@ -2575,16 +2568,143 @@ SYM_FUNC_START(aesni_cbc_dec) popl IVP #endif FRAME_END - ret + RET SYM_FUNC_END(aesni_cbc_dec) -#ifdef __x86_64__ +/* + * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, + * size_t len, u8 *iv) + */ +SYM_FUNC_START(aesni_cts_cbc_enc) + FRAME_BEGIN +#ifndef __x86_64__ + pushl IVP + pushl LEN + pushl KEYP + pushl KLEN + movl (FRAME_OFFSET+20)(%esp), KEYP # ctx + movl (FRAME_OFFSET+24)(%esp), OUTP # dst + movl (FRAME_OFFSET+28)(%esp), INP # src + movl (FRAME_OFFSET+32)(%esp), LEN # len + movl (FRAME_OFFSET+36)(%esp), IVP # iv + lea .Lcts_permute_table, T1 +#else + lea .Lcts_permute_table(%rip), T1 +#endif + mov 480(KEYP), KLEN + movups (IVP), STATE + sub $16, LEN + mov T1, IVP + add $32, IVP + add LEN, T1 + sub LEN, IVP + movups (T1), %xmm4 + movups (IVP), %xmm5 + + movups (INP), IN1 + add LEN, INP + movups (INP), IN2 + + pxor IN1, STATE + call _aesni_enc1 + + pshufb %xmm5, IN2 + pxor STATE, IN2 + pshufb %xmm4, STATE + add OUTP, LEN + movups STATE, (LEN) + + movaps IN2, STATE + call _aesni_enc1 + movups STATE, (OUTP) + +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN + popl IVP +#endif + FRAME_END + RET +SYM_FUNC_END(aesni_cts_cbc_enc) + +/* + * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, + * size_t len, u8 *iv) + */ +SYM_FUNC_START(aesni_cts_cbc_dec) + FRAME_BEGIN +#ifndef __x86_64__ + pushl IVP + pushl LEN + pushl KEYP + pushl KLEN + movl (FRAME_OFFSET+20)(%esp), KEYP # ctx + movl (FRAME_OFFSET+24)(%esp), OUTP # dst + movl (FRAME_OFFSET+28)(%esp), INP # src + movl (FRAME_OFFSET+32)(%esp), LEN # len + movl (FRAME_OFFSET+36)(%esp), IVP # iv + lea .Lcts_permute_table, T1 +#else + lea .Lcts_permute_table(%rip), T1 +#endif + mov 480(KEYP), KLEN + add $240, KEYP + movups (IVP), IV + sub $16, LEN + mov T1, IVP + add $32, IVP + add LEN, T1 + sub LEN, IVP + movups (T1), %xmm4 + + movups (INP), STATE + add LEN, INP + movups (INP), IN1 + + call _aesni_dec1 + movaps STATE, IN2 + pshufb %xmm4, STATE + pxor IN1, STATE + + add OUTP, LEN + movups STATE, (LEN) + + movups (IVP), %xmm0 + pshufb %xmm0, IN1 + pblendvb IN2, IN1 + movaps IN1, STATE + call _aesni_dec1 + + pxor IV, STATE + movups STATE, (OUTP) + +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN + popl IVP +#endif + FRAME_END + RET +SYM_FUNC_END(aesni_cts_cbc_dec) + .pushsection .rodata .align 16 +.Lcts_permute_table: + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 +#ifdef __x86_64__ .Lbswap_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +#endif .popsection +#ifdef __x86_64__ /* * _aesni_inc_init: internal ABI * setup registers used by _aesni_inc @@ -2599,11 +2719,11 @@ 
SYM_FUNC_END(aesni_cbc_dec) SYM_FUNC_START_LOCAL(_aesni_inc_init) movaps .Lbswap_mask, BSWAP_MASK movaps IV, CTR - PSHUFB_XMM BSWAP_MASK CTR + pshufb BSWAP_MASK, CTR mov $1, TCTR_LOW - MOVQ_R64_XMM TCTR_LOW INC - MOVQ_R64_XMM CTR TCTR_LOW - ret + movq TCTR_LOW, INC + movq CTR, TCTR_LOW + RET SYM_FUNC_END(_aesni_inc_init) /* @@ -2630,8 +2750,8 @@ SYM_FUNC_START_LOCAL(_aesni_inc) psrldq $8, INC .Linc_low: movaps CTR, IV - PSHUFB_XMM BSWAP_MASK IV - ret + pshufb BSWAP_MASK, IV + RET SYM_FUNC_END(_aesni_inc) /* @@ -2694,9 +2814,17 @@ SYM_FUNC_START(aesni_ctr_enc) movups IV, (IVP) .Lctr_enc_just_ret: FRAME_END - ret + RET SYM_FUNC_END(aesni_ctr_enc) +#endif + +.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16 +.align 16 +.Lgf128mul_x_ble_mask: + .octa 0x00000000000000010000000000000087 +.previous + /* * _aesni_gf128mul_x_ble: internal ABI * Multiply in GF(2^128) for XTS IVs @@ -2709,120 +2837,325 @@ SYM_FUNC_END(aesni_ctr_enc) * CTR: == temporary value */ #define _aesni_gf128mul_x_ble() \ - pshufd $0x13, IV, CTR; \ + pshufd $0x13, IV, KEY; \ paddq IV, IV; \ - psrad $31, CTR; \ - pand GF128MUL_MASK, CTR; \ - pxor CTR, IV; + psrad $31, KEY; \ + pand GF128MUL_MASK, KEY; \ + pxor KEY, IV; /* - * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst, - * const u8 *src, bool enc, le128 *iv) + * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst, + * const u8 *src, unsigned int len, le128 *iv) */ -SYM_FUNC_START(aesni_xts_crypt8) +SYM_FUNC_START(aesni_xts_encrypt) FRAME_BEGIN - cmpb $0, %cl - movl $0, %ecx - movl $240, %r10d - leaq _aesni_enc4, %r11 - leaq _aesni_dec4, %rax - cmovel %r10d, %ecx - cmoveq %rax, %r11 - +#ifndef __x86_64__ + pushl IVP + pushl LEN + pushl KEYP + pushl KLEN + movl (FRAME_OFFSET+20)(%esp), KEYP # ctx + movl (FRAME_OFFSET+24)(%esp), OUTP # dst + movl (FRAME_OFFSET+28)(%esp), INP # src + movl (FRAME_OFFSET+32)(%esp), LEN # len + movl (FRAME_OFFSET+36)(%esp), IVP # iv movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK +#else + movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK +#endif movups (IVP), IV mov 480(KEYP), KLEN - addq %rcx, KEYP + +.Lxts_enc_loop4: + sub $64, LEN + jl .Lxts_enc_1x movdqa IV, STATE1 - movdqu 0x00(INP), INC - pxor INC, STATE1 + movdqu 0x00(INP), IN + pxor IN, STATE1 movdqu IV, 0x00(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE2 - movdqu 0x10(INP), INC - pxor INC, STATE2 + movdqu 0x10(INP), IN + pxor IN, STATE2 movdqu IV, 0x10(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE3 - movdqu 0x20(INP), INC - pxor INC, STATE3 + movdqu 0x20(INP), IN + pxor IN, STATE3 movdqu IV, 0x20(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE4 - movdqu 0x30(INP), INC - pxor INC, STATE4 + movdqu 0x30(INP), IN + pxor IN, STATE4 movdqu IV, 0x30(OUTP) - CALL_NOSPEC %r11 + call _aesni_enc4 - movdqu 0x00(OUTP), INC - pxor INC, STATE1 + movdqu 0x00(OUTP), IN + pxor IN, STATE1 movdqu STATE1, 0x00(OUTP) + movdqu 0x10(OUTP), IN + pxor IN, STATE2 + movdqu STATE2, 0x10(OUTP) + + movdqu 0x20(OUTP), IN + pxor IN, STATE3 + movdqu STATE3, 0x20(OUTP) + + movdqu 0x30(OUTP), IN + pxor IN, STATE4 + movdqu STATE4, 0x30(OUTP) + _aesni_gf128mul_x_ble() - movdqa IV, STATE1 - movdqu 0x40(INP), INC - pxor INC, STATE1 - movdqu IV, 0x40(OUTP) - movdqu 0x10(OUTP), INC - pxor INC, STATE2 - movdqu STATE2, 0x10(OUTP) + add $64, INP + add $64, OUTP + test LEN, LEN + jnz .Lxts_enc_loop4 + +.Lxts_enc_ret_iv: + movups IV, (IVP) +.Lxts_enc_ret: +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN + popl IVP +#endif + FRAME_END + RET + +.Lxts_enc_1x: + add $64, LEN + jz 
.Lxts_enc_ret_iv + sub $16, LEN + jl .Lxts_enc_cts4 + +.Lxts_enc_loop1: + movdqu (INP), STATE + pxor IV, STATE + call _aesni_enc1 + pxor IV, STATE _aesni_gf128mul_x_ble() - movdqa IV, STATE2 - movdqu 0x50(INP), INC - pxor INC, STATE2 - movdqu IV, 0x50(OUTP) - movdqu 0x20(OUTP), INC - pxor INC, STATE3 - movdqu STATE3, 0x20(OUTP) + test LEN, LEN + jz .Lxts_enc_out + + add $16, INP + sub $16, LEN + jl .Lxts_enc_cts1 + + movdqu STATE, (OUTP) + add $16, OUTP + jmp .Lxts_enc_loop1 + +.Lxts_enc_out: + movdqu STATE, (OUTP) + jmp .Lxts_enc_ret_iv + +.Lxts_enc_cts4: + movdqa STATE4, STATE + sub $16, OUTP + +.Lxts_enc_cts1: +#ifndef __x86_64__ + lea .Lcts_permute_table, T1 +#else + lea .Lcts_permute_table(%rip), T1 +#endif + add LEN, INP /* rewind input pointer */ + add $16, LEN /* # bytes in final block */ + movups (INP), IN1 + + mov T1, IVP + add $32, IVP + add LEN, T1 + sub LEN, IVP + add OUTP, LEN + + movups (T1), %xmm4 + movaps STATE, IN2 + pshufb %xmm4, STATE + movups STATE, (LEN) + + movups (IVP), %xmm0 + pshufb %xmm0, IN1 + pblendvb IN2, IN1 + movaps IN1, STATE + + pxor IV, STATE + call _aesni_enc1 + pxor IV, STATE + + movups STATE, (OUTP) + jmp .Lxts_enc_ret +SYM_FUNC_END(aesni_xts_encrypt) + +/* + * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst, + * const u8 *src, unsigned int len, le128 *iv) + */ +SYM_FUNC_START(aesni_xts_decrypt) + FRAME_BEGIN +#ifndef __x86_64__ + pushl IVP + pushl LEN + pushl KEYP + pushl KLEN + movl (FRAME_OFFSET+20)(%esp), KEYP # ctx + movl (FRAME_OFFSET+24)(%esp), OUTP # dst + movl (FRAME_OFFSET+28)(%esp), INP # src + movl (FRAME_OFFSET+32)(%esp), LEN # len + movl (FRAME_OFFSET+36)(%esp), IVP # iv + movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK +#else + movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK +#endif + movups (IVP), IV + + mov 480(KEYP), KLEN + add $240, KEYP + + test $15, LEN + jz .Lxts_dec_loop4 + sub $16, LEN + +.Lxts_dec_loop4: + sub $64, LEN + jl .Lxts_dec_1x + + movdqa IV, STATE1 + movdqu 0x00(INP), IN + pxor IN, STATE1 + movdqu IV, 0x00(OUTP) _aesni_gf128mul_x_ble() - movdqa IV, STATE3 - movdqu 0x60(INP), INC - pxor INC, STATE3 - movdqu IV, 0x60(OUTP) + movdqa IV, STATE2 + movdqu 0x10(INP), IN + pxor IN, STATE2 + movdqu IV, 0x10(OUTP) - movdqu 0x30(OUTP), INC - pxor INC, STATE4 - movdqu STATE4, 0x30(OUTP) + _aesni_gf128mul_x_ble() + movdqa IV, STATE3 + movdqu 0x20(INP), IN + pxor IN, STATE3 + movdqu IV, 0x20(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE4 - movdqu 0x70(INP), INC - pxor INC, STATE4 - movdqu IV, 0x70(OUTP) + movdqu 0x30(INP), IN + pxor IN, STATE4 + movdqu IV, 0x30(OUTP) + + call _aesni_dec4 + + movdqu 0x00(OUTP), IN + pxor IN, STATE1 + movdqu STATE1, 0x00(OUTP) + + movdqu 0x10(OUTP), IN + pxor IN, STATE2 + movdqu STATE2, 0x10(OUTP) + + movdqu 0x20(OUTP), IN + pxor IN, STATE3 + movdqu STATE3, 0x20(OUTP) + + movdqu 0x30(OUTP), IN + pxor IN, STATE4 + movdqu STATE4, 0x30(OUTP) _aesni_gf128mul_x_ble() + + add $64, INP + add $64, OUTP + test LEN, LEN + jnz .Lxts_dec_loop4 + +.Lxts_dec_ret_iv: movups IV, (IVP) - CALL_NOSPEC %r11 +.Lxts_dec_ret: +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN + popl IVP +#endif + FRAME_END + RET - movdqu 0x40(OUTP), INC - pxor INC, STATE1 - movdqu STATE1, 0x40(OUTP) +.Lxts_dec_1x: + add $64, LEN + jz .Lxts_dec_ret_iv - movdqu 0x50(OUTP), INC - pxor INC, STATE2 - movdqu STATE2, 0x50(OUTP) +.Lxts_dec_loop1: + movdqu (INP), STATE - movdqu 0x60(OUTP), INC - pxor INC, STATE3 - movdqu STATE3, 0x60(OUTP) + add $16, INP + sub $16, LEN + jl .Lxts_dec_cts1 - movdqu 0x70(OUTP), INC - pxor INC, 
STATE4 - movdqu STATE4, 0x70(OUTP) + pxor IV, STATE + call _aesni_dec1 + pxor IV, STATE + _aesni_gf128mul_x_ble() - FRAME_END - ret -SYM_FUNC_END(aesni_xts_crypt8) + test LEN, LEN + jz .Lxts_dec_out + + movdqu STATE, (OUTP) + add $16, OUTP + jmp .Lxts_dec_loop1 + +.Lxts_dec_out: + movdqu STATE, (OUTP) + jmp .Lxts_dec_ret_iv + +.Lxts_dec_cts1: + movdqa IV, STATE4 + _aesni_gf128mul_x_ble() + pxor IV, STATE + call _aesni_dec1 + pxor IV, STATE + +#ifndef __x86_64__ + lea .Lcts_permute_table, T1 +#else + lea .Lcts_permute_table(%rip), T1 #endif + add LEN, INP /* rewind input pointer */ + add $16, LEN /* # bytes in final block */ + movups (INP), IN1 + + mov T1, IVP + add $32, IVP + add LEN, T1 + sub LEN, IVP + add OUTP, LEN + + movups (T1), %xmm4 + movaps STATE, IN2 + pshufb %xmm4, STATE + movups STATE, (LEN) + + movups (IVP), %xmm0 + pshufb %xmm0, IN1 + pblendvb IN2, IN1 + movaps IN1, STATE + + pxor STATE4, STATE + call _aesni_dec1 + pxor STATE4, STATE + + movups STATE, (OUTP) + jmp .Lxts_dec_ret +SYM_FUNC_END(aesni_xts_decrypt) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index bfa1c0b3e5b4..0852ab573fd3 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -120,7 +120,6 @@ ## #include <linux/linkage.h> -#include <asm/inst.h> # constants in mergeable sections, linker can reorder and merge .section .rodata.cst16.POLY, "aM", @progbits, 16 @@ -213,10 +212,6 @@ HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsu #define arg4 %rcx #define arg5 %r8 #define arg6 %r9 -#define arg7 STACK_OFFSET+8*1(%r14) -#define arg8 STACK_OFFSET+8*2(%r14) -#define arg9 STACK_OFFSET+8*3(%r14) -#define arg10 STACK_OFFSET+8*4(%r14) #define keysize 2*15*16(arg1) i = 0 @@ -238,9 +233,6 @@ define_reg j %j .noaltmacro .endm -# need to push 4 registers into stack to maintain -STACK_OFFSET = 8*4 - TMP1 = 16*0 # Temporary storage for AAD TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) TMP3 = 16*2 # Temporary storage for AES State 3 @@ -257,25 +249,22 @@ VARIABLE_OFFSET = 16*8 ################################ .macro FUNC_SAVE - #the number of pushes must equal STACK_OFFSET push %r12 push %r13 - push %r14 push %r15 - mov %rsp, %r14 - - + push %rbp + mov %rsp, %rbp sub $VARIABLE_OFFSET, %rsp and $~63, %rsp # align rsp to 64 bytes .endm .macro FUNC_RESTORE - mov %r14, %rsp + mov %rbp, %rsp + pop %rbp pop %r15 - pop %r14 pop %r13 pop %r12 .endm @@ -295,7 +284,7 @@ VARIABLE_OFFSET = 16*8 # combined for GCM encrypt and decrypt functions # clobbering all xmm registers -# clobbering r10, r11, r12, r13, r14, r15 +# clobbering r10, r11, r12, r13, r15, rax .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP vmovdqu AadHash(arg2), %xmm8 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey @@ -370,7 +359,7 @@ _initial_num_blocks_is_0\@: _initial_blocks_encrypted\@: - cmp $0, %r13 + test %r13, %r13 je _zero_cipher_left\@ sub $128, %r13 @@ -529,7 +518,7 @@ _multiple_of_16_bytes\@: vmovdqu HashKey(arg2), %xmm13 mov PBlockLen(arg2), %r12 - cmp $0, %r12 + test %r12, %r12 je _partial_done\@ #GHASH computation for the last <16 Byte block @@ -574,7 +563,7 @@ _T_8\@: add $8, %r10 sub $8, %r11 vpsrldq $8, %xmm9, %xmm9 - cmp $0, %r11 + test %r11, %r11 je _return_T_done\@ _T_4\@: vmovd %xmm9, %eax @@ -582,7 +571,7 @@ _T_4\@: add $4, %r10 sub $4, %r11 vpsrldq $4, %xmm9, %xmm9 - cmp $0, %r11 + test %r11, %r11 je _return_T_done\@ _T_123\@: vmovd %xmm9, 
%eax @@ -626,7 +615,7 @@ _get_AAD_blocks\@: cmp $16, %r11 jge _get_AAD_blocks\@ vmovdqu \T8, \T7 - cmp $0, %r11 + test %r11, %r11 je _get_AAD_done\@ vpxor \T7, \T7, \T7 @@ -645,7 +634,7 @@ _get_AAD_rest8\@: vpxor \T1, \T7, \T7 jmp _get_AAD_rest8\@ _get_AAD_rest4\@: - cmp $0, %r11 + test %r11, %r11 jle _get_AAD_rest0\@ mov (%r10), %eax movq %rax, \T1 @@ -750,7 +739,7 @@ _done_read_partial_block_\@: .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ AAD_HASH ENC_DEC mov PBlockLen(arg2), %r13 - cmp $0, %r13 + test %r13, %r13 je _partial_block_done_\@ # Leave Macro if no partial blocks # Read in input data without over reading cmp $16, \PLAIN_CYPH_LEN @@ -802,7 +791,7 @@ _no_extra_mask_1_\@: vpshufb %xmm2, %xmm3, %xmm3 vpxor %xmm3, \AAD_HASH, \AAD_HASH - cmp $0, %r10 + test %r10, %r10 jl _partial_incomplete_1_\@ # GHASH computation for the last <16 Byte block @@ -837,7 +826,7 @@ _no_extra_mask_2_\@: vpshufb %xmm2, %xmm9, %xmm9 vpxor %xmm9, \AAD_HASH, \AAD_HASH - cmp $0, %r10 + test %r10, %r10 jl _partial_incomplete_2_\@ # GHASH computation for the last <16 Byte block @@ -857,7 +846,7 @@ _encode_done_\@: vpshufb %xmm2, %xmm9, %xmm9 .endif # output encrypted Bytes - cmp $0, %r10 + test %r10, %r10 jl _partial_fill_\@ mov %r13, %r12 mov $16, %r13 @@ -886,7 +875,6 @@ _less_than_8_bytes_left_\@: _partial_block_done_\@: .endm # PARTIAL_BLOCK -#ifdef CONFIG_AS_AVX ############################################################################### # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) # Input: A and B (128-bits each, bit-reflected) @@ -998,7 +986,7 @@ _partial_block_done_\@: ## num_initial_blocks = b mod 4# ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext ## r10, r11, r12, rax are clobbered -## arg1, arg3, arg4, r14 are used as a pointer only, not modified +## arg1, arg2, arg3, arg4 are used as pointers only, not modified .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC i = (8-\num_initial_blocks) @@ -1233,7 +1221,7 @@ _initial_blocks_done\@: # encrypt 8 blocks at a time # ghash the 8 previously encrypted ciphertext blocks -# arg1, arg3, arg4 are used as pointers only, not modified +# arg1, arg2, arg3, arg4 are used as pointers only, not modified # r11 is the data offset value .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC @@ -1779,7 +1767,7 @@ SYM_FUNC_START(aesni_gcm_init_avx_gen2) FUNC_SAVE INIT GHASH_MUL_AVX, PRECOMPUTE_AVX FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_init_avx_gen2) ############################################################################### @@ -1800,15 +1788,15 @@ SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2) # must be 192 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11 FUNC_RESTORE - ret + RET key_128_enc_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9 FUNC_RESTORE - ret + RET key_256_enc_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2) ############################################################################### @@ -1829,15 +1817,15 @@ SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2) # must be 192 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, 
GHASH_MUL_AVX, DEC, 11 FUNC_RESTORE - ret + RET key_128_dec_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9 FUNC_RESTORE - ret + RET key_256_dec_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2) ############################################################################### @@ -1858,20 +1846,17 @@ SYM_FUNC_START(aesni_gcm_finalize_avx_gen2) # must be 192 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4 FUNC_RESTORE - ret + RET key_128_finalize: GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4 FUNC_RESTORE - ret + RET key_256_finalize: GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) -#endif /* CONFIG_AS_AVX */ - -#ifdef CONFIG_AS_AVX2 ############################################################################### # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) # Input: A and B (128-bits each, bit-reflected) @@ -1949,7 +1934,7 @@ SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) ## num_initial_blocks = b mod 4# ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext ## r10, r11, r12, rax are clobbered -## arg1, arg3, arg4, r14 are used as a pointer only, not modified +## arg1, arg2, arg3, arg4 are used as pointers only, not modified .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER i = (8-\num_initial_blocks) @@ -2191,7 +2176,7 @@ _initial_blocks_done\@: # encrypt 8 blocks at a time # ghash the 8 previously encrypted ciphertext blocks -# arg1, arg3, arg4 are used as pointers only, not modified +# arg1, arg2, arg3, arg4 are used as pointers only, not modified # r11 is the data offset value .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC @@ -2750,7 +2735,7 @@ SYM_FUNC_START(aesni_gcm_init_avx_gen4) FUNC_SAVE INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_init_avx_gen4) ############################################################################### @@ -2771,15 +2756,15 @@ SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4) # must be 192 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 FUNC_RESTORE - ret + RET key_128_enc_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9 FUNC_RESTORE - ret + RET key_256_enc_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4) ############################################################################### @@ -2800,15 +2785,15 @@ SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4) # must be 192 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 FUNC_RESTORE - ret + RET key_128_dec_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9 FUNC_RESTORE - ret + RET key_256_dec_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4) 
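In these AVX and AVX2 hunks the changes are again largely mechanical: cmp $0, reg becomes test reg, reg (shorter encoding, identical ZF/SF for the je/jl that follow), the FUNC_SAVE/FUNC_RESTORE prologue moves from saving %rsp in %r14 to a conventional %rbp frame, and every bare ret becomes the RET macro so the build can substitute a return thunk where the CPU mitigations call for one. The only thing that distinguishes the three GCM_ENC_DEC and GCM_COMPLETE call sites in each function is the AES round count; the hypothetical helper below (not part of the patch) spells out where the 9/11/13 values come from.

/*
 * Hypothetical helper, not in the patch: the 9/11/13 arguments above are
 * the number of full aesenc rounds, i.e. the standard AES round count for
 * the key size minus the final aesenclast round (the initial AddRoundKey
 * is a plain XOR with the first round key).
 */
static int aes_full_rounds(unsigned int key_length_bytes)
{
	switch (key_length_bytes) {
	case 16: return 10 - 1;		/* AES-128 */
	case 24: return 12 - 1;		/* AES-192 */
	case 32: return 14 - 1;		/* AES-256 */
	default: return -1;		/* unsupported key size */
	}
}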
############################################################################### @@ -2829,15 +2814,13 @@ SYM_FUNC_START(aesni_gcm_finalize_avx_gen4) # must be 192 GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4 FUNC_RESTORE - ret + RET key_128_finalize4: GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4 FUNC_RESTORE - ret + RET key_256_finalize4: GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_finalize_avx_gen4) - -#endif /* CONFIG_AS_AVX2 */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index bbbebbd35b5d..a5b0cb3efeba 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -31,11 +31,10 @@ #include <crypto/internal/aead.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> +#include <linux/jump_label.h> #include <linux/workqueue.h> #include <linux/spinlock.h> -#ifdef CONFIG_X86_64 -#include <asm/crypto/glue_helper.h> -#endif +#include <linux/static_call.h> #define AESNI_ALIGN 16 @@ -93,62 +92,25 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); +asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); +asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); #define AVX_GEN2_OPTSIZE 640 #define AVX_GEN4_OPTSIZE 4096 +asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); + +asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); + #ifdef CONFIG_X86_64 -static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, unsigned int len, u8 *iv); asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); - -asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, bool enc, le128 *iv); - -/* asmlinkage void aesni_gcm_enc() - * void *ctx, AES Key schedule. Starts on a 16 byte boundary. - * struct gcm_context_data. May be uninitialized. - * u8 *out, Ciphertext output. Encrypt in-place is allowed. - * const u8 *in, Plaintext input - * unsigned long plaintext_len, Length of data in bytes for encryption. - * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001. - * 16-byte aligned pointer. - * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. - * const u8 *aad, Additional Authentication Data (AAD) - * unsigned long aad_len, Length of AAD in bytes. - * u8 *auth_tag, Authenticated Tag output. - * unsigned long auth_tag_len), Authenticated Tag Length in bytes. - * Valid values are 16 (most likely), 12 or 8. - */ -asmlinkage void aesni_gcm_enc(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); - -/* asmlinkage void aesni_gcm_dec() - * void *ctx, AES Key schedule. Starts on a 16 byte boundary. - * struct gcm_context_data. May be uninitialized. - * u8 *out, Plaintext output. Decrypt in-place is allowed. - * const u8 *in, Ciphertext input - * unsigned long ciphertext_len, Length of data in bytes for decryption. 
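The glue-code hunks around this point retire the runtime dispatch machinery in aesni-intel_glue.c: the aesni_ctr_enc_tfm function pointer and the aesni_gcm_tfm_s operation table give way to a static call plus two static keys (gcm_use_avx, gcm_use_avx2), and the monolithic aesni_gcm_enc/aesni_gcm_dec declarations, whose parameter documentation is being deleted here, are no longer referenced from the glue code, which sticks to the init/update/finalize scatter-gather entry points. The fragment below is a rough, self-contained illustration of the static_call() pattern with invented names; it is not the patch's code.

/*
 * Hypothetical sketch (invented names, not the kernel's actual glue code)
 * of the static_call() conversion: the call target is patched once at
 * init time, so the hot path pays a direct call instead of an indirect,
 * possibly retpolined, one through a function pointer.
 */
#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/module.h>
#include <linux/static_call.h>
#include <linux/types.h>
#include <asm/cpufeature.h>

/* stand-ins for the asm CTR implementations, provided elsewhere */
asmlinkage void my_ctr_enc_sse(void *ctx, u8 *out, const u8 *in,
			       unsigned int len, u8 *iv);
asmlinkage void my_ctr_enc_avx(void *ctx, u8 *out, const u8 *in,
			       unsigned int len, u8 *iv);

/* default target: the plain AES-NI routine */
DEFINE_STATIC_CALL(my_ctr_enc, my_ctr_enc_sse);

static int __init my_ctr_init(void)
{
	/* upgrade the call target once, based on CPU features */
	if (boot_cpu_has(X86_FEATURE_AVX))
		static_call_update(my_ctr_enc, my_ctr_enc_avx);
	return 0;
}
module_init(my_ctr_init);

static void my_ctr_crypt(void *ctx, u8 *out, const u8 *in,
			 unsigned int len, u8 *iv)
{
	/* direct call through the patched static-call trampoline */
	static_call(my_ctr_enc)(ctx, out, in, len, iv);
}

MODULE_LICENSE("GPL");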
- * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001. - * 16-byte aligned pointer. - * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. - * const u8 *aad, Additional Authentication Data (AAD) - * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going - * to be 8 or 12 bytes - * u8 *auth_tag, Authenticated Tag output. - * unsigned long auth_tag_len) Authenticated Tag Length in bytes. - * Valid values are 16 (most likely), 12 or 8. - */ -asmlinkage void aesni_gcm_dec(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long ciphertext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); +DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc); /* Scatter / Gather routines, with args similar to above */ asmlinkage void aesni_gcm_init(void *ctx, @@ -167,31 +129,26 @@ asmlinkage void aesni_gcm_finalize(void *ctx, struct gcm_context_data *gdata, u8 *auth_tag, unsigned long auth_tag_len); -static const struct aesni_gcm_tfm_s { - void (*init)(void *ctx, struct gcm_context_data *gdata, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len); - void (*enc_update)(void *ctx, struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len); - void (*dec_update)(void *ctx, struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long ciphertext_len); - void (*finalize)(void *ctx, struct gcm_context_data *gdata, - u8 *auth_tag, unsigned long auth_tag_len); -} *aesni_gcm_tfm; - -static const struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = { - .init = &aesni_gcm_init, - .enc_update = &aesni_gcm_enc_update, - .dec_update = &aesni_gcm_dec_update, - .finalize = &aesni_gcm_finalize, -}; - -#ifdef CONFIG_AS_AVX asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); + + +asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, + const void *keys, u8 *out, unsigned int num_bytes, + unsigned int byte_ctr); + +asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, + const void *keys, u8 *out, unsigned int num_bytes, + unsigned int byte_ctr); + +asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, + const void *keys, u8 *out, unsigned int num_bytes, + unsigned int byte_ctr); + /* * asmlinkage void aesni_gcm_init_avx_gen2() * gcm_data *my_ctx_data, context data @@ -215,28 +172,6 @@ asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx, struct gcm_context_data *gdata, u8 *auth_tag, unsigned long auth_tag_len); -asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len, u8 *iv, - const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); - -asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long ciphertext_len, u8 *iv, - const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); - -static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = { - .init = &aesni_gcm_init_avx_gen2, - .enc_update = &aesni_gcm_enc_update_avx_gen2, - .dec_update = &aesni_gcm_dec_update_avx_gen2, - .finalize = &aesni_gcm_finalize_avx_gen2, -}; - -#endif - -#ifdef 
CONFIG_AS_AVX2 /* * asmlinkage void aesni_gcm_init_avx_gen4() * gcm_data *my_ctx_data, context data @@ -260,26 +195,8 @@ asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx, struct gcm_context_data *gdata, u8 *auth_tag, unsigned long auth_tag_len); -asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len, u8 *iv, - const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); - -asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long ciphertext_len, u8 *iv, - const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); - -static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = { - .init = &aesni_gcm_init_avx_gen4, - .enc_update = &aesni_gcm_enc_update_avx_gen4, - .dec_update = &aesni_gcm_dec_update_avx_gen4, - .finalize = &aesni_gcm_finalize_avx_gen4, -}; - -#endif +static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx2); static inline struct aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) @@ -380,16 +297,16 @@ static int ecb_encrypt(struct skcipher_request *req) unsigned int nbytes; int err; - err = skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } - kernel_fpu_end(); return err; } @@ -402,16 +319,16 @@ static int ecb_decrypt(struct skcipher_request *req) unsigned int nbytes; int err; - err = skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } - kernel_fpu_end(); return err; } @@ -424,16 +341,16 @@ static int cbc_encrypt(struct skcipher_request *req) unsigned int nbytes; int err; - err = skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } - kernel_fpu_end(); return err; } @@ -446,37 +363,133 @@ static int cbc_decrypt(struct skcipher_request *req) unsigned int nbytes; int err; - err = skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } - kernel_fpu_end(); return err; } -#ifdef CONFIG_X86_64 -static void ctr_crypt_final(struct crypto_aes_ctx *ctx, - struct skcipher_walk *walk) +static int cts_cbc_encrypt(struct skcipher_request *req) { - u8 *ctrblk = walk->iv; - u8 keystream[AES_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + 
struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2; + struct scatterlist *src = req->src, *dst = req->dst; + struct scatterlist sg_src[2], sg_dst[2]; + struct skcipher_request subreq; + struct skcipher_walk walk; + int err; + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, skcipher_request_flags(req), + NULL, NULL); + + if (req->cryptlen <= AES_BLOCK_SIZE) { + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + cbc_blocks = 1; + } + + if (cbc_blocks > 0) { + skcipher_request_set_crypt(&subreq, req->src, req->dst, + cbc_blocks * AES_BLOCK_SIZE, + req->iv); + + err = cbc_encrypt(&subreq); + if (err) + return err; + + if (req->cryptlen == AES_BLOCK_SIZE) + return 0; + + dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, + subreq.cryptlen); + } + + /* handle ciphertext stealing */ + skcipher_request_set_crypt(&subreq, src, dst, + req->cryptlen - cbc_blocks * AES_BLOCK_SIZE, + req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; + + kernel_fpu_begin(); + aesni_cts_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes, walk.iv); + kernel_fpu_end(); + + return skcipher_walk_done(&walk, 0); +} + +static int cts_cbc_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2; + struct scatterlist *src = req->src, *dst = req->dst; + struct scatterlist sg_src[2], sg_dst[2]; + struct skcipher_request subreq; + struct skcipher_walk walk; + int err; + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, skcipher_request_flags(req), + NULL, NULL); + + if (req->cryptlen <= AES_BLOCK_SIZE) { + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + cbc_blocks = 1; + } + + if (cbc_blocks > 0) { + skcipher_request_set_crypt(&subreq, req->src, req->dst, + cbc_blocks * AES_BLOCK_SIZE, + req->iv); + + err = cbc_decrypt(&subreq); + if (err) + return err; + + if (req->cryptlen == AES_BLOCK_SIZE) + return 0; + + dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, + subreq.cryptlen); + } + + /* handle ciphertext stealing */ + skcipher_request_set_crypt(&subreq, src, dst, + req->cryptlen - cbc_blocks * AES_BLOCK_SIZE, + req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; - aesni_enc(ctx, keystream, ctrblk); - crypto_xor_cpy(dst, keystream, src, nbytes); + kernel_fpu_begin(); + aesni_cts_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes, walk.iv); + kernel_fpu_end(); - crypto_inc(ctrblk, AES_BLOCK_SIZE); + return skcipher_walk_done(&walk, 0); } -#ifdef CONFIG_AS_AVX +#ifdef CONFIG_X86_64 static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv) { @@ -493,124 +506,92 @@ static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, else aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len); } -#endif static int ctr_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + u8 keystream[AES_BLOCK_SIZE]; struct skcipher_walk walk; unsigned int nbytes; int err; - err = 
skcipher_walk_virt(&walk, req, true); + err = skcipher_walk_virt(&walk, req, false); - kernel_fpu_begin(); - while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { - aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr, - nbytes & AES_BLOCK_MASK, walk.iv); - nbytes &= AES_BLOCK_SIZE - 1; + while ((nbytes = walk.nbytes) > 0) { + kernel_fpu_begin(); + if (nbytes & AES_BLOCK_MASK) + static_call(aesni_ctr_enc_tfm)(ctx, walk.dst.virt.addr, + walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, + walk.iv); + nbytes &= ~AES_BLOCK_MASK; + + if (walk.nbytes == walk.total && nbytes > 0) { + aesni_enc(ctx, keystream, walk.iv); + crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - nbytes, + walk.src.virt.addr + walk.nbytes - nbytes, + keystream, nbytes); + crypto_inc(walk.iv, AES_BLOCK_SIZE); + nbytes = 0; + } + kernel_fpu_end(); err = skcipher_walk_done(&walk, nbytes); } - if (walk.nbytes) { - ctr_crypt_final(ctx, &walk); - err = skcipher_walk_done(&walk, 0); - } - kernel_fpu_end(); - return err; } -static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen) -{ - struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = xts_verify_key(tfm, key, keylen); - if (err) - return err; - - keylen /= 2; - - /* first half of xts-key is for crypt */ - err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx, - key, keylen); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx, - key + keylen, keylen); -} - - -static void aesni_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_enc); -} - -static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec); -} - -static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv) +static void aesni_xctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv, + unsigned int byte_ctr) { - aesni_xts_crypt8(ctx, dst, src, true, iv); + if (ctx->key_length == AES_KEYSIZE_128) + aes_xctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len, + byte_ctr); + else if (ctx->key_length == AES_KEYSIZE_192) + aes_xctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len, + byte_ctr); + else + aes_xctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len, + byte_ctr); } -static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - aesni_xts_crypt8(ctx, dst, src, false, iv); -} - -static const struct common_glue_ctx aesni_enc_xts = { - .num_funcs = 2, - .fpu_blocks_limit = 1, - - .funcs = { { - .num_blocks = 8, - .fn_u = { .xts = aesni_xts_enc8 } - }, { - .num_blocks = 1, - .fn_u = { .xts = aesni_xts_enc } - } } -}; - -static const struct common_glue_ctx aesni_dec_xts = { - .num_funcs = 2, - .fpu_blocks_limit = 1, - - .funcs = { { - .num_blocks = 8, - .fn_u = { .xts = aesni_xts_dec8 } - }, { - .num_blocks = 1, - .fn_u = { .xts = aesni_xts_dec } - } } -}; - -static int xts_encrypt(struct skcipher_request *req) +static int xctr_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + u8 keystream[AES_BLOCK_SIZE]; + struct skcipher_walk walk; + unsigned int nbytes; + unsigned int byte_ctr = 0; + int err; + __le32 block[AES_BLOCK_SIZE / sizeof(__le32)]; - return 
glue_xts_req_128bit(&aesni_enc_xts, req, aesni_enc, - aes_ctx(ctx->raw_tweak_ctx), - aes_ctx(ctx->raw_crypt_ctx), - false); -} + err = skcipher_walk_virt(&walk, req, false); -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&aesni_dec_xts, req, aesni_enc, - aes_ctx(ctx->raw_tweak_ctx), - aes_ctx(ctx->raw_crypt_ctx), - true); + while ((nbytes = walk.nbytes) > 0) { + kernel_fpu_begin(); + if (nbytes & AES_BLOCK_MASK) + aesni_xctr_enc_avx_tfm(ctx, walk.dst.virt.addr, + walk.src.virt.addr, nbytes & AES_BLOCK_MASK, + walk.iv, byte_ctr); + nbytes &= ~AES_BLOCK_MASK; + byte_ctr += walk.nbytes - nbytes; + + if (walk.nbytes == walk.total && nbytes > 0) { + memcpy(block, walk.iv, AES_BLOCK_SIZE); + block[0] ^= cpu_to_le32(1 + byte_ctr / AES_BLOCK_SIZE); + aesni_enc(ctx, keystream, (u8 *)block); + crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - + nbytes, walk.src.virt.addr + walk.nbytes + - nbytes, keystream, nbytes); + byte_ctr += nbytes; + nbytes = 0; + } + kernel_fpu_end(); + err = skcipher_walk_done(&walk, nbytes); + } + return err; } static int @@ -689,46 +670,35 @@ static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, unsigned int assoclen, u8 *hash_subkey, - u8 *iv, void *aes_ctx) + u8 *iv, void *aes_ctx, u8 *auth_tag, + unsigned long auth_tag_len) { - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - unsigned long auth_tag_len = crypto_aead_authsize(tfm); - const struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm; - struct gcm_context_data data AESNI_ALIGN_ATTR; - struct scatter_walk dst_sg_walk = {}; + u8 databuf[sizeof(struct gcm_context_data) + (AESNI_ALIGN - 8)] __aligned(8); + struct gcm_context_data *data = PTR_ALIGN((void *)databuf, AESNI_ALIGN); unsigned long left = req->cryptlen; - unsigned long len, srclen, dstlen; struct scatter_walk assoc_sg_walk; - struct scatter_walk src_sg_walk; - struct scatterlist src_start[2]; - struct scatterlist dst_start[2]; - struct scatterlist *src_sg; - struct scatterlist *dst_sg; - u8 *src, *dst, *assoc; + struct skcipher_walk walk; + bool do_avx, do_avx2; u8 *assocmem = NULL; - u8 authTag[16]; + u8 *assoc; + int err; if (!enc) left -= auth_tag_len; -#ifdef CONFIG_AS_AVX2 - if (left < AVX_GEN4_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen4) - gcm_tfm = &aesni_gcm_tfm_avx_gen2; -#endif -#ifdef CONFIG_AS_AVX - if (left < AVX_GEN2_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen2) - gcm_tfm = &aesni_gcm_tfm_sse; -#endif + do_avx = (left >= AVX_GEN2_OPTSIZE); + do_avx2 = (left >= AVX_GEN4_OPTSIZE); /* Linearize assoc, if not already linear */ - if (req->src->length >= assoclen && req->src->length && - (!PageHighMem(sg_page(req->src)) || - req->src->offset + req->src->length <= PAGE_SIZE)) { + if (req->src->length >= assoclen && req->src->length) { scatterwalk_start(&assoc_sg_walk, req->src); assoc = scatterwalk_map(&assoc_sg_walk); } else { + gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? 
+ GFP_KERNEL : GFP_ATOMIC; + /* assoc can be any length, so must be on heap */ - assocmem = kmalloc(assoclen, GFP_ATOMIC); + assocmem = kmalloc(assoclen, flags); if (unlikely(!assocmem)) return -ENOMEM; assoc = assocmem; @@ -736,62 +706,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0); } - if (left) { - src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen); - scatterwalk_start(&src_sg_walk, src_sg); - if (req->src != req->dst) { - dst_sg = scatterwalk_ffwd(dst_start, req->dst, - req->assoclen); - scatterwalk_start(&dst_sg_walk, dst_sg); - } - } - kernel_fpu_begin(); - gcm_tfm->init(aes_ctx, &data, iv, - hash_subkey, assoc, assoclen); - if (req->src != req->dst) { - while (left) { - src = scatterwalk_map(&src_sg_walk); - dst = scatterwalk_map(&dst_sg_walk); - srclen = scatterwalk_clamp(&src_sg_walk, left); - dstlen = scatterwalk_clamp(&dst_sg_walk, left); - len = min(srclen, dstlen); - if (len) { - if (enc) - gcm_tfm->enc_update(aes_ctx, &data, - dst, src, len); - else - gcm_tfm->dec_update(aes_ctx, &data, - dst, src, len); - } - left -= len; - - scatterwalk_unmap(src); - scatterwalk_unmap(dst); - scatterwalk_advance(&src_sg_walk, len); - scatterwalk_advance(&dst_sg_walk, len); - scatterwalk_done(&src_sg_walk, 0, left); - scatterwalk_done(&dst_sg_walk, 1, left); - } - } else { - while (left) { - dst = src = scatterwalk_map(&src_sg_walk); - len = scatterwalk_clamp(&src_sg_walk, left); - if (len) { - if (enc) - gcm_tfm->enc_update(aes_ctx, &data, - src, src, len); - else - gcm_tfm->dec_update(aes_ctx, &data, - src, src, len); - } - left -= len; - scatterwalk_unmap(src); - scatterwalk_advance(&src_sg_walk, len); - scatterwalk_done(&src_sg_walk, 1, left); - } - } - gcm_tfm->finalize(aes_ctx, &data, authTag, auth_tag_len); + if (static_branch_likely(&gcm_use_avx2) && do_avx2) + aesni_gcm_init_avx_gen4(aes_ctx, data, iv, hash_subkey, assoc, + assoclen); + else if (static_branch_likely(&gcm_use_avx) && do_avx) + aesni_gcm_init_avx_gen2(aes_ctx, data, iv, hash_subkey, assoc, + assoclen); + else + aesni_gcm_init(aes_ctx, data, iv, hash_subkey, assoc, assoclen); kernel_fpu_end(); if (!assocmem) @@ -799,24 +722,58 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, else kfree(assocmem); - if (!enc) { - u8 authTagMsg[16]; + err = enc ? skcipher_walk_aead_encrypt(&walk, req, false) + : skcipher_walk_aead_decrypt(&walk, req, false); - /* Copy out original authTag */ - scatterwalk_map_and_copy(authTagMsg, req->src, - req->assoclen + req->cryptlen - - auth_tag_len, - auth_tag_len, 0); + while (walk.nbytes > 0) { + kernel_fpu_begin(); + if (static_branch_likely(&gcm_use_avx2) && do_avx2) { + if (enc) + aesni_gcm_enc_update_avx_gen4(aes_ctx, data, + walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes); + else + aesni_gcm_dec_update_avx_gen4(aes_ctx, data, + walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes); + } else if (static_branch_likely(&gcm_use_avx) && do_avx) { + if (enc) + aesni_gcm_enc_update_avx_gen2(aes_ctx, data, + walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes); + else + aesni_gcm_dec_update_avx_gen2(aes_ctx, data, + walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes); + } else if (enc) { + aesni_gcm_enc_update(aes_ctx, data, walk.dst.virt.addr, + walk.src.virt.addr, walk.nbytes); + } else { + aesni_gcm_dec_update(aes_ctx, data, walk.dst.virt.addr, + walk.src.virt.addr, walk.nbytes); + } + kernel_fpu_end(); - /* Compare generated tag with passed in tag. 
*/ - return crypto_memneq(authTagMsg, authTag, auth_tag_len) ? - -EBADMSG : 0; + err = skcipher_walk_done(&walk, 0); } - /* Copy in the authTag */ - scatterwalk_map_and_copy(authTag, req->dst, - req->assoclen + req->cryptlen, - auth_tag_len, 1); + if (err) + return err; + + kernel_fpu_begin(); + if (static_branch_likely(&gcm_use_avx2) && do_avx2) + aesni_gcm_finalize_avx_gen4(aes_ctx, data, auth_tag, + auth_tag_len); + else if (static_branch_likely(&gcm_use_avx) && do_avx) + aesni_gcm_finalize_avx_gen2(aes_ctx, data, auth_tag, + auth_tag_len); + else + aesni_gcm_finalize(aes_ctx, data, auth_tag, auth_tag_len); + kernel_fpu_end(); return 0; } @@ -824,15 +781,47 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, u8 *hash_subkey, u8 *iv, void *aes_ctx) { - return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, - aes_ctx); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + unsigned long auth_tag_len = crypto_aead_authsize(tfm); + u8 auth_tag[16]; + int err; + + err = gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, aes_ctx, + auth_tag, auth_tag_len); + if (err) + return err; + + scatterwalk_map_and_copy(auth_tag, req->dst, + req->assoclen + req->cryptlen, + auth_tag_len, 1); + return 0; } static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, u8 *hash_subkey, u8 *iv, void *aes_ctx) { - return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, - aes_ctx); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + unsigned long auth_tag_len = crypto_aead_authsize(tfm); + u8 auth_tag_msg[16]; + u8 auth_tag[16]; + int err; + + err = gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, aes_ctx, + auth_tag, auth_tag_len); + if (err) + return err; + + /* Copy out original auth_tag */ + scatterwalk_map_and_copy(auth_tag_msg, req->src, + req->assoclen + req->cryptlen - auth_tag_len, + auth_tag_len, 0); + + /* Compare generated tag with passed in tag. 
*/ + if (crypto_memneq(auth_tag_msg, auth_tag, auth_tag_len)) { + memzero_explicit(auth_tag, sizeof(auth_tag)); + return -EBADMSG; + } + return 0; } static int helper_rfc4106_encrypt(struct aead_request *req) @@ -840,7 +829,8 @@ static int helper_rfc4106_encrypt(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); void *aes_ctx = &(ctx->aes_key_expanded); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); + u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); unsigned int i; __be32 counter = cpu_to_be32(1); @@ -867,7 +857,8 @@ static int helper_rfc4106_decrypt(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); void *aes_ctx = &(ctx->aes_key_expanded); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); + u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); unsigned int i; if (unlikely(req->assoclen != 16 && req->assoclen != 20)) @@ -889,6 +880,133 @@ static int helper_rfc4106_decrypt(struct aead_request *req) } #endif +static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) +{ + struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + int err; + + err = xts_verify_key(tfm, key, keylen); + if (err) + return err; + + keylen /= 2; + + /* first half of xts-key is for crypt */ + err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx, + key, keylen); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx, + key + keylen, keylen); +} + +static int xts_crypt(struct skcipher_request *req, bool encrypt) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + int tail = req->cryptlen % AES_BLOCK_SIZE; + struct skcipher_request subreq; + struct skcipher_walk walk; + int err; + + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + + err = skcipher_walk_virt(&walk, req, false); + if (!walk.nbytes) + return err; + + if (unlikely(tail > 0 && walk.nbytes < walk.total)) { + int blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2; + + skcipher_walk_abort(&walk); + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, + skcipher_request_flags(req), + NULL, NULL); + skcipher_request_set_crypt(&subreq, req->src, req->dst, + blocks * AES_BLOCK_SIZE, req->iv); + req = &subreq; + + err = skcipher_walk_virt(&walk, req, false); + if (!walk.nbytes) + return err; + } else { + tail = 0; + } + + kernel_fpu_begin(); + + /* calculate first value of T */ + aesni_enc(aes_ctx(ctx->raw_tweak_ctx), walk.iv, walk.iv); + + while (walk.nbytes > 0) { + int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes &= ~(AES_BLOCK_SIZE - 1); + + if (encrypt) + aesni_xts_encrypt(aes_ctx(ctx->raw_crypt_ctx), + walk.dst.virt.addr, walk.src.virt.addr, + nbytes, walk.iv); + else + aesni_xts_decrypt(aes_ctx(ctx->raw_crypt_ctx), + walk.dst.virt.addr, walk.src.virt.addr, + nbytes, walk.iv); + kernel_fpu_end(); + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + + if (walk.nbytes > 0) + kernel_fpu_begin(); + } + + if (unlikely(tail > 0 && !err)) { + struct scatterlist sg_src[2], sg_dst[2]; + struct scatterlist *src, *dst; + + dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); + if (req->dst != req->src) + dst 
= scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); + + skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, + req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; + + kernel_fpu_begin(); + if (encrypt) + aesni_xts_encrypt(aes_ctx(ctx->raw_crypt_ctx), + walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes, walk.iv); + else + aesni_xts_decrypt(aes_ctx(ctx->raw_crypt_ctx), + walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes, walk.iv); + kernel_fpu_end(); + + err = skcipher_walk_done(&walk, 0); + } + return err; +} + +static int xts_encrypt(struct skcipher_request *req) +{ + return xts_crypt(req, true); +} + +static int xts_decrypt(struct skcipher_request *req) +{ + return xts_crypt(req, false); +} + static struct crypto_alg aesni_cipher_alg = { .cra_name = "aes", .cra_driver_name = "aes-aesni", @@ -940,6 +1058,23 @@ static struct skcipher_alg aesni_skciphers[] = { .setkey = aesni_skcipher_setkey, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, + }, { + .base = { + .cra_name = "__cts(cbc(aes))", + .cra_driver_name = "__cts-cbc-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .walksize = 2 * AES_BLOCK_SIZE, + .setkey = aesni_skcipher_setkey, + .encrypt = cts_cbc_encrypt, + .decrypt = cts_cbc_decrypt, #ifdef CONFIG_X86_64 }, { .base = { @@ -958,6 +1093,7 @@ static struct skcipher_alg aesni_skciphers[] = { .setkey = aesni_skcipher_setkey, .encrypt = ctr_crypt, .decrypt = ctr_crypt, +#endif }, { .base = { .cra_name = "__xts(aes)", @@ -971,10 +1107,10 @@ static struct skcipher_alg aesni_skciphers[] = { .min_keysize = 2 * AES_MIN_KEY_SIZE, .max_keysize = 2 * AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, + .walksize = 2 * AES_BLOCK_SIZE, .setkey = xts_aesni_setkey, .encrypt = xts_encrypt, .decrypt = xts_decrypt, -#endif } }; @@ -982,6 +1118,33 @@ static struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)]; #ifdef CONFIG_X86_64 +/* + * XCTR does not have a non-AVX implementation, so it must be enabled + * conditionally. 
+ */ +static struct skcipher_alg aesni_xctr = { + .base = { + .cra_name = "__xctr(aes)", + .cra_driver_name = "__xctr-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, + .setkey = aesni_skcipher_setkey, + .encrypt = xctr_crypt, + .decrypt = xctr_crypt, +}; + +static struct simd_skcipher_alg *aesni_simd_xctr; +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_X86_64 static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, unsigned int key_len) { @@ -997,7 +1160,8 @@ static int generic_gcmaes_encrypt(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); void *aes_ctx = &(ctx->aes_key_expanded); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); + u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); __be32 counter = cpu_to_be32(1); memcpy(iv, req->iv, 12); @@ -1013,7 +1177,8 @@ static int generic_gcmaes_decrypt(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); void *aes_ctx = &(ctx->aes_key_expanded); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); + u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); memcpy(iv, req->iv, 12); *((__be32 *)(iv+12)) = counter; @@ -1036,7 +1201,7 @@ static struct aead_alg aesni_aeads[] = { { .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx), - .cra_alignmask = AESNI_ALIGN - 1, + .cra_alignmask = 0, .cra_module = THIS_MODULE, }, }, { @@ -1053,7 +1218,7 @@ static struct aead_alg aesni_aeads[] = { { .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct generic_gcmaes_ctx), - .cra_alignmask = AESNI_ALIGN - 1, + .cra_alignmask = 0, .cra_module = THIS_MODULE, }, } }; @@ -1064,7 +1229,7 @@ static struct aead_alg aesni_aeads[0]; static struct simd_aead_alg *aesni_simd_aeads[ARRAY_SIZE(aesni_aeads)]; static const struct x86_cpu_id aesni_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AES), + X86_MATCH_FEATURE(X86_FEATURE_AES, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); @@ -1076,31 +1241,23 @@ static int __init aesni_init(void) if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; #ifdef CONFIG_X86_64 -#ifdef CONFIG_AS_AVX2 if (boot_cpu_has(X86_FEATURE_AVX2)) { pr_info("AVX2 version of gcm_enc/dec engaged.\n"); - aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4; + static_branch_enable(&gcm_use_avx); + static_branch_enable(&gcm_use_avx2); } else -#endif -#ifdef CONFIG_AS_AVX if (boot_cpu_has(X86_FEATURE_AVX)) { pr_info("AVX version of gcm_enc/dec engaged.\n"); - aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen2; - } else -#endif - { + static_branch_enable(&gcm_use_avx); + } else { pr_info("SSE version of gcm_enc/dec engaged.\n"); - aesni_gcm_tfm = &aesni_gcm_tfm_sse; } - aesni_ctr_enc_tfm = aesni_ctr_enc; -#ifdef CONFIG_AS_AVX if (boot_cpu_has(X86_FEATURE_AVX)) { /* optimize performance of ctr mode encryption transform */ - aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm; + static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm); pr_info("AES CTR mode by8 optimization enabled\n"); } -#endif -#endif +#endif /* CONFIG_X86_64 */ err = crypto_register_alg(&aesni_cipher_alg); if (err) @@ -1117,8 
+1274,22 @@ static int __init aesni_init(void) if (err) goto unregister_skciphers; +#ifdef CONFIG_X86_64 + if (boot_cpu_has(X86_FEATURE_AVX)) + err = simd_register_skciphers_compat(&aesni_xctr, 1, + &aesni_simd_xctr); + if (err) + goto unregister_aeads; +#endif /* CONFIG_X86_64 */ + return 0; +#ifdef CONFIG_X86_64 +unregister_aeads: + simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), + aesni_simd_aeads); +#endif /* CONFIG_X86_64 */ + unregister_skciphers: simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); @@ -1134,6 +1305,10 @@ static void __exit aesni_exit(void) simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); crypto_unregister_alg(&aesni_cipher_alg); +#ifdef CONFIG_X86_64 + if (boot_cpu_has(X86_FEATURE_AVX)) + simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); +#endif /* CONFIG_X86_64 */ } late_initcall(aesni_init); diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S new file mode 100644 index 000000000000..c75fd7d015ed --- /dev/null +++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S @@ -0,0 +1,1303 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * ARIA Cipher 16-way parallel algorithm (AVX) + * + * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com> + * + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +/* struct aria_ctx: */ +#define enc_key 0 +#define dec_key 272 +#define rounds 544 + +/* register macros */ +#define CTX %rdi + + +#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \ + ( (((a0) & 1) << 0) | \ + (((a1) & 1) << 1) | \ + (((a2) & 1) << 2) | \ + (((a3) & 1) << 3) | \ + (((a4) & 1) << 4) | \ + (((a5) & 1) << 5) | \ + (((a6) & 1) << 6) | \ + (((a7) & 1) << 7) ) + +#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \ + ( ((l7) << (0 * 8)) | \ + ((l6) << (1 * 8)) | \ + ((l5) << (2 * 8)) | \ + ((l4) << (3 * 8)) | \ + ((l3) << (4 * 8)) | \ + ((l2) << (5 * 8)) | \ + ((l1) << (6 * 8)) | \ + ((l0) << (7 * 8)) ) + +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + +#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ + vpand x, mask4bit, tmp0; \ + vpandn x, mask4bit, x; \ + vpsrld $4, x, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +#define byteslice_16x16b(a0, b0, c0, d0, \ + a1, b1, c1, d1, \ + a2, b2, c2, d2, \ + a3, b3, c3, d3, \ + st0, st1) \ + vmovdqu d2, st0; \ + vmovdqu d3, st1; \ + transpose_4x4(a0, a1, a2, a3, d2, d3); \ + transpose_4x4(b0, b1, b2, b3, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu a0, st0; \ + vmovdqu a1, st1; \ + transpose_4x4(c0, c1, c2, c3, a0, a1); \ + transpose_4x4(d0, d1, d2, d3, a0, a1); \ + \ + vmovdqu .Lshufb_16x16b, a0; \ + vmovdqu st1, a1; \ + vpshufb a0, a2, a2; \ + vpshufb a0, a3, a3; \ + vpshufb a0, b0, b0; \ + vpshufb a0, b1, b1; \ + vpshufb a0, b2, b2; \ + vpshufb a0, b3, b3; \ + vpshufb a0, a1, a1; \ + vpshufb a0, c0, c0; \ + vpshufb a0, c1, c1; \ + vpshufb a0, c2, c2; \ + vpshufb a0, c3, c3; \ + vpshufb a0, d0, d0; \ + vpshufb a0, d1, d1; \ + vpshufb a0, d2, d2; \ + vpshufb a0, d3, d3; \ + vmovdqu d3, st1; \ + vmovdqu st0, d3; \ + vpshufb a0, d3, a0; \ + 
vmovdqu d2, st0; \ + \ + transpose_4x4(a0, b0, c0, d0, d2, d3); \ + transpose_4x4(a1, b1, c1, d1, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu b0, st0; \ + vmovdqu b1, st1; \ + transpose_4x4(a2, b2, c2, d2, b0, b1); \ + transpose_4x4(a3, b3, c3, d3, b0, b1); \ + vmovdqu st0, b0; \ + vmovdqu st1, b1; \ + /* does not adjust output bytes inside vectors */ + +#define debyteslice_16x16b(a0, b0, c0, d0, \ + a1, b1, c1, d1, \ + a2, b2, c2, d2, \ + a3, b3, c3, d3, \ + st0, st1) \ + vmovdqu d2, st0; \ + vmovdqu d3, st1; \ + transpose_4x4(a0, a1, a2, a3, d2, d3); \ + transpose_4x4(b0, b1, b2, b3, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu a0, st0; \ + vmovdqu a1, st1; \ + transpose_4x4(c0, c1, c2, c3, a0, a1); \ + transpose_4x4(d0, d1, d2, d3, a0, a1); \ + \ + vmovdqu .Lshufb_16x16b, a0; \ + vmovdqu st1, a1; \ + vpshufb a0, a2, a2; \ + vpshufb a0, a3, a3; \ + vpshufb a0, b0, b0; \ + vpshufb a0, b1, b1; \ + vpshufb a0, b2, b2; \ + vpshufb a0, b3, b3; \ + vpshufb a0, a1, a1; \ + vpshufb a0, c0, c0; \ + vpshufb a0, c1, c1; \ + vpshufb a0, c2, c2; \ + vpshufb a0, c3, c3; \ + vpshufb a0, d0, d0; \ + vpshufb a0, d1, d1; \ + vpshufb a0, d2, d2; \ + vpshufb a0, d3, d3; \ + vmovdqu d3, st1; \ + vmovdqu st0, d3; \ + vpshufb a0, d3, a0; \ + vmovdqu d2, st0; \ + \ + transpose_4x4(c0, d0, a0, b0, d2, d3); \ + transpose_4x4(c1, d1, a1, b1, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu b0, st0; \ + vmovdqu b1, st1; \ + transpose_4x4(c2, d2, a2, b2, b0, b1); \ + transpose_4x4(c3, d3, a3, b3, b0, b1); \ + vmovdqu st0, b0; \ + vmovdqu st1, b1; \ + /* does not adjust output bytes inside vectors */ + +/* load blocks to registers and apply pre-whitening */ +#define inpack16_pre(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + rio) \ + vmovdqu (0 * 16)(rio), x0; \ + vmovdqu (1 * 16)(rio), x1; \ + vmovdqu (2 * 16)(rio), x2; \ + vmovdqu (3 * 16)(rio), x3; \ + vmovdqu (4 * 16)(rio), x4; \ + vmovdqu (5 * 16)(rio), x5; \ + vmovdqu (6 * 16)(rio), x6; \ + vmovdqu (7 * 16)(rio), x7; \ + vmovdqu (8 * 16)(rio), y0; \ + vmovdqu (9 * 16)(rio), y1; \ + vmovdqu (10 * 16)(rio), y2; \ + vmovdqu (11 * 16)(rio), y3; \ + vmovdqu (12 * 16)(rio), y4; \ + vmovdqu (13 * 16)(rio), y5; \ + vmovdqu (14 * 16)(rio), y6; \ + vmovdqu (15 * 16)(rio), y7; + +/* byteslice pre-whitened blocks and store to temporary memory */ +#define inpack16_post(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_ab, mem_cd) \ + byteslice_16x16b(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + (mem_ab), (mem_cd)); \ + \ + vmovdqu x0, 0 * 16(mem_ab); \ + vmovdqu x1, 1 * 16(mem_ab); \ + vmovdqu x2, 2 * 16(mem_ab); \ + vmovdqu x3, 3 * 16(mem_ab); \ + vmovdqu x4, 4 * 16(mem_ab); \ + vmovdqu x5, 5 * 16(mem_ab); \ + vmovdqu x6, 6 * 16(mem_ab); \ + vmovdqu x7, 7 * 16(mem_ab); \ + vmovdqu y0, 0 * 16(mem_cd); \ + vmovdqu y1, 1 * 16(mem_cd); \ + vmovdqu y2, 2 * 16(mem_cd); \ + vmovdqu y3, 3 * 16(mem_cd); \ + vmovdqu y4, 4 * 16(mem_cd); \ + vmovdqu y5, 5 * 16(mem_cd); \ + vmovdqu y6, 6 * 16(mem_cd); \ + vmovdqu y7, 7 * 16(mem_cd); + +#define write_output(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem) \ + vmovdqu x0, 0 * 16(mem); \ + vmovdqu x1, 1 * 16(mem); \ + vmovdqu x2, 2 * 16(mem); \ + vmovdqu x3, 3 * 16(mem); \ + vmovdqu x4, 4 * 16(mem); \ + vmovdqu x5, 5 * 16(mem); \ + vmovdqu x6, 6 * 16(mem); \ + vmovdqu x7, 7 * 16(mem); \ + vmovdqu y0, 8 * 16(mem); \ + vmovdqu y1, 9 * 16(mem); 
\ + vmovdqu y2, 10 * 16(mem); \ + vmovdqu y3, 11 * 16(mem); \ + vmovdqu y4, 12 * 16(mem); \ + vmovdqu y5, 13 * 16(mem); \ + vmovdqu y6, 14 * 16(mem); \ + vmovdqu y7, 15 * 16(mem); \ + +#define aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, idx) \ + vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \ + vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \ + vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \ + vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \ + vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \ + vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \ + vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \ + vmovdqu x7, ((idx + 7) * 16)(mem_tmp); + +#define aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, idx) \ + vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \ + vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \ + vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \ + vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \ + vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \ + vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \ + vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \ + vmovdqu ((idx + 7) * 16)(mem_tmp), x7; + +#define aria_ark_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + t0, rk, idx, round) \ + /* AddRoundKey */ \ + vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \ + vpxor t0, x0, x0; \ + vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \ + vpxor t0, x1, x1; \ + vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \ + vpxor t0, x2, x2; \ + vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \ + vpxor t0, x3, x3; \ + vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \ + vpxor t0, x4, x4; \ + vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \ + vpxor t0, x5, x5; \ + vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \ + vpxor t0, x6, x6; \ + vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \ + vpxor t0, x7, x7; + +#define aria_sbox_8way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + t0, t1, t2, t3, \ + t4, t5, t6, t7) \ + vpbroadcastq .Ltf_s2_bitmatrix, t0; \ + vpbroadcastq .Ltf_inv_bitmatrix, t1; \ + vpbroadcastq .Ltf_id_bitmatrix, t2; \ + vpbroadcastq .Ltf_aff_bitmatrix, t3; \ + vpbroadcastq .Ltf_x2_bitmatrix, t4; \ + vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ + vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ + vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ + vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \ + vgf2p8affineinvqb $0, t2, x2, x2; \ + vgf2p8affineinvqb $0, t2, x6, x6; \ + vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \ + vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \ + vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \ + vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \ + vgf2p8affineinvqb $0, t2, x3, x3; \ + vgf2p8affineinvqb $0, t2, x7, x7 + +#define aria_sbox_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + t0, t1, t2, t3, \ + t4, t5, t6, t7) \ + vpxor t7, t7, t7; \ + vmovdqa .Linv_shift_row, t0; \ + vmovdqa .Lshift_row, t1; \ + vpbroadcastd .L0f0f0f0f, t6; \ + vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \ + vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \ + vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \ + vmovdqa .Ltf_hi__x2__and__fwd_aff, t5; \ + \ + vaesenclast t7, x0, x0; \ + vaesenclast t7, x4, x4; \ + vaesenclast t7, x1, x1; \ + vaesenclast t7, x5, x5; \ + vaesdeclast t7, x2, x2; \ + vaesdeclast t7, x6, x6; \ + \ + /* AES inverse shift rows */ \ + vpshufb t0, x0, x0; \ + vpshufb t0, x4, x4; \ + vpshufb t0, x1, x1; \ + vpshufb t0, x5, x5; \ + vpshufb t1, x3, x3; \ + vpshufb t1, x7, x7; \ + vpshufb t1, x2, x2; \ + vpshufb t1, x6, x6; \ + \ + /* affine transformation for S2 */ \ + filter_8bit(x1, t2, t3, t6, t0); \ + /* affine transformation for S2 */ \ + filter_8bit(x5, t2, t3, t6, t0); \ + \ + /* affine 
transformation for X2 */ \ + filter_8bit(x3, t4, t5, t6, t0); \ + /* affine transformation for X2 */ \ + filter_8bit(x7, t4, t5, t6, t0); \ + vaesdeclast t7, x3, x3; \ + vaesdeclast t7, x7, x7; + +#define aria_diff_m(x0, x1, x2, x3, \ + t0, t1, t2, t3) \ + /* T = rotr32(X, 8); */ \ + /* X ^= T */ \ + vpxor x0, x3, t0; \ + vpxor x1, x0, t1; \ + vpxor x2, x1, t2; \ + vpxor x3, x2, t3; \ + /* X = T ^ rotr(X, 16); */ \ + vpxor t2, x0, x0; \ + vpxor x1, t3, t3; \ + vpxor t0, x2, x2; \ + vpxor t1, x3, x1; \ + vmovdqu t3, x3; + +#define aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7) \ + /* t1 ^= t2; */ \ + vpxor y0, x4, x4; \ + vpxor y1, x5, x5; \ + vpxor y2, x6, x6; \ + vpxor y3, x7, x7; \ + \ + /* t2 ^= t3; */ \ + vpxor y4, y0, y0; \ + vpxor y5, y1, y1; \ + vpxor y6, y2, y2; \ + vpxor y7, y3, y3; \ + \ + /* t0 ^= t1; */ \ + vpxor x4, x0, x0; \ + vpxor x5, x1, x1; \ + vpxor x6, x2, x2; \ + vpxor x7, x3, x3; \ + \ + /* t3 ^= t1; */ \ + vpxor x4, y4, y4; \ + vpxor x5, y5, y5; \ + vpxor x6, y6, y6; \ + vpxor x7, y7, y7; \ + \ + /* t2 ^= t0; */ \ + vpxor x0, y0, y0; \ + vpxor x1, y1, y1; \ + vpxor x2, y2, y2; \ + vpxor x3, y3, y3; \ + \ + /* t1 ^= t2; */ \ + vpxor y0, x4, x4; \ + vpxor y1, x5, x5; \ + vpxor y2, x6, x6; \ + vpxor y3, x7, x7; + +#define aria_fe(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T3 = ABCD -> BADC \ + * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \ + * T0 = ABCD -> CDAB \ + * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \ + * T1 = ABCD -> DCBA \ + * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \ + */ \ + aria_diff_word(x2, x3, x0, x1, \ + x7, x6, x5, x4, \ + y0, y1, y2, y3, \ + y5, y4, y7, y6); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_fo(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + 
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T1 = ABCD -> BADC \ + * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \ + * T2 = ABCD -> CDAB \ + * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \ + * T3 = ABCD -> DCBA \ + * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \ + */ \ + aria_diff_word(x0, x1, x2, x3, \ + x5, x4, x7, x6, \ + y2, y3, y0, y1, \ + y7, y6, y5, y4); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_ff(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round, last_round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, last_round); \ + \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, last_round); \ + \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); + +#define aria_fe_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T3 = ABCD -> BADC \ + * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \ + * T0 = ABCD -> CDAB \ + * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \ + * T1 = ABCD -> DCBA \ + * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \ + */ \ + aria_diff_word(x2, x3, x0, x1, \ + x7, x6, x5, x4, \ + y0, y1, y2, y3, \ + y5, y4, y7, y6); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_fo_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, 
\ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T1 = ABCD -> BADC \ + * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \ + * T2 = ABCD -> CDAB \ + * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \ + * T3 = ABCD -> DCBA \ + * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \ + */ \ + aria_diff_word(x0, x1, x2, x3, \ + x5, x4, x7, x6, \ + y2, y3, y0, y1, \ + y7, y6, y5, y4); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_ff_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round, last_round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, last_round); \ + \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, last_round); \ + \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); + +/* NB: section is mergeable, all elements must be aligned 16-byte blocks */ +.section .rodata.cst16, "aM", @progbits, 16 +.align 16 + +#define SHUFB_BYTES(idx) \ + 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) + +.Lshufb_16x16b: + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3); +/* For isolating SubBytes from AESENCLAST, inverse shift row */ +.Linv_shift_row: + .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b + .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 +.Lshift_row: + .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03 + .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08 + .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 + +/* AES inverse affine and S2 combined: + * 1 1 0 0 0 0 0 1 x0 0 + * 0 1 0 0 1 0 0 0 x1 0 + * 1 1 0 0 1 1 1 1 x2 0 + * 0 1 1 0 1 0 0 1 x3 1 + * 0 1 0 0 1 1 0 0 * x4 + 0 + * 0 1 0 1 1 0 0 0 x5 0 + * 0 0 0 0 0 1 0 1 x6 0 + * 1 1 1 0 0 1 1 1 x7 1 + */ +.Ltf_lo__inv_aff__and__s2: + .octa 0x92172DA81A9FA520B2370D883ABF8500 +.Ltf_hi__inv_aff__and__s2: + .octa 0x2B15FFC1AF917B45E6D8320C625CB688 + +/* X2 and AES forward affine combined: + * 1 0 1 1 0 0 0 1 x0 0 + * 0 1 1 1 1 0 1 1 x1 0 + * 0 0 0 1 1 0 1 0 x2 1 + * 0 1 0 0 0 1 0 0 x3 0 + * 0 0 1 1 1 0 1 1 * x4 + 0 + * 0 1 0 0 1 0 0 0 x5 0 + * 1 1 0 1 0 0 1 1 x6 0 + * 0 1 0 0 1 0 1 0 x7 0 + */ +.Ltf_lo__x2__and__fwd_aff: + .octa 0xEFAE0544FCBD1657B8F95213ABEA4100 +.Ltf_hi__x2__and__fwd_aff: + .octa 0x3F893781E95FE1576CDA64D2BA0CB204 + +.section .rodata.cst8, "aM", @progbits, 
8 +.align 8 +/* AES affine: */ +#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0) +.Ltf_aff_bitmatrix: + .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1), + BV8(1, 1, 0, 0, 0, 1, 1, 1), + BV8(1, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 1, 0, 0, 0, 1), + BV8(1, 1, 1, 1, 1, 0, 0, 0), + BV8(0, 1, 1, 1, 1, 1, 0, 0), + BV8(0, 0, 1, 1, 1, 1, 1, 0), + BV8(0, 0, 0, 1, 1, 1, 1, 1)) + +/* AES inverse affine: */ +#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0) +.Ltf_inv_bitmatrix: + .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1), + BV8(1, 0, 0, 1, 0, 0, 1, 0), + BV8(0, 1, 0, 0, 1, 0, 0, 1), + BV8(1, 0, 1, 0, 0, 1, 0, 0), + BV8(0, 1, 0, 1, 0, 0, 1, 0), + BV8(0, 0, 1, 0, 1, 0, 0, 1), + BV8(1, 0, 0, 1, 0, 1, 0, 0), + BV8(0, 1, 0, 0, 1, 0, 1, 0)) + +/* S2: */ +#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1) +.Ltf_s2_bitmatrix: + .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1), + BV8(0, 0, 1, 1, 1, 1, 1, 1), + BV8(1, 1, 1, 0, 1, 1, 0, 1), + BV8(1, 1, 0, 0, 0, 0, 1, 1), + BV8(0, 1, 0, 0, 0, 0, 1, 1), + BV8(1, 1, 0, 0, 1, 1, 1, 0), + BV8(0, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 1, 0, 1, 1, 0)) + +/* X2: */ +#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0) +.Ltf_x2_bitmatrix: + .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0), + BV8(0, 0, 1, 0, 0, 1, 1, 0), + BV8(0, 0, 0, 0, 1, 0, 1, 0), + BV8(1, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 0, 1, 1, 0, 0), + BV8(0, 1, 1, 0, 1, 0, 1, 1), + BV8(1, 0, 1, 1, 1, 1, 0, 1), + BV8(1, 0, 0, 1, 0, 0, 1, 1)) + +/* Identity matrix: */ +.Ltf_id_bitmatrix: + .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 1, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 1, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 1, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 1, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 1, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 1, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 1)) + +/* 4-bit mask */ +.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4 +.align 4 +.L0f0f0f0f: + .long 0x0f0f0f0f + +.text + +SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way) + /* input: + * %r9: rk + * %rsi: dst + * %rdx: src + * %xmm0..%xmm15: 16 byte-sliced blocks + */ + + FRAME_BEGIN + + movq %rsi, %rax; + leaq 8 * 16(%rax), %r8; + + inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r8); + aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 0); + aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 1); + aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 2); + aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 3); + aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 4); + aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 5); + aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 6); + aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 7); + aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 8); + aria_fe(%xmm1, %xmm0, 
%xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 9); + aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 10); + cmpl $12, rounds(CTX); + jne .Laria_192; + aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 11, 12); + jmp .Laria_end; +.Laria_192: + aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 11); + aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 12); + cmpl $14, rounds(CTX); + jne .Laria_256; + aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 13, 14); + jmp .Laria_end; +.Laria_256: + aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 13); + aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 14); + aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 15, 16); +.Laria_end: + debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4, + %xmm9, %xmm13, %xmm0, %xmm5, + %xmm10, %xmm14, %xmm3, %xmm6, + %xmm11, %xmm15, %xmm2, %xmm7, + (%rax), (%r8)); + + FRAME_END + RET; +SYM_FUNC_END(__aria_aesni_avx_crypt_16way) + +SYM_FUNC_START(aria_aesni_avx_encrypt_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + FRAME_BEGIN + + leaq enc_key(CTX), %r9; + + inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rdx); + + call __aria_aesni_avx_crypt_16way; + + write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax); + + FRAME_END + RET; +SYM_FUNC_END(aria_aesni_avx_encrypt_16way) + +SYM_FUNC_START(aria_aesni_avx_decrypt_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + FRAME_BEGIN + + leaq dec_key(CTX), %r9; + + inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rdx); + + call __aria_aesni_avx_crypt_16way; + + write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax); + + FRAME_END + RET; +SYM_FUNC_END(aria_aesni_avx_decrypt_16way) + +SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way) + /* input: + * %rdi: ctx + * %rsi: dst + * %rdx: src + * %rcx: keystream + * %r8: iv (big endian, 128bit) + */ + + FRAME_BEGIN + /* load IV and byteswap */ + vmovdqu (%r8), %xmm8; + + vmovdqa .Lbswap128_mask (%rip), %xmm1; + vpshufb %xmm1, %xmm8, %xmm3; /* be => le */ + + vpcmpeqd %xmm0, %xmm0, %xmm0; + vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */ + + /* construct IVs */ + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm9; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm10; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm11; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, 
%xmm12; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm13; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm14; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm15; + vmovdqu %xmm8, (0 * 16)(%rcx); + vmovdqu %xmm9, (1 * 16)(%rcx); + vmovdqu %xmm10, (2 * 16)(%rcx); + vmovdqu %xmm11, (3 * 16)(%rcx); + vmovdqu %xmm12, (4 * 16)(%rcx); + vmovdqu %xmm13, (5 * 16)(%rcx); + vmovdqu %xmm14, (6 * 16)(%rcx); + vmovdqu %xmm15, (7 * 16)(%rcx); + + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm8; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm9; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm10; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm11; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm12; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm13; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm14; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm15; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm4; + vmovdqu %xmm4, (%r8); + + vmovdqu (0 * 16)(%rcx), %xmm0; + vmovdqu (1 * 16)(%rcx), %xmm1; + vmovdqu (2 * 16)(%rcx), %xmm2; + vmovdqu (3 * 16)(%rcx), %xmm3; + vmovdqu (4 * 16)(%rcx), %xmm4; + vmovdqu (5 * 16)(%rcx), %xmm5; + vmovdqu (6 * 16)(%rcx), %xmm6; + vmovdqu (7 * 16)(%rcx), %xmm7; + + FRAME_END + RET; +SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way) + +SYM_FUNC_START(aria_aesni_avx_ctr_crypt_16way) + /* input: + * %rdi: ctx + * %rsi: dst + * %rdx: src + * %rcx: keystream + * %r8: iv (big endian, 128bit) + */ + FRAME_BEGIN + + call __aria_aesni_avx_ctr_gen_keystream_16way; + + leaq (%rsi), %r10; + leaq (%rdx), %r11; + leaq (%rcx), %rsi; + leaq (%rcx), %rdx; + leaq enc_key(CTX), %r9; + + call __aria_aesni_avx_crypt_16way; + + vpxor (0 * 16)(%r11), %xmm1, %xmm1; + vpxor (1 * 16)(%r11), %xmm0, %xmm0; + vpxor (2 * 16)(%r11), %xmm3, %xmm3; + vpxor (3 * 16)(%r11), %xmm2, %xmm2; + vpxor (4 * 16)(%r11), %xmm4, %xmm4; + vpxor (5 * 16)(%r11), %xmm5, %xmm5; + vpxor (6 * 16)(%r11), %xmm6, %xmm6; + vpxor (7 * 16)(%r11), %xmm7, %xmm7; + vpxor (8 * 16)(%r11), %xmm8, %xmm8; + vpxor (9 * 16)(%r11), %xmm9, %xmm9; + vpxor (10 * 16)(%r11), %xmm10, %xmm10; + vpxor (11 * 16)(%r11), %xmm11, %xmm11; + vpxor (12 * 16)(%r11), %xmm12, %xmm12; + vpxor (13 * 16)(%r11), %xmm13, %xmm13; + vpxor (14 * 16)(%r11), %xmm14, %xmm14; + vpxor (15 * 16)(%r11), %xmm15, %xmm15; + write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %r10); + + FRAME_END + RET; +SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way) + +SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way) + /* input: + * %r9: rk + * %rsi: dst + * %rdx: src + * %xmm0..%xmm15: 16 byte-sliced blocks + */ + + FRAME_BEGIN + + movq %rsi, %rax; + leaq 8 * 16(%rax), %r8; + + inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, + %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, + %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r8); + aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11, + %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, + %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 0); + aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, + %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, + %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 1); + aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, + %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, 
%xmm3, + %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 2); + aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, + %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, + %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 3); + aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, + %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, + %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 4); + aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, + %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, + %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 5); + aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, + %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, + %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 6); + aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, + %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, + %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 7); + aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, + %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, + %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 8); + aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, + %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, + %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 9); + aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, + %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, + %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 10); + cmpl $12, rounds(CTX); + jne .Laria_gfni_192; + aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 11, 12); + jmp .Laria_gfni_end; +.Laria_gfni_192: + aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, + %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, + %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 11); + aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, + %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, + %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 12); + cmpl $14, rounds(CTX); + jne .Laria_gfni_256; + aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, + %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, + %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 13, 14); + jmp .Laria_gfni_end; +.Laria_gfni_256: + aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, + %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, + %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 13); + aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, + %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, + %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 14); + aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, + %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, + %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 15, 16); +.Laria_gfni_end: + debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4, + %xmm9, %xmm13, %xmm0, %xmm5, + %xmm10, %xmm14, %xmm3, %xmm6, + %xmm11, %xmm15, %xmm2, %xmm7, + (%rax), (%r8)); + + FRAME_END + RET; +SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way) + +SYM_FUNC_START(aria_aesni_avx_gfni_encrypt_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + FRAME_BEGIN + + leaq enc_key(CTX), %r9; + + inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rdx); + + call __aria_aesni_avx_gfni_crypt_16way; + + write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax); + + FRAME_END + RET; +SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way) + +SYM_FUNC_START(aria_aesni_avx_gfni_decrypt_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + FRAME_BEGIN + + leaq 
dec_key(CTX), %r9; + + inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rdx); + + call __aria_aesni_avx_gfni_crypt_16way; + + write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax); + + FRAME_END + RET; +SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way) + +SYM_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way) + /* input: + * %rdi: ctx + * %rsi: dst + * %rdx: src + * %rcx: keystream + * %r8: iv (big endian, 128bit) + */ + FRAME_BEGIN + + call __aria_aesni_avx_ctr_gen_keystream_16way + + leaq (%rsi), %r10; + leaq (%rdx), %r11; + leaq (%rcx), %rsi; + leaq (%rcx), %rdx; + leaq enc_key(CTX), %r9; + + call __aria_aesni_avx_gfni_crypt_16way; + + vpxor (0 * 16)(%r11), %xmm1, %xmm1; + vpxor (1 * 16)(%r11), %xmm0, %xmm0; + vpxor (2 * 16)(%r11), %xmm3, %xmm3; + vpxor (3 * 16)(%r11), %xmm2, %xmm2; + vpxor (4 * 16)(%r11), %xmm4, %xmm4; + vpxor (5 * 16)(%r11), %xmm5, %xmm5; + vpxor (6 * 16)(%r11), %xmm6, %xmm6; + vpxor (7 * 16)(%r11), %xmm7, %xmm7; + vpxor (8 * 16)(%r11), %xmm8, %xmm8; + vpxor (9 * 16)(%r11), %xmm9, %xmm9; + vpxor (10 * 16)(%r11), %xmm10, %xmm10; + vpxor (11 * 16)(%r11), %xmm11, %xmm11; + vpxor (12 * 16)(%r11), %xmm12, %xmm12; + vpxor (13 * 16)(%r11), %xmm13, %xmm13; + vpxor (14 * 16)(%r11), %xmm14, %xmm14; + vpxor (15 * 16)(%r11), %xmm15, %xmm15; + write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %r10); + + FRAME_END + RET; +SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way) diff --git a/arch/x86/crypto/aria-avx.h b/arch/x86/crypto/aria-avx.h new file mode 100644 index 000000000000..01e9a01dc157 --- /dev/null +++ b/arch/x86/crypto/aria-avx.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef ASM_X86_ARIA_AVX_H +#define ASM_X86_ARIA_AVX_H + +#include <linux/types.h> + +#define ARIA_AESNI_PARALLEL_BLOCKS 16 +#define ARIA_AESNI_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * 16) + +struct aria_avx_ops { + void (*aria_encrypt_16way)(const void *ctx, u8 *dst, const u8 *src); + void (*aria_decrypt_16way)(const void *ctx, u8 *dst, const u8 *src); + void (*aria_ctr_crypt_16way)(const void *ctx, u8 *dst, const u8 *src, + u8 *keystream, u8 *iv); +}; +#endif diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c new file mode 100644 index 000000000000..c561ea4fefa5 --- /dev/null +++ b/arch/x86/crypto/aria_aesni_avx_glue.c @@ -0,0 +1,213 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Glue Code for the AVX/AES-NI/GFNI assembler implementation of the ARIA Cipher + * + * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com> + */ + +#include <crypto/algapi.h> +#include <crypto/internal/simd.h> +#include <crypto/aria.h> +#include <linux/crypto.h> +#include <linux/err.h> +#include <linux/module.h> +#include <linux/types.h> + +#include "ecb_cbc_helpers.h" +#include "aria-avx.h" + +asmlinkage void aria_aesni_avx_encrypt_16way(const void *ctx, u8 *dst, + const u8 *src); +asmlinkage void aria_aesni_avx_decrypt_16way(const void *ctx, u8 *dst, + const u8 *src); +asmlinkage void aria_aesni_avx_ctr_crypt_16way(const void *ctx, u8 *dst, + const u8 *src, + u8 *keystream, u8 *iv); +asmlinkage void aria_aesni_avx_gfni_encrypt_16way(const void *ctx, u8 *dst, + const u8 *src); +asmlinkage void aria_aesni_avx_gfni_decrypt_16way(const void *ctx, u8 *dst, + const u8 *src); +asmlinkage void 
aria_aesni_avx_gfni_ctr_crypt_16way(const void *ctx, u8 *dst, + const u8 *src, + u8 *keystream, u8 *iv); + +static struct aria_avx_ops aria_ops; + +static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey) +{ + ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_encrypt_16way); + ECB_BLOCK(1, aria_encrypt); + ECB_WALK_END(); +} + +static int ecb_do_decrypt(struct skcipher_request *req, const u32 *rkey) +{ + ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_decrypt_16way); + ECB_BLOCK(1, aria_decrypt); + ECB_WALK_END(); +} + +static int aria_avx_ecb_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aria_ctx *ctx = crypto_skcipher_ctx(tfm); + + return ecb_do_encrypt(req, ctx->enc_key[0]); +} + +static int aria_avx_ecb_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aria_ctx *ctx = crypto_skcipher_ctx(tfm); + + return ecb_do_decrypt(req, ctx->dec_key[0]); +} + +static int aria_avx_set_key(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) +{ + return aria_set_key(&tfm->base, key, keylen); +} + +static int aria_avx_ctr_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aria_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + while (nbytes >= ARIA_AESNI_PARALLEL_BLOCK_SIZE) { + u8 keystream[ARIA_AESNI_PARALLEL_BLOCK_SIZE]; + + kernel_fpu_begin(); + aria_ops.aria_ctr_crypt_16way(ctx, dst, src, keystream, + walk.iv); + kernel_fpu_end(); + dst += ARIA_AESNI_PARALLEL_BLOCK_SIZE; + src += ARIA_AESNI_PARALLEL_BLOCK_SIZE; + nbytes -= ARIA_AESNI_PARALLEL_BLOCK_SIZE; + } + + while (nbytes >= ARIA_BLOCK_SIZE) { + u8 keystream[ARIA_BLOCK_SIZE]; + + memcpy(keystream, walk.iv, ARIA_BLOCK_SIZE); + crypto_inc(walk.iv, ARIA_BLOCK_SIZE); + + aria_encrypt(ctx, keystream, keystream); + + crypto_xor_cpy(dst, src, keystream, ARIA_BLOCK_SIZE); + dst += ARIA_BLOCK_SIZE; + src += ARIA_BLOCK_SIZE; + nbytes -= ARIA_BLOCK_SIZE; + } + + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[ARIA_BLOCK_SIZE]; + + memcpy(keystream, walk.iv, ARIA_BLOCK_SIZE); + crypto_inc(walk.iv, ARIA_BLOCK_SIZE); + + aria_encrypt(ctx, keystream, keystream); + + crypto_xor_cpy(dst, src, keystream, nbytes); + dst += nbytes; + src += nbytes; + nbytes = 0; + } + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +static struct skcipher_alg aria_algs[] = { + { + .base.cra_name = "__ecb(aria)", + .base.cra_driver_name = "__ecb-aria-avx", + .base.cra_priority = 400, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = ARIA_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct aria_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = ARIA_MIN_KEY_SIZE, + .max_keysize = ARIA_MAX_KEY_SIZE, + .setkey = aria_avx_set_key, + .encrypt = aria_avx_ecb_encrypt, + .decrypt = aria_avx_ecb_decrypt, + }, { + .base.cra_name = "__ctr(aria)", + .base.cra_driver_name = "__ctr-aria-avx", + .base.cra_priority = 400, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct aria_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = 
ARIA_MIN_KEY_SIZE, + .max_keysize = ARIA_MAX_KEY_SIZE, + .ivsize = ARIA_BLOCK_SIZE, + .chunksize = ARIA_BLOCK_SIZE, + .walksize = 16 * ARIA_BLOCK_SIZE, + .setkey = aria_avx_set_key, + .encrypt = aria_avx_ctr_encrypt, + .decrypt = aria_avx_ctr_encrypt, + } +}; + +static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)]; + +static int __init aria_avx_init(void) +{ + const char *feature_name; + + if (!boot_cpu_has(X86_FEATURE_AVX) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE)) { + pr_info("AVX or AES-NI instructions are not detected.\n"); + return -ENODEV; + } + + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); + return -ENODEV; + } + + if (boot_cpu_has(X86_FEATURE_GFNI)) { + aria_ops.aria_encrypt_16way = aria_aesni_avx_gfni_encrypt_16way; + aria_ops.aria_decrypt_16way = aria_aesni_avx_gfni_decrypt_16way; + aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_gfni_ctr_crypt_16way; + } else { + aria_ops.aria_encrypt_16way = aria_aesni_avx_encrypt_16way; + aria_ops.aria_decrypt_16way = aria_aesni_avx_decrypt_16way; + aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_ctr_crypt_16way; + } + + return simd_register_skciphers_compat(aria_algs, + ARRAY_SIZE(aria_algs), + aria_simd_algs); +} + +static void __exit aria_avx_exit(void) +{ + simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs), + aria_simd_algs); +} + +module_init(aria_avx_init); +module_exit(aria_avx_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Taehee Yoo <ap420073@gmail.com>"); +MODULE_DESCRIPTION("ARIA Cipher Algorithm, AVX/AES-NI/GFNI optimized"); +MODULE_ALIAS_CRYPTO("aria"); +MODULE_ALIAS_CRYPTO("aria-aesni-avx"); diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S index 24910b766bdd..b50b35ff1fdb 100644 --- a/arch/x86/crypto/blake2s-core.S +++ b/arch/x86/crypto/blake2s-core.S @@ -46,7 +46,6 @@ SIGMA2: #endif /* CONFIG_AS_AVX512 */ .text -#ifdef CONFIG_AS_SSSE3 SYM_FUNC_START(blake2s_compress_ssse3) testq %rdx,%rdx je .Lendofloop @@ -172,9 +171,8 @@ SYM_FUNC_START(blake2s_compress_ssse3) movdqu %xmm1,0x10(%rdi) movdqu %xmm14,0x20(%rdi) .Lendofloop: - ret + RET SYM_FUNC_END(blake2s_compress_ssse3) -#endif /* CONFIG_AS_SSSE3 */ #ifdef CONFIG_AS_AVX512 SYM_FUNC_START(blake2s_compress_avx512) @@ -253,6 +251,6 @@ SYM_FUNC_START(blake2s_compress_avx512) vmovdqu %xmm1,0x10(%rdi) vmovdqu %xmm4,0x20(%rdi) vzeroupper - retq + RET SYM_FUNC_END(blake2s_compress_avx512) #endif /* CONFIG_AS_AVX512 */ diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c index 06ef2d4a4701..aaba21230528 100644 --- a/arch/x86/crypto/blake2s-glue.c +++ b/arch/x86/crypto/blake2s-glue.c @@ -4,13 +4,12 @@ */ #include <crypto/internal/blake2s.h> -#include <crypto/internal/simd.h> -#include <crypto/internal/hash.h> #include <linux/types.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/sizes.h> #include <asm/cpufeature.h> #include <asm/fpu/api.h> @@ -27,21 +26,20 @@ asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); -void blake2s_compress_arch(struct blake2s_state *state, - const u8 *block, size_t nblocks, - const u32 inc) +void blake2s_compress(struct blake2s_state *state, const u8 *block, + size_t nblocks, const u32 inc) { /* SIMD disables preemption, so relax after processing 
each page. */ - BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8); + BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); - if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) { + if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { blake2s_compress_generic(state, block, nblocks, inc); return; } - for (;;) { + do { const size_t blocks = min_t(size_t, nblocks, - PAGE_SIZE / BLAKE2S_BLOCK_SIZE); + SZ_4K / BLAKE2S_BLOCK_SIZE); kernel_fpu_begin(); if (IS_ENABLED(CONFIG_AS_AVX512) && @@ -52,152 +50,15 @@ void blake2s_compress_arch(struct blake2s_state *state, kernel_fpu_end(); nblocks -= blocks; - if (!nblocks) - break; block += blocks * BLAKE2S_BLOCK_SIZE; - } -} -EXPORT_SYMBOL(blake2s_compress_arch); - -static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key, - unsigned int keylen) -{ - struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm); - - if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) - return -EINVAL; - - memcpy(tctx->key, key, keylen); - tctx->keylen = keylen; - - return 0; -} - -static int crypto_blake2s_init(struct shash_desc *desc) -{ - struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - struct blake2s_state *state = shash_desc_ctx(desc); - const int outlen = crypto_shash_digestsize(desc->tfm); - - if (tctx->keylen) - blake2s_init_key(state, outlen, tctx->key, tctx->keylen); - else - blake2s_init(state, outlen); - - return 0; -} - -static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in, - unsigned int inlen) -{ - struct blake2s_state *state = shash_desc_ctx(desc); - const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; - - if (unlikely(!inlen)) - return 0; - if (inlen > fill) { - memcpy(state->buf + state->buflen, in, fill); - blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); - state->buflen = 0; - in += fill; - inlen -= fill; - } - if (inlen > BLAKE2S_BLOCK_SIZE) { - const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); - /* Hash one less (full) block than strictly possible */ - blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); - in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); - inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); - } - memcpy(state->buf + state->buflen, in, inlen); - state->buflen += inlen; - - return 0; -} - -static int crypto_blake2s_final(struct shash_desc *desc, u8 *out) -{ - struct blake2s_state *state = shash_desc_ctx(desc); - - blake2s_set_lastblock(state); - memset(state->buf + state->buflen, 0, - BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ - blake2s_compress_arch(state, state->buf, 1, state->buflen); - cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); - memcpy(out, state->h, state->outlen); - memzero_explicit(state, sizeof(*state)); - - return 0; + } while (nblocks); } - -static struct shash_alg blake2s_algs[] = {{ - .base.cra_name = "blake2s-128", - .base.cra_driver_name = "blake2s-128-x86", - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), - .base.cra_priority = 200, - .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - - .digestsize = BLAKE2S_128_HASH_SIZE, - .setkey = crypto_blake2s_setkey, - .init = crypto_blake2s_init, - .update = crypto_blake2s_update, - .final = crypto_blake2s_final, - .descsize = sizeof(struct blake2s_state), -}, { - .base.cra_name = "blake2s-160", - .base.cra_driver_name = "blake2s-160-x86", - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), - .base.cra_priority = 200, - .base.cra_blocksize = 
BLAKE2S_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - - .digestsize = BLAKE2S_160_HASH_SIZE, - .setkey = crypto_blake2s_setkey, - .init = crypto_blake2s_init, - .update = crypto_blake2s_update, - .final = crypto_blake2s_final, - .descsize = sizeof(struct blake2s_state), -}, { - .base.cra_name = "blake2s-224", - .base.cra_driver_name = "blake2s-224-x86", - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), - .base.cra_priority = 200, - .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - - .digestsize = BLAKE2S_224_HASH_SIZE, - .setkey = crypto_blake2s_setkey, - .init = crypto_blake2s_init, - .update = crypto_blake2s_update, - .final = crypto_blake2s_final, - .descsize = sizeof(struct blake2s_state), -}, { - .base.cra_name = "blake2s-256", - .base.cra_driver_name = "blake2s-256-x86", - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), - .base.cra_priority = 200, - .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - - .digestsize = BLAKE2S_256_HASH_SIZE, - .setkey = crypto_blake2s_setkey, - .init = crypto_blake2s_init, - .update = crypto_blake2s_update, - .final = crypto_blake2s_final, - .descsize = sizeof(struct blake2s_state), -}}; +EXPORT_SYMBOL(blake2s_compress); static int __init blake2s_mod_init(void) { - if (!boot_cpu_has(X86_FEATURE_SSSE3)) - return 0; - - static_branch_enable(&blake2s_use_ssse3); + if (boot_cpu_has(X86_FEATURE_SSSE3)) + static_branch_enable(&blake2s_use_ssse3); if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) && @@ -208,26 +69,9 @@ static int __init blake2s_mod_init(void) XFEATURE_MASK_AVX512, NULL)) static_branch_enable(&blake2s_use_avx512); - return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? 
- crypto_register_shashes(blake2s_algs, - ARRAY_SIZE(blake2s_algs)) : 0; -} - -static void __exit blake2s_mod_exit(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3)) - crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); + return 0; } module_init(blake2s_mod_init); -module_exit(blake2s_mod_exit); -MODULE_ALIAS_CRYPTO("blake2s-128"); -MODULE_ALIAS_CRYPTO("blake2s-128-x86"); -MODULE_ALIAS_CRYPTO("blake2s-160"); -MODULE_ALIAS_CRYPTO("blake2s-160-x86"); -MODULE_ALIAS_CRYPTO("blake2s-224"); -MODULE_ALIAS_CRYPTO("blake2s-224-x86"); -MODULE_ALIAS_CRYPTO("blake2s-256"); -MODULE_ALIAS_CRYPTO("blake2s-256-x86"); MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S index 4222ac6d6584..4a43e072d2d1 100644 --- a/arch/x86/crypto/blowfish-x86_64-asm_64.S +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S @@ -6,6 +6,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> .file "blowfish-x86_64-asm.S" .text @@ -135,13 +136,13 @@ SYM_FUNC_START(__blowfish_enc_blk) jnz .L__enc_xor; write_block(); - ret; + RET; .L__enc_xor: xor_block(); - ret; + RET; SYM_FUNC_END(__blowfish_enc_blk) -SYM_FUNC_START(blowfish_dec_blk) +SYM_TYPED_FUNC_START(blowfish_dec_blk) /* input: * %rdi: ctx * %rsi: dst @@ -170,7 +171,7 @@ SYM_FUNC_START(blowfish_dec_blk) movq %r11, %r12; - ret; + RET; SYM_FUNC_END(blowfish_dec_blk) /********************************************************************** @@ -322,17 +323,17 @@ SYM_FUNC_START(__blowfish_enc_blk_4way) popq %rbx; popq %r12; - ret; + RET; .L__enc_xor4: xor_block4(); popq %rbx; popq %r12; - ret; + RET; SYM_FUNC_END(__blowfish_enc_blk_4way) -SYM_FUNC_START(blowfish_dec_blk_4way) +SYM_TYPED_FUNC_START(blowfish_dec_blk_4way) /* input: * %rdi: ctx * %rsi: dst @@ -364,5 +365,5 @@ SYM_FUNC_START(blowfish_dec_blk_4way) popq %rbx; popq %r12; - ret; + RET; SYM_FUNC_END(blowfish_dec_blk_4way) diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index cedfdba69ce3..019c64c1340a 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -6,8 +6,6 @@ * * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 
2007 - Joy Latten <latten@us.ibm.com> */ #include <crypto/algapi.h> @@ -34,24 +32,12 @@ static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src) __blowfish_enc_blk(ctx, dst, src, false); } -static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst, - const u8 *src) -{ - __blowfish_enc_blk(ctx, dst, src, true); -} - static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, const u8 *src) { __blowfish_enc_blk_4way(ctx, dst, src, false); } -static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst, - const u8 *src) -{ - __blowfish_enc_blk_4way(ctx, dst, src, true); -} - static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { blowfish_enc_blk(crypto_tfm_ctx(tfm), dst, src); @@ -158,7 +144,7 @@ static int cbc_encrypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk.nbytes)) { + while (walk.nbytes) { nbytes = __cbc_encrypt(ctx, &walk); err = skcipher_walk_done(&walk, nbytes); } @@ -239,7 +225,7 @@ static int cbc_decrypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk.nbytes)) { + while (walk.nbytes) { nbytes = __cbc_decrypt(ctx, &walk); err = skcipher_walk_done(&walk, nbytes); } @@ -247,97 +233,6 @@ static int cbc_decrypt(struct skcipher_request *req) return err; } -static void ctr_crypt_final(struct bf_ctx *ctx, struct skcipher_walk *walk) -{ - u8 *ctrblk = walk->iv; - u8 keystream[BF_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - blowfish_enc_blk(ctx, keystream, ctrblk); - crypto_xor_cpy(dst, keystream, src, nbytes); - - crypto_inc(ctrblk, BF_BLOCK_SIZE); -} - -static unsigned int __ctr_crypt(struct bf_ctx *ctx, struct skcipher_walk *walk) -{ - unsigned int bsize = BF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); - __be64 ctrblocks[4]; - - /* Process four block batch */ - if (nbytes >= bsize * 4) { - do { - if (dst != src) { - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - } - - /* create ctrblks for parallel encrypt */ - ctrblocks[0] = cpu_to_be64(ctrblk++); - ctrblocks[1] = cpu_to_be64(ctrblk++); - ctrblocks[2] = cpu_to_be64(ctrblk++); - ctrblocks[3] = cpu_to_be64(ctrblk++); - - blowfish_enc_blk_xor_4way(ctx, (u8 *)dst, - (u8 *)ctrblocks); - - src += 4; - dst += 4; - } while ((nbytes -= bsize * 4) >= bsize * 4); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - if (dst != src) - *dst = *src; - - ctrblocks[0] = cpu_to_be64(ctrblk++); - - blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks); - - src += 1; - dst += 1; - } while ((nbytes -= bsize) >= bsize); - -done: - *(__be64 *)walk->iv = cpu_to_be64(ctrblk); - return nbytes; -} - -static int ctr_crypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct bf_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) { - nbytes = __ctr_crypt(ctx, &walk); - err = skcipher_walk_done(&walk, nbytes); - } - - if (nbytes) { - ctr_crypt_final(ctx, &walk); - err = skcipher_walk_done(&walk, 0); - } - - return err; -} - static struct crypto_alg bf_cipher_alg = { .cra_name = "blowfish", .cra_driver_name = "blowfish-asm", @@ 
-384,20 +279,6 @@ static struct skcipher_alg bf_skcipher_algs[] = { .setkey = blowfish_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "ctr(blowfish)", - .base.cra_driver_name = "ctr-blowfish-asm", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct bf_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .ivsize = BF_BLOCK_SIZE, - .chunksize = BF_BLOCK_SIZE, - .setkey = blowfish_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, }, }; @@ -422,7 +303,7 @@ static int force; module_param(force, int, 0); MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); -static int __init init(void) +static int __init blowfish_init(void) { int err; @@ -446,15 +327,15 @@ static int __init init(void) return err; } -static void __exit fini(void) +static void __exit blowfish_fini(void) { crypto_unregister_alg(&bf_cipher_alg); crypto_unregister_skciphers(bf_skcipher_algs, ARRAY_SIZE(bf_skcipher_algs)); } -module_init(init); -module_exit(fini); +module_init(blowfish_init); +module_exit(blowfish_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Blowfish Cipher Algorithm, asm optimized"); diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index d01ddd73de65..2e1658ddbe1a 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -17,7 +17,6 @@ #include <linux/linkage.h> #include <asm/frame.h> -#include <asm/nospec-branch.h> #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -193,7 +192,7 @@ SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_c roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rcx, (%r9)); - ret; + RET; SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) .align 8 @@ -201,7 +200,7 @@ SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_a roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11, %rax, (%r9)); - ret; + RET; SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) /* @@ -589,14 +588,6 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .long 0x80808080 .long 0x80808080 -/* For CTR-mode IV byteswap */ -.Lbswap128_mask: - .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - -/* For XTS mode IV generation */ -.Lxts_gf128mul_and_shl1_mask: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 - /* * pre-SubByte transform * @@ -787,7 +778,7 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk16) %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax)); FRAME_END - ret; + RET; .align 8 .Lenc_max32: @@ -874,7 +865,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk16) %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax)); FRAME_END - ret; + RET; .align 8 .Ldec_max32: @@ -915,7 +906,7 @@ SYM_FUNC_START(camellia_ecb_enc_16way) %xmm8, %rsi); FRAME_END - ret; + RET; SYM_FUNC_END(camellia_ecb_enc_16way) SYM_FUNC_START(camellia_ecb_dec_16way) @@ -945,7 +936,7 @@ SYM_FUNC_START(camellia_ecb_dec_16way) %xmm8, %rsi); FRAME_END - ret; + RET; SYM_FUNC_END(camellia_ecb_dec_16way) SYM_FUNC_START(camellia_cbc_dec_16way) @@ -996,294 +987,5 @@ SYM_FUNC_START(camellia_cbc_dec_16way) %xmm8, %rsi); FRAME_END - ret; + RET; SYM_FUNC_END(camellia_cbc_dec_16way) - -#define inc_le128(x, minus_one, tmp) \ - 
vpcmpeqq minus_one, x, tmp; \ - vpsubq minus_one, x, x; \ - vpslldq $8, tmp, tmp; \ - vpsubq tmp, x, x; - -SYM_FUNC_START(camellia_ctr_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - - subq $(16 * 16), %rsp; - movq %rsp, %rax; - - vmovdqa .Lbswap128_mask, %xmm14; - - /* load IV and byteswap */ - vmovdqu (%rcx), %xmm0; - vpshufb %xmm14, %xmm0, %xmm15; - vmovdqu %xmm15, 15 * 16(%rax); - - vpcmpeqd %xmm15, %xmm15, %xmm15; - vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */ - - /* construct IVs */ - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm13; - vmovdqu %xmm13, 14 * 16(%rax); - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm13; - vmovdqu %xmm13, 13 * 16(%rax); - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm12; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm11; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm10; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm9; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm8; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm7; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm6; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm5; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm4; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm3; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm2; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm1; - inc_le128(%xmm0, %xmm15, %xmm13); - vmovdqa %xmm0, %xmm13; - vpshufb %xmm14, %xmm0, %xmm0; - inc_le128(%xmm13, %xmm15, %xmm14); - vmovdqu %xmm13, (%rcx); - - /* inpack16_pre: */ - vmovq (key_table)(CTX), %xmm15; - vpshufb .Lpack_bswap, %xmm15, %xmm15; - vpxor %xmm0, %xmm15, %xmm0; - vpxor %xmm1, %xmm15, %xmm1; - vpxor %xmm2, %xmm15, %xmm2; - vpxor %xmm3, %xmm15, %xmm3; - vpxor %xmm4, %xmm15, %xmm4; - vpxor %xmm5, %xmm15, %xmm5; - vpxor %xmm6, %xmm15, %xmm6; - vpxor %xmm7, %xmm15, %xmm7; - vpxor %xmm8, %xmm15, %xmm8; - vpxor %xmm9, %xmm15, %xmm9; - vpxor %xmm10, %xmm15, %xmm10; - vpxor %xmm11, %xmm15, %xmm11; - vpxor %xmm12, %xmm15, %xmm12; - vpxor 13 * 16(%rax), %xmm15, %xmm13; - vpxor 14 * 16(%rax), %xmm15, %xmm14; - vpxor 15 * 16(%rax), %xmm15, %xmm15; - - call __camellia_enc_blk16; - - addq $(16 * 16), %rsp; - - vpxor 0 * 16(%rdx), %xmm7, %xmm7; - vpxor 1 * 16(%rdx), %xmm6, %xmm6; - vpxor 2 * 16(%rdx), %xmm5, %xmm5; - vpxor 3 * 16(%rdx), %xmm4, %xmm4; - vpxor 4 * 16(%rdx), %xmm3, %xmm3; - vpxor 5 * 16(%rdx), %xmm2, %xmm2; - vpxor 6 * 16(%rdx), %xmm1, %xmm1; - vpxor 7 * 16(%rdx), %xmm0, %xmm0; - vpxor 8 * 16(%rdx), %xmm15, %xmm15; - vpxor 9 * 16(%rdx), %xmm14, %xmm14; - vpxor 10 * 16(%rdx), %xmm13, %xmm13; - vpxor 11 * 16(%rdx), %xmm12, %xmm12; - vpxor 12 * 16(%rdx), %xmm11, %xmm11; - vpxor 13 * 16(%rdx), %xmm10, %xmm10; - vpxor 14 * 16(%rdx), %xmm9, %xmm9; - vpxor 15 * 16(%rdx), %xmm8, %xmm8; - write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, - %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, - %xmm8, %rsi); - - FRAME_END - ret; -SYM_FUNC_END(camellia_ctr_16way) - -#define gf128mul_x_ble(iv, mask, tmp) \ - vpsrad $31, iv, tmp; \ - vpaddq iv, iv, iv; \ - vpshufd $0x13, tmp, tmp; \ - vpand mask, tmp, tmp; \ - vpxor tmp, iv, iv; - -.align 8 -SYM_FUNC_START_LOCAL(camellia_xts_crypt_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - 
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - * %r8: index for input whitening key - * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16 - */ - FRAME_BEGIN - - subq $(16 * 16), %rsp; - movq %rsp, %rax; - - vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14; - - /* load IV */ - vmovdqu (%rcx), %xmm0; - vpxor 0 * 16(%rdx), %xmm0, %xmm15; - vmovdqu %xmm15, 15 * 16(%rax); - vmovdqu %xmm0, 0 * 16(%rsi); - - /* construct IVs */ - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 1 * 16(%rdx), %xmm0, %xmm15; - vmovdqu %xmm15, 14 * 16(%rax); - vmovdqu %xmm0, 1 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 2 * 16(%rdx), %xmm0, %xmm13; - vmovdqu %xmm0, 2 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 3 * 16(%rdx), %xmm0, %xmm12; - vmovdqu %xmm0, 3 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 4 * 16(%rdx), %xmm0, %xmm11; - vmovdqu %xmm0, 4 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 5 * 16(%rdx), %xmm0, %xmm10; - vmovdqu %xmm0, 5 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 6 * 16(%rdx), %xmm0, %xmm9; - vmovdqu %xmm0, 6 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 7 * 16(%rdx), %xmm0, %xmm8; - vmovdqu %xmm0, 7 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 8 * 16(%rdx), %xmm0, %xmm7; - vmovdqu %xmm0, 8 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 9 * 16(%rdx), %xmm0, %xmm6; - vmovdqu %xmm0, 9 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 10 * 16(%rdx), %xmm0, %xmm5; - vmovdqu %xmm0, 10 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 11 * 16(%rdx), %xmm0, %xmm4; - vmovdqu %xmm0, 11 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 12 * 16(%rdx), %xmm0, %xmm3; - vmovdqu %xmm0, 12 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 13 * 16(%rdx), %xmm0, %xmm2; - vmovdqu %xmm0, 13 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 14 * 16(%rdx), %xmm0, %xmm1; - vmovdqu %xmm0, 14 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vpxor 15 * 16(%rdx), %xmm0, %xmm15; - vmovdqu %xmm15, 0 * 16(%rax); - vmovdqu %xmm0, 15 * 16(%rsi); - - gf128mul_x_ble(%xmm0, %xmm14, %xmm15); - vmovdqu %xmm0, (%rcx); - - /* inpack16_pre: */ - vmovq (key_table)(CTX, %r8, 8), %xmm15; - vpshufb .Lpack_bswap, %xmm15, %xmm15; - vpxor 0 * 16(%rax), %xmm15, %xmm0; - vpxor %xmm1, %xmm15, %xmm1; - vpxor %xmm2, %xmm15, %xmm2; - vpxor %xmm3, %xmm15, %xmm3; - vpxor %xmm4, %xmm15, %xmm4; - vpxor %xmm5, %xmm15, %xmm5; - vpxor %xmm6, %xmm15, %xmm6; - vpxor %xmm7, %xmm15, %xmm7; - vpxor %xmm8, %xmm15, %xmm8; - vpxor %xmm9, %xmm15, %xmm9; - vpxor %xmm10, %xmm15, %xmm10; - vpxor %xmm11, %xmm15, %xmm11; - vpxor %xmm12, %xmm15, %xmm12; - vpxor %xmm13, %xmm15, %xmm13; - vpxor 14 * 16(%rax), %xmm15, %xmm14; - vpxor 15 * 16(%rax), %xmm15, %xmm15; - - CALL_NOSPEC %r9; - - addq $(16 * 16), %rsp; - - vpxor 0 * 16(%rsi), %xmm7, %xmm7; - vpxor 1 * 16(%rsi), %xmm6, %xmm6; - vpxor 2 * 16(%rsi), %xmm5, %xmm5; - vpxor 3 * 16(%rsi), %xmm4, %xmm4; - vpxor 4 * 16(%rsi), %xmm3, %xmm3; - vpxor 5 * 16(%rsi), %xmm2, %xmm2; - vpxor 6 * 16(%rsi), %xmm1, %xmm1; - vpxor 7 * 16(%rsi), %xmm0, %xmm0; - vpxor 8 * 16(%rsi), %xmm15, %xmm15; - vpxor 9 * 16(%rsi), %xmm14, %xmm14; - vpxor 10 * 16(%rsi), %xmm13, %xmm13; - vpxor 11 * 16(%rsi), %xmm12, %xmm12; - vpxor 12 * 16(%rsi), %xmm11, %xmm11; - vpxor 13 * 16(%rsi), %xmm10, %xmm10; - vpxor 14 * 16(%rsi), %xmm9, %xmm9; - vpxor 15 * 16(%rsi), %xmm8, %xmm8; - write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, - 
%xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, - %xmm8, %rsi); - - FRAME_END - ret; -SYM_FUNC_END(camellia_xts_crypt_16way) - -SYM_FUNC_START(camellia_xts_enc_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - xorl %r8d, %r8d; /* input whitening key, 0 for enc */ - - leaq __camellia_enc_blk16, %r9; - - jmp camellia_xts_crypt_16way; -SYM_FUNC_END(camellia_xts_enc_16way) - -SYM_FUNC_START(camellia_xts_dec_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - - cmpl $16, key_length(CTX); - movl $32, %r8d; - movl $24, %eax; - cmovel %eax, %r8d; /* input whitening key, last for dec */ - - leaq __camellia_dec_blk16, %r9; - - jmp camellia_xts_crypt_16way; -SYM_FUNC_END(camellia_xts_dec_16way) diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 563ef6e83cdd..0e4e9abbf4de 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -7,7 +7,6 @@ #include <linux/linkage.h> #include <asm/frame.h> -#include <asm/nospec-branch.h> #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -227,7 +226,7 @@ SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_c roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rcx, (%r9)); - ret; + RET; SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) .align 8 @@ -235,7 +234,7 @@ SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_a roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3, %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11, %rax, (%r9)); - ret; + RET; SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) /* @@ -625,16 +624,6 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .section .rodata.cst16, "aM", @progbits, 16 .align 16 -/* For CTR-mode IV byteswap */ -.Lbswap128_mask: - .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - -/* For XTS mode */ -.Lxts_gf128mul_and_shl1_mask_0: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 -.Lxts_gf128mul_and_shl1_mask_1: - .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 - /* * pre-SubByte transform * @@ -825,7 +814,7 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk32) %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax)); FRAME_END - ret; + RET; .align 8 .Lenc_max32: @@ -912,7 +901,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk32) %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); FRAME_END - ret; + RET; .align 8 .Ldec_max32: @@ -957,7 +946,7 @@ SYM_FUNC_START(camellia_ecb_enc_32way) vzeroupper; FRAME_END - ret; + RET; SYM_FUNC_END(camellia_ecb_enc_32way) SYM_FUNC_START(camellia_ecb_dec_32way) @@ -991,7 +980,7 @@ SYM_FUNC_START(camellia_ecb_dec_32way) vzeroupper; FRAME_END - ret; + RET; SYM_FUNC_END(camellia_ecb_dec_32way) SYM_FUNC_START(camellia_cbc_dec_32way) @@ -1001,6 +990,7 @@ SYM_FUNC_START(camellia_cbc_dec_32way) * %rdx: src (32 blocks) */ FRAME_BEGIN + subq $(16 * 32), %rsp; vzeroupper; @@ -1013,7 +1003,6 @@ SYM_FUNC_START(camellia_cbc_dec_32way) %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rdx, (key_table)(CTX, %r8, 8)); - movq %rsp, %r10; cmpq %rsi, %rdx; je .Lcbc_dec_use_stack; @@ -1026,7 +1015,6 @@ SYM_FUNC_START(camellia_cbc_dec_32way) * dst still in-use (because dst == src), so use stack for 
temporary * storage. */ - subq $(16 * 32), %rsp; movq %rsp, %rax; .Lcbc_dec_continue: @@ -1036,7 +1024,6 @@ SYM_FUNC_START(camellia_cbc_dec_32way) vpxor %ymm7, %ymm7, %ymm7; vinserti128 $1, (%rdx), %ymm7, %ymm7; vpxor (%rax), %ymm7, %ymm7; - movq %r10, %rsp; vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6; vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5; vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4; @@ -1058,346 +1045,7 @@ SYM_FUNC_START(camellia_cbc_dec_32way) vzeroupper; - FRAME_END - ret; -SYM_FUNC_END(camellia_cbc_dec_32way) - -#define inc_le128(x, minus_one, tmp) \ - vpcmpeqq minus_one, x, tmp; \ - vpsubq minus_one, x, x; \ - vpslldq $8, tmp, tmp; \ - vpsubq tmp, x, x; - -#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ - vpcmpeqq minus_one, x, tmp1; \ - vpcmpeqq minus_two, x, tmp2; \ - vpsubq minus_two, x, x; \ - vpor tmp2, tmp1, tmp1; \ - vpslldq $8, tmp1, tmp1; \ - vpsubq tmp1, x, x; - -SYM_FUNC_START(camellia_ctr_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (32 blocks) - * %rdx: src (32 blocks) - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - - vzeroupper; - - movq %rsp, %r10; - cmpq %rsi, %rdx; - je .Lctr_use_stack; - - /* dst can be used as temporary storage, src is not overwritten. */ - movq %rsi, %rax; - jmp .Lctr_continue; - -.Lctr_use_stack: - subq $(16 * 32), %rsp; - movq %rsp, %rax; - -.Lctr_continue: - vpcmpeqd %ymm15, %ymm15, %ymm15; - vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */ - vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */ - - /* load IV and byteswap */ - vmovdqu (%rcx), %xmm0; - vmovdqa %xmm0, %xmm1; - inc_le128(%xmm0, %xmm15, %xmm14); - vbroadcasti128 .Lbswap128_mask, %ymm14; - vinserti128 $1, %xmm0, %ymm1, %ymm0; - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 15 * 32(%rax); - - /* construct IVs */ - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */ - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 14 * 32(%rax); - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 13 * 32(%rax); - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 12 * 32(%rax); - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm13; - vmovdqu %ymm13, 11 * 32(%rax); - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm10; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm9; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm8; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm7; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm6; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm5; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm4; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm3; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm2; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vpshufb %ymm14, %ymm0, %ymm1; - add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); - vextracti128 $1, %ymm0, %xmm13; - vpshufb %ymm14, %ymm0, %ymm0; - inc_le128(%xmm13, %xmm15, %xmm14); - vmovdqu %xmm13, (%rcx); - - /* inpack32_pre: */ - vpbroadcastq (key_table)(CTX), %ymm15; - vpshufb .Lpack_bswap, %ymm15, %ymm15; - vpxor %ymm0, %ymm15, %ymm0; - vpxor %ymm1, %ymm15, %ymm1; - vpxor %ymm2, %ymm15, %ymm2; - vpxor %ymm3, %ymm15, %ymm3; - vpxor %ymm4, 
%ymm15, %ymm4; - vpxor %ymm5, %ymm15, %ymm5; - vpxor %ymm6, %ymm15, %ymm6; - vpxor %ymm7, %ymm15, %ymm7; - vpxor %ymm8, %ymm15, %ymm8; - vpxor %ymm9, %ymm15, %ymm9; - vpxor %ymm10, %ymm15, %ymm10; - vpxor 11 * 32(%rax), %ymm15, %ymm11; - vpxor 12 * 32(%rax), %ymm15, %ymm12; - vpxor 13 * 32(%rax), %ymm15, %ymm13; - vpxor 14 * 32(%rax), %ymm15, %ymm14; - vpxor 15 * 32(%rax), %ymm15, %ymm15; - - call __camellia_enc_blk32; - - movq %r10, %rsp; - - vpxor 0 * 32(%rdx), %ymm7, %ymm7; - vpxor 1 * 32(%rdx), %ymm6, %ymm6; - vpxor 2 * 32(%rdx), %ymm5, %ymm5; - vpxor 3 * 32(%rdx), %ymm4, %ymm4; - vpxor 4 * 32(%rdx), %ymm3, %ymm3; - vpxor 5 * 32(%rdx), %ymm2, %ymm2; - vpxor 6 * 32(%rdx), %ymm1, %ymm1; - vpxor 7 * 32(%rdx), %ymm0, %ymm0; - vpxor 8 * 32(%rdx), %ymm15, %ymm15; - vpxor 9 * 32(%rdx), %ymm14, %ymm14; - vpxor 10 * 32(%rdx), %ymm13, %ymm13; - vpxor 11 * 32(%rdx), %ymm12, %ymm12; - vpxor 12 * 32(%rdx), %ymm11, %ymm11; - vpxor 13 * 32(%rdx), %ymm10, %ymm10; - vpxor 14 * 32(%rdx), %ymm9, %ymm9; - vpxor 15 * 32(%rdx), %ymm8, %ymm8; - write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, - %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, - %ymm8, %rsi); - - vzeroupper; - - FRAME_END - ret; -SYM_FUNC_END(camellia_ctr_32way) - -#define gf128mul_x_ble(iv, mask, tmp) \ - vpsrad $31, iv, tmp; \ - vpaddq iv, iv, iv; \ - vpshufd $0x13, tmp, tmp; \ - vpand mask, tmp, tmp; \ - vpxor tmp, iv, iv; - -#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \ - vpsrad $31, iv, tmp0; \ - vpaddq iv, iv, tmp1; \ - vpsllq $2, iv, iv; \ - vpshufd $0x13, tmp0, tmp0; \ - vpsrad $31, tmp1, tmp1; \ - vpand mask2, tmp0, tmp0; \ - vpshufd $0x13, tmp1, tmp1; \ - vpxor tmp0, iv, iv; \ - vpand mask1, tmp1, tmp1; \ - vpxor tmp1, iv, iv; - -.align 8 -SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (32 blocks) - * %rdx: src (32 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - * %r8: index for input whitening key - * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32 - */ - FRAME_BEGIN - - vzeroupper; - - subq $(16 * 32), %rsp; - movq %rsp, %rax; - - vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12; - - /* load IV and construct second IV */ - vmovdqu (%rcx), %xmm0; - vmovdqa %xmm0, %xmm15; - gf128mul_x_ble(%xmm0, %xmm12, %xmm13); - vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13; - vinserti128 $1, %xmm0, %ymm15, %ymm0; - vpxor 0 * 32(%rdx), %ymm0, %ymm15; - vmovdqu %ymm15, 15 * 32(%rax); - vmovdqu %ymm0, 0 * 32(%rsi); - - /* construct IVs */ - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 1 * 32(%rdx), %ymm0, %ymm15; - vmovdqu %ymm15, 14 * 32(%rax); - vmovdqu %ymm0, 1 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 2 * 32(%rdx), %ymm0, %ymm15; - vmovdqu %ymm15, 13 * 32(%rax); - vmovdqu %ymm0, 2 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 3 * 32(%rdx), %ymm0, %ymm15; - vmovdqu %ymm15, 12 * 32(%rax); - vmovdqu %ymm0, 3 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 4 * 32(%rdx), %ymm0, %ymm11; - vmovdqu %ymm0, 4 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 5 * 32(%rdx), %ymm0, %ymm10; - vmovdqu %ymm0, 5 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 6 * 32(%rdx), %ymm0, %ymm9; - vmovdqu %ymm0, 6 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 7 * 32(%rdx), %ymm0, %ymm8; - vmovdqu %ymm0, 7 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, 
%ymm13, %ymm14, %ymm15); - vpxor 8 * 32(%rdx), %ymm0, %ymm7; - vmovdqu %ymm0, 8 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 9 * 32(%rdx), %ymm0, %ymm6; - vmovdqu %ymm0, 9 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 10 * 32(%rdx), %ymm0, %ymm5; - vmovdqu %ymm0, 10 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 11 * 32(%rdx), %ymm0, %ymm4; - vmovdqu %ymm0, 11 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 12 * 32(%rdx), %ymm0, %ymm3; - vmovdqu %ymm0, 12 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 13 * 32(%rdx), %ymm0, %ymm2; - vmovdqu %ymm0, 13 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 14 * 32(%rdx), %ymm0, %ymm1; - vmovdqu %ymm0, 14 * 32(%rsi); - - gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); - vpxor 15 * 32(%rdx), %ymm0, %ymm15; - vmovdqu %ymm15, 0 * 32(%rax); - vmovdqu %ymm0, 15 * 32(%rsi); - - vextracti128 $1, %ymm0, %xmm0; - gf128mul_x_ble(%xmm0, %xmm12, %xmm15); - vmovdqu %xmm0, (%rcx); - - /* inpack32_pre: */ - vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; - vpshufb .Lpack_bswap, %ymm15, %ymm15; - vpxor 0 * 32(%rax), %ymm15, %ymm0; - vpxor %ymm1, %ymm15, %ymm1; - vpxor %ymm2, %ymm15, %ymm2; - vpxor %ymm3, %ymm15, %ymm3; - vpxor %ymm4, %ymm15, %ymm4; - vpxor %ymm5, %ymm15, %ymm5; - vpxor %ymm6, %ymm15, %ymm6; - vpxor %ymm7, %ymm15, %ymm7; - vpxor %ymm8, %ymm15, %ymm8; - vpxor %ymm9, %ymm15, %ymm9; - vpxor %ymm10, %ymm15, %ymm10; - vpxor %ymm11, %ymm15, %ymm11; - vpxor 12 * 32(%rax), %ymm15, %ymm12; - vpxor 13 * 32(%rax), %ymm15, %ymm13; - vpxor 14 * 32(%rax), %ymm15, %ymm14; - vpxor 15 * 32(%rax), %ymm15, %ymm15; - - CALL_NOSPEC %r9; - addq $(16 * 32), %rsp; - - vpxor 0 * 32(%rsi), %ymm7, %ymm7; - vpxor 1 * 32(%rsi), %ymm6, %ymm6; - vpxor 2 * 32(%rsi), %ymm5, %ymm5; - vpxor 3 * 32(%rsi), %ymm4, %ymm4; - vpxor 4 * 32(%rsi), %ymm3, %ymm3; - vpxor 5 * 32(%rsi), %ymm2, %ymm2; - vpxor 6 * 32(%rsi), %ymm1, %ymm1; - vpxor 7 * 32(%rsi), %ymm0, %ymm0; - vpxor 8 * 32(%rsi), %ymm15, %ymm15; - vpxor 9 * 32(%rsi), %ymm14, %ymm14; - vpxor 10 * 32(%rsi), %ymm13, %ymm13; - vpxor 11 * 32(%rsi), %ymm12, %ymm12; - vpxor 12 * 32(%rsi), %ymm11, %ymm11; - vpxor 13 * 32(%rsi), %ymm10, %ymm10; - vpxor 14 * 32(%rsi), %ymm9, %ymm9; - vpxor 15 * 32(%rsi), %ymm8, %ymm8; - write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, - %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, - %ymm8, %rsi); - - vzeroupper; - FRAME_END - ret; -SYM_FUNC_END(camellia_xts_crypt_32way) - -SYM_FUNC_START(camellia_xts_enc_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (32 blocks) - * %rdx: src (32 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - - xorl %r8d, %r8d; /* input whitening key, 0 for enc */ - - leaq __camellia_enc_blk32, %r9; - - jmp camellia_xts_crypt_32way; -SYM_FUNC_END(camellia_xts_enc_32way) - -SYM_FUNC_START(camellia_xts_dec_32way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (32 blocks) - * %rdx: src (32 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - - cmpl $16, key_length(CTX); - movl $32, %r8d; - movl $24, %eax; - cmovel %eax, %r8d; /* input whitening key, last for dec */ - - leaq __camellia_dec_blk32, %r9; - - jmp camellia_xts_crypt_32way; -SYM_FUNC_END(camellia_xts_dec_32way) + RET; +SYM_FUNC_END(camellia_cbc_dec_32way) diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S index 1372e6408850..347c059f5940 100644 --- 
a/arch/x86/crypto/camellia-x86_64-asm_64.S +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -213,13 +213,13 @@ SYM_FUNC_START(__camellia_enc_blk) enc_outunpack(mov, RT1); movq RR12, %r12; - ret; + RET; .L__enc_xor: enc_outunpack(xor, RT1); movq RR12, %r12; - ret; + RET; SYM_FUNC_END(__camellia_enc_blk) SYM_FUNC_START(camellia_dec_blk) @@ -257,7 +257,7 @@ SYM_FUNC_START(camellia_dec_blk) dec_outunpack(); movq RR12, %r12; - ret; + RET; SYM_FUNC_END(camellia_dec_blk) /********************************************************************** @@ -448,14 +448,14 @@ SYM_FUNC_START(__camellia_enc_blk_2way) movq RR12, %r12; popq %rbx; - ret; + RET; .L__enc2_xor: enc_outunpack2(xor, RT2); movq RR12, %r12; popq %rbx; - ret; + RET; SYM_FUNC_END(__camellia_enc_blk_2way) SYM_FUNC_START(camellia_dec_blk_2way) @@ -495,5 +495,5 @@ SYM_FUNC_START(camellia_dec_blk_2way) movq RR12, %r12; movq RXOR, %rbx; - ret; + RET; SYM_FUNC_END(camellia_dec_blk_2way) diff --git a/arch/x86/crypto/camellia.h b/arch/x86/crypto/camellia.h new file mode 100644 index 000000000000..1dcea79e8f8e --- /dev/null +++ b/arch/x86/crypto/camellia.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_X86_CAMELLIA_H +#define ASM_X86_CAMELLIA_H + +#include <crypto/b128ops.h> +#include <linux/crypto.h> +#include <linux/kernel.h> + +#define CAMELLIA_MIN_KEY_SIZE 16 +#define CAMELLIA_MAX_KEY_SIZE 32 +#define CAMELLIA_BLOCK_SIZE 16 +#define CAMELLIA_TABLE_BYTE_LEN 272 +#define CAMELLIA_PARALLEL_BLOCKS 2 + +struct crypto_skcipher; + +struct camellia_ctx { + u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)]; + u32 key_length; +}; + +extern int __camellia_setkey(struct camellia_ctx *cctx, + const unsigned char *key, + unsigned int key_len); + +/* regular block cipher functions */ +asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src, + bool xor); +asmlinkage void camellia_dec_blk(const void *ctx, u8 *dst, const u8 *src); + +/* 2-way parallel cipher functions */ +asmlinkage void __camellia_enc_blk_2way(const void *ctx, u8 *dst, const u8 *src, + bool xor); +asmlinkage void camellia_dec_blk_2way(const void *ctx, u8 *dst, const u8 *src); + +/* 16-way parallel cipher functions (avx/aes-ni) */ +asmlinkage void camellia_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void camellia_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src); + +asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); + +static inline void camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src) +{ + __camellia_enc_blk(ctx, dst, src, false); +} + +static inline void camellia_enc_blk_xor(const void *ctx, u8 *dst, const u8 *src) +{ + __camellia_enc_blk(ctx, dst, src, true); +} + +static inline void camellia_enc_blk_2way(const void *ctx, u8 *dst, + const u8 *src) +{ + __camellia_enc_blk_2way(ctx, dst, src, false); +} + +static inline void camellia_enc_blk_xor_2way(const void *ctx, u8 *dst, + const u8 *src) +{ + __camellia_enc_blk_2way(ctx, dst, src, true); +} + +/* glue helpers */ +extern void camellia_decrypt_cbc_2way(const void *ctx, u8 *dst, const u8 *src); + +#endif /* ASM_X86_CAMELLIA_H */ diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index ccda647422d6..e7e4d64e9577 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -5,16 +5,16 @@ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> */ -#include <asm/crypto/camellia.h> -#include <asm/crypto/glue_helper.h> #include 
<crypto/algapi.h> #include <crypto/internal/simd.h> -#include <crypto/xts.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> #include <linux/types.h> +#include "camellia.h" +#include "ecb_cbc_helpers.h" + #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 #define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32 @@ -23,121 +23,6 @@ asmlinkage void camellia_ecb_enc_32way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void camellia_ecb_dec_32way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void camellia_cbc_dec_32way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void camellia_ctr_32way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - -asmlinkage void camellia_xts_enc_32way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -asmlinkage void camellia_xts_dec_32way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - -static const struct common_glue_ctx camellia_enc = { - .num_funcs = 4, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_enc_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_enc_16way } - }, { - .num_blocks = 2, - .fn_u = { .ecb = camellia_enc_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_enc_blk } - } } -}; - -static const struct common_glue_ctx camellia_ctr = { - .num_funcs = 4, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .ctr = camellia_ctr_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ctr = camellia_ctr_16way } - }, { - .num_blocks = 2, - .fn_u = { .ctr = camellia_crypt_ctr_2way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = camellia_crypt_ctr } - } } -}; - -static const struct common_glue_ctx camellia_enc_xts = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .xts = camellia_xts_enc_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = camellia_xts_enc_16way } - }, { - .num_blocks = 1, - .fn_u = { .xts = camellia_xts_enc } - } } -}; - -static const struct common_glue_ctx camellia_dec = { - .num_funcs = 4, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_dec_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_dec_16way } - }, { - .num_blocks = 2, - .fn_u = { .ecb = camellia_dec_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_dec_blk } - } } -}; - -static const struct common_glue_ctx camellia_dec_cbc = { - .num_funcs = 4, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .cbc = camellia_cbc_dec_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .cbc = camellia_cbc_dec_16way } - }, { - .num_blocks = 2, - .fn_u = { .cbc = camellia_decrypt_cbc_2way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = camellia_dec_blk } - } } -}; - -static const struct common_glue_ctx camellia_dec_xts = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, - .fn_u = { .xts = camellia_xts_dec_32way } - }, { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = 
camellia_xts_dec_16way } - }, { - .num_blocks = 1, - .fn_u = { .xts = camellia_xts_dec } - } } -}; static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -147,45 +32,39 @@ static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_enc, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_ecb_enc_32way); + ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_enc_16way); + ECB_BLOCK(2, camellia_enc_blk_2way); + ECB_BLOCK(1, camellia_enc_blk); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_dec, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_ecb_dec_32way); + ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_dec_16way); + ECB_BLOCK(2, camellia_dec_blk_2way); + ECB_BLOCK(1, camellia_dec_blk); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(camellia_enc_blk); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&camellia_ctr, req); -} - -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, true); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_cbc_dec_32way); + CBC_DEC_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_cbc_dec_16way); + CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way); + CBC_DEC_BLOCK(1, camellia_dec_blk); + CBC_WALK_END(); } static struct skcipher_alg camellia_algs[] = { @@ -216,35 +95,6 @@ static struct skcipher_alg camellia_algs[] = { .setkey = camellia_setkey, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(camellia)", - .base.cra_driver_name = "__ctr-camellia-aesni-avx2", - .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct camellia_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .chunksize = CAMELLIA_BLOCK_SIZE, - .setkey = camellia_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(camellia)", - .base.cra_driver_name = "__xts-camellia-aesni-avx2", - .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct camellia_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * CAMELLIA_MIN_KEY_SIZE, - .max_keysize = 2 
* CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = xts_camellia_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, }, }; diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 4e5de6ef206e..c7ccf63e741e 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -5,16 +5,16 @@ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> */ -#include <asm/crypto/camellia.h> -#include <asm/crypto/glue_helper.h> #include <crypto/algapi.h> #include <crypto/internal/simd.h> -#include <crypto/xts.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> #include <linux/types.h> +#include "camellia.h" +#include "ecb_cbc_helpers.h" + #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 /* 16-way parallel cipher functions (avx/aes-ni) */ @@ -27,120 +27,6 @@ EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way); asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way); -asmlinkage void camellia_ctr_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -EXPORT_SYMBOL_GPL(camellia_ctr_16way); - -asmlinkage void camellia_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -EXPORT_SYMBOL_GPL(camellia_xts_enc_16way); - -asmlinkage void camellia_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -EXPORT_SYMBOL_GPL(camellia_xts_dec_16way); - -void camellia_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_enc_blk); -} -EXPORT_SYMBOL_GPL(camellia_xts_enc); - -void camellia_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_dec_blk); -} -EXPORT_SYMBOL_GPL(camellia_xts_dec); - -static const struct common_glue_ctx camellia_enc = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_enc_16way } - }, { - .num_blocks = 2, - .fn_u = { .ecb = camellia_enc_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_enc_blk } - } } -}; - -static const struct common_glue_ctx camellia_ctr = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ctr = camellia_ctr_16way } - }, { - .num_blocks = 2, - .fn_u = { .ctr = camellia_crypt_ctr_2way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = camellia_crypt_ctr } - } } -}; - -static const struct common_glue_ctx camellia_enc_xts = { - .num_funcs = 2, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = camellia_xts_enc_16way } - }, { - .num_blocks = 1, - .fn_u = { .xts = camellia_xts_enc } - } } -}; - -static const struct common_glue_ctx camellia_dec = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .ecb = camellia_ecb_dec_16way } - }, { - .num_blocks = 2, - .fn_u = { .ecb = camellia_dec_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_dec_blk } - } } -}; - -static const struct common_glue_ctx camellia_dec_cbc = { - .num_funcs = 3, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .cbc = camellia_cbc_dec_16way } - }, { - 
.num_blocks = 2, - .fn_u = { .cbc = camellia_decrypt_cbc_2way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = camellia_dec_blk } - } } -}; - -static const struct common_glue_ctx camellia_dec_xts = { - .num_funcs = 2, - .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, - .fn_u = { .xts = camellia_xts_dec_16way } - }, { - .num_blocks = 1, - .fn_u = { .xts = camellia_xts_dec } - } } -}; - static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { @@ -149,65 +35,36 @@ static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_enc, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_enc_16way); + ECB_BLOCK(2, camellia_enc_blk_2way); + ECB_BLOCK(1, camellia_enc_blk); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_dec, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_dec_16way); + ECB_BLOCK(2, camellia_dec_blk_2way); + ECB_BLOCK(1, camellia_dec_blk); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(camellia_enc_blk); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&camellia_ctr, req); -} - -int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen) -{ - struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = xts_verify_key(tfm, key, keylen); - if (err) - return err; - - /* first half of xts-key is for crypt */ - err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); -} -EXPORT_SYMBOL_GPL(xts_camellia_setkey); - -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, true); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_cbc_dec_16way); + CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way); + CBC_DEC_BLOCK(1, camellia_dec_blk); + CBC_WALK_END(); } static struct skcipher_alg camellia_algs[] = { @@ -238,36 +95,7 @@ static struct skcipher_alg camellia_algs[] = { .setkey = camellia_setkey, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(camellia)", - .base.cra_driver_name = "__ctr-camellia-aesni", - .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - 
.base.cra_ctxsize = sizeof(struct camellia_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .chunksize = CAMELLIA_BLOCK_SIZE, - .setkey = camellia_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(camellia)", - .base.cra_driver_name = "__xts-camellia-aesni", - .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct camellia_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * CAMELLIA_MIN_KEY_SIZE, - .max_keysize = 2 * CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = xts_camellia_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, + } }; static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)]; diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 242c056e5fa8..d45e9c0c42ac 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -14,8 +14,9 @@ #include <linux/module.h> #include <linux/types.h> #include <crypto/algapi.h> -#include <asm/crypto/camellia.h> -#include <asm/crypto/glue_helper.h> + +#include "camellia.h" +#include "ecb_cbc_helpers.h" /* regular block cipher functions */ asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src, @@ -1262,129 +1263,47 @@ static int camellia_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, return camellia_setkey(&tfm->base, key, key_len); } -void camellia_decrypt_cbc_2way(const void *ctx, u8 *d, const u8 *s) +void camellia_decrypt_cbc_2way(const void *ctx, u8 *dst, const u8 *src) { - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - u128 iv = *src; - - camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src); + u8 buf[CAMELLIA_BLOCK_SIZE]; + const u8 *iv = src; - u128_xor(&dst[1], &dst[1], &iv); + if (dst == src) + iv = memcpy(buf, iv, sizeof(buf)); + camellia_dec_blk_2way(ctx, dst, src); + crypto_xor(dst + CAMELLIA_BLOCK_SIZE, iv, CAMELLIA_BLOCK_SIZE); } EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way); -void camellia_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblk; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - if (dst != src) - *dst = *src; - - le128_to_be128(&ctrblk, iv); - le128_inc(iv); - - camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); -} -EXPORT_SYMBOL_GPL(camellia_crypt_ctr); - -void camellia_crypt_ctr_2way(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblks[2]; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - if (dst != src) { - dst[0] = src[0]; - dst[1] = src[1]; - } - - le128_to_be128(&ctrblks[0], iv); - le128_inc(iv); - le128_to_be128(&ctrblks[1], iv); - le128_inc(iv); - - camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks); -} -EXPORT_SYMBOL_GPL(camellia_crypt_ctr_2way); - -static const struct common_glue_ctx camellia_enc = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 2, - .fn_u = { .ecb = camellia_enc_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_enc_blk } - } } -}; - -static const struct common_glue_ctx camellia_ctr = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 2, - .fn_u = { .ctr = camellia_crypt_ctr_2way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = camellia_crypt_ctr } - } } -}; - -static const struct common_glue_ctx camellia_dec = { - .num_funcs = 2, - .fpu_blocks_limit 
= -1, - - .funcs = { { - .num_blocks = 2, - .fn_u = { .ecb = camellia_dec_blk_2way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = camellia_dec_blk } - } } -}; - -static const struct common_glue_ctx camellia_dec_cbc = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 2, - .fn_u = { .cbc = camellia_decrypt_cbc_2way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = camellia_dec_blk } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_enc, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + ECB_BLOCK(2, camellia_enc_blk_2way); + ECB_BLOCK(1, camellia_enc_blk); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&camellia_dec, req); + ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + ECB_BLOCK(2, camellia_dec_blk_2way); + ECB_BLOCK(1, camellia_dec_blk); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(camellia_enc_blk); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&camellia_ctr, req); + CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); + CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way); + CBC_DEC_BLOCK(1, camellia_dec_blk); + CBC_WALK_END(); } static struct crypto_alg camellia_cipher_alg = { @@ -1433,20 +1352,6 @@ static struct skcipher_alg camellia_skcipher_algs[] = { .setkey = camellia_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "ctr(camellia)", - .base.cra_driver_name = "ctr-camellia-asm", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct camellia_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .chunksize = CAMELLIA_BLOCK_SIZE, - .setkey = camellia_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, } }; @@ -1472,7 +1377,7 @@ static int force; module_param(force, int, 0); MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); -static int __init init(void) +static int __init camellia_init(void) { int err; @@ -1496,15 +1401,15 @@ static int __init init(void) return err; } -static void __exit fini(void) +static void __exit camellia_fini(void) { crypto_unregister_alg(&camellia_cipher_alg); crypto_unregister_skciphers(camellia_skcipher_algs, ARRAY_SIZE(camellia_skcipher_algs)); } -module_init(init); -module_exit(fini); +module_init(camellia_init); +module_exit(camellia_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Camellia Cipher Algorithm, asm optimized"); diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index 8a6181b08b59..b258af420c92 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -279,7 +279,7 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16) outunpack_blocks(RR3, RL3, RTMP, RX, RKM); outunpack_blocks(RR4, RL4, RTMP, RX, RKM); - ret; + RET; SYM_FUNC_END(__cast5_enc_blk16) .align 16 @@ -352,7 +352,7 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16) outunpack_blocks(RR3, RL3, RTMP, RX, RKM); outunpack_blocks(RR4, RL4, RTMP, RX, RKM); - ret; + RET; .L__skip_dec: vpsrldq $4, RKR, RKR; @@ -393,7 +393,7 @@ 
SYM_FUNC_START(cast5_ecb_enc_16way) popq %r15; FRAME_END - ret; + RET; SYM_FUNC_END(cast5_ecb_enc_16way) SYM_FUNC_START(cast5_ecb_dec_16way) @@ -431,7 +431,7 @@ SYM_FUNC_START(cast5_ecb_dec_16way) popq %r15; FRAME_END - ret; + RET; SYM_FUNC_END(cast5_ecb_dec_16way) SYM_FUNC_START(cast5_cbc_dec_16way) @@ -483,7 +483,7 @@ SYM_FUNC_START(cast5_cbc_dec_16way) popq %r15; popq %r12; FRAME_END - ret; + RET; SYM_FUNC_END(cast5_cbc_dec_16way) SYM_FUNC_START(cast5_ctr_16way) @@ -559,5 +559,5 @@ SYM_FUNC_START(cast5_ctr_16way) popq %r15; popq %r12; FRAME_END - ret; + RET; SYM_FUNC_END(cast5_ctr_16way) diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 384ccb00f9e1..3976a87f92ad 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -6,7 +6,6 @@ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> */ -#include <asm/crypto/glue_helper.h> #include <crypto/algapi.h> #include <crypto/cast5.h> #include <crypto/internal/simd.h> @@ -15,6 +14,8 @@ #include <linux/module.h> #include <linux/types.h> +#include "ecb_cbc_helpers.h" + #define CAST5_PARALLEL_BLOCKS 16 asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst, @@ -23,8 +24,6 @@ asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src); asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src); -asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src, - __be64 *iv); static int cast5_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -32,272 +31,35 @@ static int cast5_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, return cast5_setkey(&tfm->base, key, keylen); } -static inline bool cast5_fpu_begin(bool fpu_enabled, struct skcipher_walk *walk, - unsigned int nbytes) -{ - return glue_fpu_begin(CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS, - walk, fpu_enabled, nbytes); -} - -static inline void cast5_fpu_end(bool fpu_enabled) -{ - return glue_fpu_end(fpu_enabled); -} - -static int ecb_crypt(struct skcipher_request *req, bool enc) -{ - bool fpu_enabled = false; - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - const unsigned int bsize = CAST5_BLOCK_SIZE; - unsigned int nbytes; - void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src); - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - u8 *wsrc = walk.src.virt.addr; - u8 *wdst = walk.dst.virt.addr; - - fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); - - /* Process multi-block batch */ - if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { - fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way; - do { - fn(ctx, wdst, wsrc); - - wsrc += bsize * CAST5_PARALLEL_BLOCKS; - wdst += bsize * CAST5_PARALLEL_BLOCKS; - nbytes -= bsize * CAST5_PARALLEL_BLOCKS; - } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - fn = (enc) ? 
__cast5_encrypt : __cast5_decrypt; - - /* Handle leftovers */ - do { - fn(ctx, wdst, wsrc); - - wsrc += bsize; - wdst += bsize; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - err = skcipher_walk_done(&walk, nbytes); - } - - cast5_fpu_end(fpu_enabled); - return err; -} - static int ecb_encrypt(struct skcipher_request *req) { - return ecb_crypt(req, true); + ECB_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS); + ECB_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_ecb_enc_16way); + ECB_BLOCK(1, __cast5_encrypt); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return ecb_crypt(req, false); + ECB_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS); + ECB_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_ecb_dec_16way); + ECB_BLOCK(1, __cast5_decrypt); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - const unsigned int bsize = CAST5_BLOCK_SIZE; - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - u64 *src = (u64 *)walk.src.virt.addr; - u64 *dst = (u64 *)walk.dst.virt.addr; - u64 *iv = (u64 *)walk.iv; - - do { - *dst = *src ^ *iv; - __cast5_encrypt(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - src++; - dst++; - nbytes -= bsize; - } while (nbytes >= bsize); - - *(u64 *)walk.iv = *iv; - err = skcipher_walk_done(&walk, nbytes); - } - - return err; -} - -static unsigned int __cbc_decrypt(struct cast5_ctx *ctx, - struct skcipher_walk *walk) -{ - const unsigned int bsize = CAST5_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - u64 last_iv; - - /* Start of the last block. 
*/ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - /* Process multi-block batch */ - if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { - do { - nbytes -= bsize * (CAST5_PARALLEL_BLOCKS - 1); - src -= CAST5_PARALLEL_BLOCKS - 1; - dst -= CAST5_PARALLEL_BLOCKS - 1; - - cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - *dst ^= *(src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS); - } - - /* Handle leftovers */ - for (;;) { - __cast5_decrypt(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - break; - - *dst ^= *(src - 1); - src -= 1; - dst -= 1; - } - -done: - *dst ^= *(u64 *)walk->iv; - *(u64 *)walk->iv = last_iv; - - return nbytes; + CBC_WALK_START(req, CAST5_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(__cast5_encrypt); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); - bool fpu_enabled = false; - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); - nbytes = __cbc_decrypt(ctx, &walk); - err = skcipher_walk_done(&walk, nbytes); - } - - cast5_fpu_end(fpu_enabled); - return err; -} - -static void ctr_crypt_final(struct skcipher_walk *walk, struct cast5_ctx *ctx) -{ - u8 *ctrblk = walk->iv; - u8 keystream[CAST5_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - __cast5_encrypt(ctx, keystream, ctrblk); - crypto_xor_cpy(dst, keystream, src, nbytes); - - crypto_inc(ctrblk, CAST5_BLOCK_SIZE); -} - -static unsigned int __ctr_crypt(struct skcipher_walk *walk, - struct cast5_ctx *ctx) -{ - const unsigned int bsize = CAST5_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - - /* Process multi-block batch */ - if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { - do { - cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src, - (__be64 *)walk->iv); - - src += CAST5_PARALLEL_BLOCKS; - dst += CAST5_PARALLEL_BLOCKS; - nbytes -= bsize * CAST5_PARALLEL_BLOCKS; - } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - u64 ctrblk; - - if (dst != src) - *dst = *src; - - ctrblk = *(u64 *)walk->iv; - be64_add_cpu((__be64 *)walk->iv, 1); - - __cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); - *dst ^= ctrblk; - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - return nbytes; -} - -static int ctr_crypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); - bool fpu_enabled = false; - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) { - fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); - nbytes = __ctr_crypt(&walk, ctx); - err = skcipher_walk_done(&walk, nbytes); - } - - cast5_fpu_end(fpu_enabled); - - if (walk.nbytes) { - ctr_crypt_final(&walk, ctx); - err = skcipher_walk_done(&walk, 0); - } - - return err; + CBC_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_cbc_dec_16way); + 
CBC_DEC_BLOCK(1, __cast5_decrypt); + CBC_WALK_END(); } static struct skcipher_alg cast5_algs[] = { @@ -328,21 +90,6 @@ static struct skcipher_alg cast5_algs[] = { .setkey = cast5_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(cast5)", - .base.cra_driver_name = "__ctr-cast5-avx", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct cast5_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = CAST5_MIN_KEY_SIZE, - .max_keysize = CAST5_MAX_KEY_SIZE, - .ivsize = CAST5_BLOCK_SIZE, - .chunksize = CAST5_BLOCK_SIZE, - .setkey = cast5_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, } }; diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index 932a3ce32a88..82b716fd5dba 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -212,8 +212,6 @@ .section .rodata.cst16, "aM", @progbits, 16 .align 16 -.Lxts_gf128mul_and_shl1_mask: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .Lbswap_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 .Lbswap128_mask: @@ -291,7 +289,7 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8) outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); - ret; + RET; SYM_FUNC_END(__cast6_enc_blk8) .align 8 @@ -338,7 +336,7 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8) outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); - ret; + RET; SYM_FUNC_END(__cast6_dec_blk8) SYM_FUNC_START(cast6_ecb_enc_8way) @@ -361,7 +359,7 @@ SYM_FUNC_START(cast6_ecb_enc_8way) popq %r15; FRAME_END - ret; + RET; SYM_FUNC_END(cast6_ecb_enc_8way) SYM_FUNC_START(cast6_ecb_dec_8way) @@ -384,7 +382,7 @@ SYM_FUNC_START(cast6_ecb_dec_8way) popq %r15; FRAME_END - ret; + RET; SYM_FUNC_END(cast6_ecb_dec_8way) SYM_FUNC_START(cast6_cbc_dec_8way) @@ -410,87 +408,5 @@ SYM_FUNC_START(cast6_cbc_dec_8way) popq %r15; popq %r12; FRAME_END - ret; + RET; SYM_FUNC_END(cast6_cbc_dec_8way) - -SYM_FUNC_START(cast6_ctr_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - pushq %r12; - pushq %r15 - - movq %rdi, CTX; - movq %rsi, %r11; - movq %rdx, %r12; - - load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, - RD2, RX, RKR, RKM); - - call __cast6_enc_blk8; - - store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - popq %r15; - popq %r12; - FRAME_END - ret; -SYM_FUNC_END(cast6_ctr_8way) - -SYM_FUNC_START(cast6_xts_enc_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - pushq %r15; - - movq %rdi, CTX - movq %rsi, %r11; - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, - RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask); - - call __cast6_enc_blk8; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - popq %r15; - FRAME_END - ret; -SYM_FUNC_END(cast6_xts_enc_8way) - -SYM_FUNC_START(cast6_xts_dec_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - pushq %r15; - - movq %rdi, CTX - movq %rsi, %r11; - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, 
RA2, RB2, RC2, RD2, - RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask); - - call __cast6_dec_blk8; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - popq %r15; - FRAME_END - ret; -SYM_FUNC_END(cast6_xts_dec_8way) diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 48e0f37796fa..7e2aea372349 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -15,8 +15,8 @@ #include <crypto/algapi.h> #include <crypto/cast6.h> #include <crypto/internal/simd.h> -#include <crypto/xts.h> -#include <asm/crypto/glue_helper.h> + +#include "ecb_cbc_helpers.h" #define CAST6_PARALLEL_BLOCKS 8 @@ -24,13 +24,6 @@ asmlinkage void cast6_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void cast6_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void cast6_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void cast6_ctr_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - -asmlinkage void cast6_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -asmlinkage void cast6_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); static int cast6_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -38,172 +31,35 @@ static int cast6_setkey_skcipher(struct crypto_skcipher *tfm, return cast6_setkey(&tfm->base, key, keylen); } -static void cast6_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_encrypt); -} - -static void cast6_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_decrypt); -} - -static void cast6_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblk; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - le128_to_be128(&ctrblk, iv); - le128_inc(iv); - - __cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); - u128_xor(dst, src, (u128 *)&ctrblk); -} - -static const struct common_glue_ctx cast6_enc = { - .num_funcs = 2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ecb = cast6_ecb_enc_8way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __cast6_encrypt } - } } -}; - -static const struct common_glue_ctx cast6_ctr = { - .num_funcs = 2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ctr = cast6_ctr_8way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = cast6_crypt_ctr } - } } -}; - -static const struct common_glue_ctx cast6_enc_xts = { - .num_funcs = 2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .xts = cast6_xts_enc_8way } - }, { - .num_blocks = 1, - .fn_u = { .xts = cast6_xts_enc } - } } -}; - -static const struct common_glue_ctx cast6_dec = { - .num_funcs = 2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ecb = cast6_ecb_dec_8way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __cast6_decrypt } - } } -}; - -static const struct common_glue_ctx cast6_dec_cbc = { - .num_funcs = 2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .cbc = cast6_cbc_dec_8way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = __cast6_decrypt } - } } -}; - -static const struct common_glue_ctx cast6_dec_xts = { - .num_funcs = 
2, - .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .xts = cast6_xts_dec_8way } - }, { - .num_blocks = 1, - .fn_u = { .xts = cast6_xts_dec } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&cast6_enc, req); + ECB_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS); + ECB_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_ecb_enc_8way); + ECB_BLOCK(1, __cast6_encrypt); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&cast6_dec, req); + ECB_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS); + ECB_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_ecb_dec_8way); + ECB_BLOCK(1, __cast6_decrypt); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(__cast6_encrypt, req); + CBC_WALK_START(req, CAST6_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(__cast6_encrypt); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&cast6_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&cast6_ctr, req); -} - -struct cast6_xts_ctx { - struct cast6_ctx tweak_ctx; - struct cast6_ctx crypt_ctx; -}; - -static int xts_cast6_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen) -{ - struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = xts_verify_key(tfm, key, keylen); - if (err) - return err; - - /* first half of xts-key is for crypt */ - err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return __cast6_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); -} - -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&cast6_enc_xts, req, __cast6_encrypt, - &ctx->tweak_ctx, &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&cast6_dec_xts, req, __cast6_encrypt, - &ctx->tweak_ctx, &ctx->crypt_ctx, true); + CBC_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_cbc_dec_8way); + CBC_DEC_BLOCK(1, __cast6_decrypt); + CBC_WALK_END(); } static struct skcipher_alg cast6_algs[] = { @@ -234,35 +90,6 @@ static struct skcipher_alg cast6_algs[] = { .setkey = cast6_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(cast6)", - .base.cra_driver_name = "__ctr-cast6-avx", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct cast6_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = CAST6_MIN_KEY_SIZE, - .max_keysize = CAST6_MAX_KEY_SIZE, - .ivsize = CAST6_BLOCK_SIZE, - .chunksize = CAST6_BLOCK_SIZE, - .setkey = cast6_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(cast6)", - .base.cra_driver_name = "__xts-cast6-avx", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = CAST6_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct cast6_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * CAST6_MIN_KEY_SIZE, - 
.max_keysize = 2 * CAST6_MAX_KEY_SIZE, - .ivsize = CAST6_BLOCK_SIZE, - .setkey = xts_cast6_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, }, }; diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S index ee9a40ab4109..f3d8fc018249 100644 --- a/arch/x86/crypto/chacha-avx2-x86_64.S +++ b/arch/x86/crypto/chacha-avx2-x86_64.S @@ -193,7 +193,7 @@ SYM_FUNC_START(chacha_2block_xor_avx2) .Ldone2: vzeroupper - ret + RET .Lxorpart2: # xor remaining bytes from partial register into output @@ -498,7 +498,7 @@ SYM_FUNC_START(chacha_4block_xor_avx2) .Ldone4: vzeroupper - ret + RET .Lxorpart4: # xor remaining bytes from partial register into output @@ -992,7 +992,7 @@ SYM_FUNC_START(chacha_8block_xor_avx2) .Ldone8: vzeroupper lea -8(%r10),%rsp - ret + RET .Lxorpart8: # xor remaining bytes from partial register into output diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S index bb193fde123a..259383e1ad44 100644 --- a/arch/x86/crypto/chacha-avx512vl-x86_64.S +++ b/arch/x86/crypto/chacha-avx512vl-x86_64.S @@ -166,13 +166,13 @@ SYM_FUNC_START(chacha_2block_xor_avx512vl) .Ldone2: vzeroupper - ret + RET .Lxorpart2: # xor remaining bytes from partial register into output mov %rcx,%rax and $0xf,%rcx - jz .Ldone8 + jz .Ldone2 mov %rax,%r9 and $~0xf,%r9 @@ -432,13 +432,13 @@ SYM_FUNC_START(chacha_4block_xor_avx512vl) .Ldone4: vzeroupper - ret + RET .Lxorpart4: # xor remaining bytes from partial register into output mov %rcx,%rax and $0xf,%rcx - jz .Ldone8 + jz .Ldone4 mov %rax,%r9 and $~0xf,%r9 @@ -812,7 +812,7 @@ SYM_FUNC_START(chacha_8block_xor_avx512vl) .Ldone8: vzeroupper - ret + RET .Lxorpart8: # xor remaining bytes from partial register into output diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S index a38ab2512a6f..7111949cd5b9 100644 --- a/arch/x86/crypto/chacha-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha-ssse3-x86_64.S @@ -108,7 +108,7 @@ SYM_FUNC_START_LOCAL(chacha_permute) sub $2,%r8d jnz .Ldoubleround - ret + RET SYM_FUNC_END(chacha_permute) SYM_FUNC_START(chacha_block_xor_ssse3) @@ -120,10 +120,10 @@ SYM_FUNC_START(chacha_block_xor_ssse3) FRAME_BEGIN # x0..3 = s0..3 - movdqa 0x00(%rdi),%xmm0 - movdqa 0x10(%rdi),%xmm1 - movdqa 0x20(%rdi),%xmm2 - movdqa 0x30(%rdi),%xmm3 + movdqu 0x00(%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqu 0x20(%rdi),%xmm2 + movdqu 0x30(%rdi),%xmm3 movdqa %xmm0,%xmm8 movdqa %xmm1,%xmm9 movdqa %xmm2,%xmm10 @@ -166,7 +166,7 @@ SYM_FUNC_START(chacha_block_xor_ssse3) .Ldone: FRAME_END - ret + RET .Lxorpart: # xor remaining bytes from partial register into output @@ -205,10 +205,10 @@ SYM_FUNC_START(hchacha_block_ssse3) # %edx: nrounds FRAME_BEGIN - movdqa 0x00(%rdi),%xmm0 - movdqa 0x10(%rdi),%xmm1 - movdqa 0x20(%rdi),%xmm2 - movdqa 0x30(%rdi),%xmm3 + movdqu 0x00(%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqu 0x20(%rdi),%xmm2 + movdqu 0x30(%rdi),%xmm3 mov %edx,%r8d call chacha_permute @@ -217,7 +217,7 @@ SYM_FUNC_START(hchacha_block_ssse3) movdqu %xmm3,0x10(%rsi) FRAME_END - ret + RET SYM_FUNC_END(hchacha_block_ssse3) SYM_FUNC_START(chacha_4block_xor_ssse3) @@ -762,7 +762,7 @@ SYM_FUNC_START(chacha_4block_xor_ssse3) .Ldone4: lea -8(%r10),%rsp - ret + RET .Lxorpart4: # xor remaining bytes from partial register into output diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 68a74953efaf..7b3a1cf0984b 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -12,10 +12,9 @@ #include 
<crypto/internal/skcipher.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/sizes.h> #include <asm/simd.h> -#define CHACHA_STATE_ALIGN 16 - asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, @@ -79,8 +78,7 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, } } - if (IS_ENABLED(CONFIG_AS_AVX2) && - static_branch_likely(&chacha_use_avx2)) { + if (static_branch_likely(&chacha_use_avx2)) { while (bytes >= CHACHA_BLOCK_SIZE * 8) { chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); bytes -= CHACHA_BLOCK_SIZE * 8; @@ -125,8 +123,6 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) { - state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); - if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) { hchacha_block_generic(state, stream, nrounds); } else { @@ -139,8 +135,6 @@ EXPORT_SYMBOL(hchacha_block_arch); void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) { - state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); - chacha_init_generic(state, key, iv); } EXPORT_SYMBOL(chacha_init_arch); @@ -148,30 +142,33 @@ EXPORT_SYMBOL(chacha_init_arch); void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { - state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); - if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() || bytes <= CHACHA_BLOCK_SIZE) return chacha_crypt_generic(state, dst, src, bytes, nrounds); - kernel_fpu_begin(); - chacha_dosimd(state, dst, src, bytes, nrounds); - kernel_fpu_end(); + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_fpu_begin(); + chacha_dosimd(state, dst, src, todo, nrounds); + kernel_fpu_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); } EXPORT_SYMBOL(chacha_crypt_arch); static int chacha_simd_stream_xor(struct skcipher_request *req, const struct chacha_ctx *ctx, const u8 *iv) { - u32 *state, state_buf[16 + 2] __aligned(8); + u32 state[CHACHA_STATE_WORDS] __aligned(8); struct skcipher_walk walk; int err; err = skcipher_walk_virt(&walk, req, false); - BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); - state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); - chacha_init_generic(state, ctx->key, iv); while (walk.nbytes > 0) { @@ -210,12 +207,10 @@ static int xchacha_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 *state, state_buf[16 + 2] __aligned(8); + u32 state[CHACHA_STATE_WORDS] __aligned(8); struct chacha_ctx subctx; u8 real_iv[16]; - BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); - state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); chacha_init_generic(state, ctx->key, req->iv); if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) { @@ -288,8 +283,7 @@ static int __init chacha_simd_mod_init(void) static_branch_enable(&chacha_use_simd); - if (IS_ENABLED(CONFIG_AS_AVX2) && - boot_cpu_has(X86_FEATURE_AVX) && + if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { static_branch_enable(&chacha_use_avx2); diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S index 9fd28ff65bc2..ca53e96996ac 100644 --- a/arch/x86/crypto/crc32-pclmul_asm.S +++ b/arch/x86/crypto/crc32-pclmul_asm.S @@ -1,26 +1,4 @@ -/* GPL HEADER 
START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see http://www.gnu.org/licenses - * - * Please visit http://www.xyratex.com/contact if you need additional - * information or have any questions. - * - * GPL HEADER END - */ - +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright 2012 Xyratex Technology Limited * @@ -38,7 +16,6 @@ */ #include <linux/linkage.h> -#include <asm/inst.h> .section .rodata @@ -129,17 +106,17 @@ loop_64:/* 64 bytes Full cache line folding */ #ifdef __x86_64__ movdqa %xmm4, %xmm8 #endif - PCLMULQDQ 00, CONSTANT, %xmm1 - PCLMULQDQ 00, CONSTANT, %xmm2 - PCLMULQDQ 00, CONSTANT, %xmm3 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x00, CONSTANT, %xmm2 + pclmulqdq $0x00, CONSTANT, %xmm3 #ifdef __x86_64__ - PCLMULQDQ 00, CONSTANT, %xmm4 + pclmulqdq $0x00, CONSTANT, %xmm4 #endif - PCLMULQDQ 0x11, CONSTANT, %xmm5 - PCLMULQDQ 0x11, CONSTANT, %xmm6 - PCLMULQDQ 0x11, CONSTANT, %xmm7 + pclmulqdq $0x11, CONSTANT, %xmm5 + pclmulqdq $0x11, CONSTANT, %xmm6 + pclmulqdq $0x11, CONSTANT, %xmm7 #ifdef __x86_64__ - PCLMULQDQ 0x11, CONSTANT, %xmm8 + pclmulqdq $0x11, CONSTANT, %xmm8 #endif pxor %xmm5, %xmm1 pxor %xmm6, %xmm2 @@ -149,8 +126,8 @@ loop_64:/* 64 bytes Full cache line folding */ #else /* xmm8 unsupported for x32 */ movdqa %xmm4, %xmm5 - PCLMULQDQ 00, CONSTANT, %xmm4 - PCLMULQDQ 0x11, CONSTANT, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm4 + pclmulqdq $0x11, CONSTANT, %xmm5 pxor %xmm5, %xmm4 #endif @@ -172,20 +149,20 @@ less_64:/* Folding cache line into 128bit */ prefetchnta (BUF) movdqa %xmm1, %xmm5 - PCLMULQDQ 0x00, CONSTANT, %xmm1 - PCLMULQDQ 0x11, CONSTANT, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 pxor %xmm5, %xmm1 pxor %xmm2, %xmm1 movdqa %xmm1, %xmm5 - PCLMULQDQ 0x00, CONSTANT, %xmm1 - PCLMULQDQ 0x11, CONSTANT, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 pxor %xmm5, %xmm1 pxor %xmm3, %xmm1 movdqa %xmm1, %xmm5 - PCLMULQDQ 0x00, CONSTANT, %xmm1 - PCLMULQDQ 0x11, CONSTANT, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 pxor %xmm5, %xmm1 pxor %xmm4, %xmm1 @@ -193,8 +170,8 @@ less_64:/* Folding cache line into 128bit */ jb fold_64 loop_16:/* Folding rest buffer into 128bit */ movdqa %xmm1, %xmm5 - PCLMULQDQ 0x00, CONSTANT, %xmm1 - PCLMULQDQ 0x11, CONSTANT, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 pxor %xmm5, %xmm1 pxor (BUF), %xmm1 sub $0x10, LEN @@ -205,7 +182,7 @@ loop_16:/* Folding rest buffer into 128bit */ fold_64: /* perform the last 64 bit fold, also adds 32 zeroes * to the input stream */ - PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ + pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ psrldq $0x08, %xmm1 pxor CONSTANT, %xmm1 @@ -220,7 +197,7 @@ fold_64: #endif psrldq $0x04, %xmm2 pand %xmm3, %xmm1 - PCLMULQDQ 0x00, CONSTANT, %xmm1 + pclmulqdq $0x00, 
CONSTANT, %xmm1 pxor %xmm2, %xmm1 /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ @@ -231,11 +208,11 @@ fold_64: #endif movdqa %xmm1, %xmm2 pand %xmm3, %xmm1 - PCLMULQDQ 0x10, CONSTANT, %xmm1 + pclmulqdq $0x10, CONSTANT, %xmm1 pand %xmm3, %xmm1 - PCLMULQDQ 0x00, CONSTANT, %xmm1 + pclmulqdq $0x00, CONSTANT, %xmm1 pxor %xmm2, %xmm1 - PEXTRD 0x01, %xmm1, %eax + pextrd $0x01, %xmm1, %eax - ret + RET SYM_FUNC_END(crc32_pclmul_le_16) diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 418bd88acac8..98cf3b4e4c9f 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -24,7 +24,7 @@ /* * Copyright 2012 Xyratex Technology Limited * - * Wrappers for kernel crypto shash api to pclmulqdq crc32 imlementation. + * Wrappers for kernel crypto shash api to pclmulqdq crc32 implementation. */ #include <linux/init.h> #include <linux/module.h> @@ -170,7 +170,7 @@ static struct shash_alg alg = { }; static const struct x86_cpu_id crc32pclmul_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), + X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, crc32pclmul_cpu_id); diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index c20d1b8a82c3..feccb5254c7e 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -28,9 +28,9 @@ #define SCALE_F sizeof(unsigned long) #ifdef CONFIG_X86_64 -#define REX_PRE "0x48, " +#define CRC32_INST "crc32q %1, %q0" #else -#define REX_PRE +#define CRC32_INST "crc32l %1, %0" #endif #ifdef CONFIG_X86_64 @@ -48,11 +48,8 @@ asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) { while (length--) { - __asm__ __volatile__( - ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1" - :"=S"(crc) - :"0"(crc), "c"(*data) - ); + asm("crc32b %1, %0" + : "+r" (crc) : "rm" (*data)); data++; } @@ -66,11 +63,8 @@ static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len unsigned long *ptmp = (unsigned long *)p; while (iquotient--) { - __asm__ __volatile__( - ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;" - :"=S"(crc) - :"0"(crc), "c"(*ptmp) - ); + asm(CRC32_INST + : "+r" (crc) : "rm" (*ptmp)); ptmp++; } @@ -221,7 +215,7 @@ static struct shash_alg alg = { }; static const struct x86_cpu_id crc32c_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_XMM4_2), + X86_MATCH_FEATURE(X86_FEATURE_XMM4_2, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, crc32c_cpu_id); diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 0e6690e3618c..ec35915f0901 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -43,7 +43,6 @@ * SOFTWARE. 
*/ -#include <asm/inst.h> #include <linux/linkage.h> #include <asm/nospec-branch.h> @@ -54,7 +53,7 @@ .endm .macro JMPTBL_ENTRY i -.word crc_\i - crc_array +.quad crc_\i .endm .macro JNC_LESS_THAN j @@ -75,7 +74,7 @@ .text SYM_FUNC_START(crc_pcl) -#define bufp %rdi +#define bufp rdi #define bufp_dw %edi #define bufp_w %di #define bufp_b %dil @@ -105,9 +104,9 @@ SYM_FUNC_START(crc_pcl) ## 1) ALIGN: ################################################################ - mov bufp, bufptmp # rdi = *buf - neg bufp - and $7, bufp # calculate the unalignment amount of + mov %bufp, bufptmp # rdi = *buf + neg %bufp + and $7, %bufp # calculate the unalignment amount of # the address je proc_block # Skip if aligned @@ -123,13 +122,13 @@ SYM_FUNC_START(crc_pcl) do_align: #### Calculate CRC of unaligned bytes of the buffer (if any) movq (bufptmp), tmp # load a quadward from the buffer - add bufp, bufptmp # align buffer pointer for quadword + add %bufp, bufptmp # align buffer pointer for quadword # processing - sub bufp, len # update buffer length + sub %bufp, len # update buffer length align_loop: crc32b %bl, crc_init_dw # compute crc32 of 1-byte shr $8, tmp # get next byte - dec bufp + dec %bufp jne align_loop proc_block: @@ -169,10 +168,7 @@ continue_block: xor crc2, crc2 ## branch into array - lea jump_table(%rip), bufp - movzxw (bufp, %rax, 2), len - lea crc_array(%rip), bufp - lea (bufp, len, 1), bufp + mov jump_table(,%rax,8), %bufp JMP_NOSPEC bufp ################################################################ @@ -199,6 +195,7 @@ crc_array: .altmacro LABEL crc_ %i .noaltmacro + ENDBR crc32q -i*8(block_0), crc_init crc32q -i*8(block_1), crc1 crc32q -i*8(block_2), crc2 @@ -208,6 +205,7 @@ LABEL crc_ %i .altmacro LABEL crc_ %i .noaltmacro + ENDBR crc32q -i*8(block_0), crc_init crc32q -i*8(block_1), crc1 # SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet @@ -218,17 +216,17 @@ LABEL crc_ %i ## 4) Combine three results: ################################################################ - lea (K_table-8)(%rip), bufp # first entry is for idx 1 + lea (K_table-8)(%rip), %bufp # first entry is for idx 1 shlq $3, %rax # rax *= 8 - pmovzxdq (bufp,%rax), %xmm0 # 2 consts: K1:K2 + pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 leal (%eax,%eax,2), %eax # rax *= 3 (total *24) subq %rax, tmp # tmp -= rax*24 movq crc_init, %xmm1 # CRC for block 1 - PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2 + pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 movq crc1, %xmm2 # CRC for block 2 - PCLMULQDQ 0x10, %xmm0, %xmm2 # Multiply by K1 + pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 pxor %xmm2,%xmm1 movq %xmm1, %rax @@ -241,6 +239,7 @@ LABEL crc_ %i ################################################################ LABEL crc_ 0 + ENDBR mov tmp, len cmp $128*24, tmp jae full_block @@ -310,7 +309,7 @@ do_return: popq %rsi popq %rdi popq %rbx - ret + RET SYM_FUNC_END(crc_pcl) .section .rodata, "a", @progbits diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S index b2533d63030e..721474abfb71 100644 --- a/arch/x86/crypto/crct10dif-pcl-asm_64.S +++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S @@ -257,7 +257,7 @@ SYM_FUNC_START(crc_t10dif_pcl) # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0. 
pextrw $0, %xmm0, %eax - ret + RET .align 16 .Lless_than_256_bytes: diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index 3c81e15b0873..71291d5af9f4 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -114,7 +114,7 @@ static struct shash_alg alg = { }; static const struct x86_cpu_id crct10dif_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), + X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id); diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c index eec7d2d24239..d55fa9e9b9e6 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -1,8 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +// SPDX-License-Identifier: GPL-2.0 OR MIT /* - * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved. - * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. + * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation */ #include <crypto/curve25519.h> @@ -12,2341 +11,1592 @@ #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/scatterlist.h> #include <asm/cpufeature.h> #include <asm/processor.h> -static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_adx); - -enum { NUM_WORDS_ELTFP25519 = 4 }; -typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519]; -typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519]; - -#define mul_eltfp25519_1w_adx(c, a, b) do { \ - mul_256x256_integer_adx(m.buffer, a, b); \ - red_eltfp25519_1w_adx(c, m.buffer); \ -} while (0) - -#define mul_eltfp25519_1w_bmi2(c, a, b) do { \ - mul_256x256_integer_bmi2(m.buffer, a, b); \ - red_eltfp25519_1w_bmi2(c, m.buffer); \ -} while (0) - -#define sqr_eltfp25519_1w_adx(a) do { \ - sqr_256x256_integer_adx(m.buffer, a); \ - red_eltfp25519_1w_adx(a, m.buffer); \ -} while (0) - -#define sqr_eltfp25519_1w_bmi2(a) do { \ - sqr_256x256_integer_bmi2(m.buffer, a); \ - red_eltfp25519_1w_bmi2(a, m.buffer); \ -} while (0) - -#define mul_eltfp25519_2w_adx(c, a, b) do { \ - mul2_256x256_integer_adx(m.buffer, a, b); \ - red_eltfp25519_2w_adx(c, m.buffer); \ -} while (0) - -#define mul_eltfp25519_2w_bmi2(c, a, b) do { \ - mul2_256x256_integer_bmi2(m.buffer, a, b); \ - red_eltfp25519_2w_bmi2(c, m.buffer); \ -} while (0) - -#define sqr_eltfp25519_2w_adx(a) do { \ - sqr2_256x256_integer_adx(m.buffer, a); \ - red_eltfp25519_2w_adx(a, m.buffer); \ -} while (0) - -#define sqr_eltfp25519_2w_bmi2(a) do { \ - sqr2_256x256_integer_bmi2(m.buffer, a); \ - red_eltfp25519_2w_bmi2(a, m.buffer); \ -} while (0) - -#define sqrn_eltfp25519_1w_adx(a, times) do { \ - int ____counter = (times); \ - while (____counter-- > 0) \ - sqr_eltfp25519_1w_adx(a); \ -} while (0) - -#define sqrn_eltfp25519_1w_bmi2(a, times) do { \ - int ____counter = (times); \ - while (____counter-- > 0) \ - sqr_eltfp25519_1w_bmi2(a); \ -} while (0) - -#define copy_eltfp25519_1w(C, A) do { \ - (C)[0] = (A)[0]; \ - (C)[1] = (A)[1]; \ - (C)[2] = (A)[2]; \ - (C)[3] = (A)[3]; \ -} while (0) - -#define setzero_eltfp25519_1w(C) do { \ - (C)[0] = 0; \ - (C)[1] = 0; \ - (C)[2] = 0; \ - (C)[3] = 0; \ -} while (0) - 
-__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = { - /* 1 */ 0xfffffffffffffff3UL, 0xffffffffffffffffUL, - 0xffffffffffffffffUL, 0x5fffffffffffffffUL, - /* 2 */ 0x6b8220f416aafe96UL, 0x82ebeb2b4f566a34UL, - 0xd5a9a5b075a5950fUL, 0x5142b2cf4b2488f4UL, - /* 3 */ 0x6aaebc750069680cUL, 0x89cf7820a0f99c41UL, - 0x2a58d9183b56d0f4UL, 0x4b5aca80e36011a4UL, - /* 4 */ 0x329132348c29745dUL, 0xf4a2e616e1642fd7UL, - 0x1e45bb03ff67bc34UL, 0x306912d0f42a9b4aUL, - /* 5 */ 0xff886507e6af7154UL, 0x04f50e13dfeec82fUL, - 0xaa512fe82abab5ceUL, 0x174e251a68d5f222UL, - /* 6 */ 0xcf96700d82028898UL, 0x1743e3370a2c02c5UL, - 0x379eec98b4e86eaaUL, 0x0c59888a51e0482eUL, - /* 7 */ 0xfbcbf1d699b5d189UL, 0xacaef0d58e9fdc84UL, - 0xc1c20d06231f7614UL, 0x2938218da274f972UL, - /* 8 */ 0xf6af49beff1d7f18UL, 0xcc541c22387ac9c2UL, - 0x96fcc9ef4015c56bUL, 0x69c1627c690913a9UL, - /* 9 */ 0x7a86fd2f4733db0eUL, 0xfdb8c4f29e087de9UL, - 0x095e4b1a8ea2a229UL, 0x1ad7a7c829b37a79UL, - /* 10 */ 0x342d89cad17ea0c0UL, 0x67bedda6cced2051UL, - 0x19ca31bf2bb42f74UL, 0x3df7b4c84980acbbUL, - /* 11 */ 0xa8c6444dc80ad883UL, 0xb91e440366e3ab85UL, - 0xc215cda00164f6d8UL, 0x3d867c6ef247e668UL, - /* 12 */ 0xc7dd582bcc3e658cUL, 0xfd2c4748ee0e5528UL, - 0xa0fd9b95cc9f4f71UL, 0x7529d871b0675ddfUL, - /* 13 */ 0xb8f568b42d3cbd78UL, 0x1233011b91f3da82UL, - 0x2dce6ccd4a7c3b62UL, 0x75e7fc8e9e498603UL, - /* 14 */ 0x2f4f13f1fcd0b6ecUL, 0xf1a8ca1f29ff7a45UL, - 0xc249c1a72981e29bUL, 0x6ebe0dbb8c83b56aUL, - /* 15 */ 0x7114fa8d170bb222UL, 0x65a2dcd5bf93935fUL, - 0xbdc41f68b59c979aUL, 0x2f0eef79a2ce9289UL, - /* 16 */ 0x42ecbf0c083c37ceUL, 0x2930bc09ec496322UL, - 0xf294b0c19cfeac0dUL, 0x3780aa4bedfabb80UL, - /* 17 */ 0x56c17d3e7cead929UL, 0xe7cb4beb2e5722c5UL, - 0x0ce931732dbfe15aUL, 0x41b883c7621052f8UL, - /* 18 */ 0xdbf75ca0c3d25350UL, 0x2936be086eb1e351UL, - 0xc936e03cb4a9b212UL, 0x1d45bf82322225aaUL, - /* 19 */ 0xe81ab1036a024cc5UL, 0xe212201c304c9a72UL, - 0xc5d73fba6832b1fcUL, 0x20ffdb5a4d839581UL, - /* 20 */ 0xa283d367be5d0fadUL, 0x6c2b25ca8b164475UL, - 0x9d4935467caaf22eUL, 0x5166408eee85ff49UL, - /* 21 */ 0x3c67baa2fab4e361UL, 0xb3e433c67ef35cefUL, - 0x5259729241159b1cUL, 0x6a621892d5b0ab33UL, - /* 22 */ 0x20b74a387555cdcbUL, 0x532aa10e1208923fUL, - 0xeaa17b7762281dd1UL, 0x61ab3443f05c44bfUL, - /* 23 */ 0x257a6c422324def8UL, 0x131c6c1017e3cf7fUL, - 0x23758739f630a257UL, 0x295a407a01a78580UL, - /* 24 */ 0xf8c443246d5da8d9UL, 0x19d775450c52fa5dUL, - 0x2afcfc92731bf83dUL, 0x7d10c8e81b2b4700UL, - /* 25 */ 0xc8e0271f70baa20bUL, 0x993748867ca63957UL, - 0x5412efb3cb7ed4bbUL, 0x3196d36173e62975UL, - /* 26 */ 0xde5bcad141c7dffcUL, 0x47cc8cd2b395c848UL, - 0xa34cd942e11af3cbUL, 0x0256dbf2d04ecec2UL, - /* 27 */ 0x875ab7e94b0e667fUL, 0xcad4dd83c0850d10UL, - 0x47f12e8f4e72c79fUL, 0x5f1a87bb8c85b19bUL, - /* 28 */ 0x7ae9d0b6437f51b8UL, 0x12c7ce5518879065UL, - 0x2ade09fe5cf77aeeUL, 0x23a05a2f7d2c5627UL, - /* 29 */ 0x5908e128f17c169aUL, 0xf77498dd8ad0852dUL, - 0x74b4c4ceab102f64UL, 0x183abadd10139845UL, - /* 30 */ 0xb165ba8daa92aaacUL, 0xd5c5ef9599386705UL, - 0xbe2f8f0cf8fc40d1UL, 0x2701e635ee204514UL, - /* 31 */ 0x629fa80020156514UL, 0xf223868764a8c1ceUL, - 0x5b894fff0b3f060eUL, 0x60d9944cf708a3faUL, - /* 32 */ 0xaeea001a1c7a201fUL, 0xebf16a633ee2ce63UL, - 0x6f7709594c7a07e1UL, 0x79b958150d0208cbUL, - /* 33 */ 0x24b55e5301d410e7UL, 0xe3a34edff3fdc84dUL, - 0xd88768e4904032d8UL, 0x131384427b3aaeecUL, - /* 34 */ 0x8405e51286234f14UL, 0x14dc4739adb4c529UL, - 0xb8a2b5b250634ffdUL, 0x2fe2a94ad8a7ff93UL, - /* 35 */ 0xec5c57efe843faddUL, 
0x2843ce40f0bb9918UL, - 0xa4b561d6cf3d6305UL, 0x743629bde8fb777eUL, - /* 36 */ 0x343edd46bbaf738fUL, 0xed981828b101a651UL, - 0xa401760b882c797aUL, 0x1fc223e28dc88730UL, - /* 37 */ 0x48604e91fc0fba0eUL, 0xb637f78f052c6fa4UL, - 0x91ccac3d09e9239cUL, 0x23f7eed4437a687cUL, - /* 38 */ 0x5173b1118d9bd800UL, 0x29d641b63189d4a7UL, - 0xfdbf177988bbc586UL, 0x2959894fcad81df5UL, - /* 39 */ 0xaebc8ef3b4bbc899UL, 0x4148995ab26992b9UL, - 0x24e20b0134f92cfbUL, 0x40d158894a05dee8UL, - /* 40 */ 0x46b00b1185af76f6UL, 0x26bac77873187a79UL, - 0x3dc0bf95ab8fff5fUL, 0x2a608bd8945524d7UL, - /* 41 */ 0x26449588bd446302UL, 0x7c4bc21c0388439cUL, - 0x8e98a4f383bd11b2UL, 0x26218d7bc9d876b9UL, - /* 42 */ 0xe3081542997c178aUL, 0x3c2d29a86fb6606fUL, - 0x5c217736fa279374UL, 0x7dde05734afeb1faUL, - /* 43 */ 0x3bf10e3906d42babUL, 0xe4f7803e1980649cUL, - 0xe6053bf89595bf7aUL, 0x394faf38da245530UL, - /* 44 */ 0x7a8efb58896928f4UL, 0xfbc778e9cc6a113cUL, - 0x72670ce330af596fUL, 0x48f222a81d3d6cf7UL, - /* 45 */ 0xf01fce410d72caa7UL, 0x5a20ecc7213b5595UL, - 0x7bc21165c1fa1483UL, 0x07f89ae31da8a741UL, - /* 46 */ 0x05d2c2b4c6830ff9UL, 0xd43e330fc6316293UL, - 0xa5a5590a96d3a904UL, 0x705edb91a65333b6UL, - /* 47 */ 0x048ee15e0bb9a5f7UL, 0x3240cfca9e0aaf5dUL, - 0x8f4b71ceedc4a40bUL, 0x621c0da3de544a6dUL, - /* 48 */ 0x92872836a08c4091UL, 0xce8375b010c91445UL, - 0x8a72eb524f276394UL, 0x2667fcfa7ec83635UL, - /* 49 */ 0x7f4c173345e8752aUL, 0x061b47feee7079a5UL, - 0x25dd9afa9f86ff34UL, 0x3780cef5425dc89cUL, - /* 50 */ 0x1a46035a513bb4e9UL, 0x3e1ef379ac575adaUL, - 0xc78c5f1c5fa24b50UL, 0x321a967634fd9f22UL, - /* 51 */ 0x946707b8826e27faUL, 0x3dca84d64c506fd0UL, - 0xc189218075e91436UL, 0x6d9284169b3b8484UL, - /* 52 */ 0x3a67e840383f2ddfUL, 0x33eec9a30c4f9b75UL, - 0x3ec7c86fa783ef47UL, 0x26ec449fbac9fbc4UL, - /* 53 */ 0x5c0f38cba09b9e7dUL, 0x81168cc762a3478cUL, - 0x3e23b0d306fc121cUL, 0x5a238aa0a5efdcddUL, - /* 54 */ 0x1ba26121c4ea43ffUL, 0x36f8c77f7c8832b5UL, - 0x88fbea0b0adcf99aUL, 0x5ca9938ec25bebf9UL, - /* 55 */ 0xd5436a5e51fccda0UL, 0x1dbc4797c2cd893bUL, - 0x19346a65d3224a08UL, 0x0f5034e49b9af466UL, - /* 56 */ 0xf23c3967a1e0b96eUL, 0xe58b08fa867a4d88UL, - 0xfb2fabc6a7341679UL, 0x2a75381eb6026946UL, - /* 57 */ 0xc80a3be4c19420acUL, 0x66b1f6c681f2b6dcUL, - 0x7cf7036761e93388UL, 0x25abbbd8a660a4c4UL, - /* 58 */ 0x91ea12ba14fd5198UL, 0x684950fc4a3cffa9UL, - 0xf826842130f5ad28UL, 0x3ea988f75301a441UL, - /* 59 */ 0xc978109a695f8c6fUL, 0x1746eb4a0530c3f3UL, - 0x444d6d77b4459995UL, 0x75952b8c054e5cc7UL, - /* 60 */ 0xa3703f7915f4d6aaUL, 0x66c346202f2647d8UL, - 0xd01469df811d644bUL, 0x77fea47d81a5d71fUL, - /* 61 */ 0xc5e9529ef57ca381UL, 0x6eeeb4b9ce2f881aUL, - 0xb6e91a28e8009bd6UL, 0x4b80be3e9afc3fecUL, - /* 62 */ 0x7e3773c526aed2c5UL, 0x1b4afcb453c9a49dUL, - 0xa920bdd7baffb24dUL, 0x7c54699f122d400eUL, - /* 63 */ 0xef46c8e14fa94bc8UL, 0xe0b074ce2952ed5eUL, - 0xbea450e1dbd885d5UL, 0x61b68649320f712cUL, - /* 64 */ 0x8a485f7309ccbdd1UL, 0xbd06320d7d4d1a2dUL, - 0x25232973322dbef4UL, 0x445dc4758c17f770UL, - /* 65 */ 0xdb0434177cc8933cUL, 0xed6fe82175ea059fUL, - 0x1efebefdc053db34UL, 0x4adbe867c65daf99UL, - /* 66 */ 0x3acd71a2a90609dfUL, 0xe5e991856dd04050UL, - 0x1ec69b688157c23cUL, 0x697427f6885cfe4dUL, - /* 67 */ 0xd7be7b9b65e1a851UL, 0xa03d28d522c536ddUL, - 0x28399d658fd2b645UL, 0x49e5b7e17c2641e1UL, - /* 68 */ 0x6f8c3a98700457a4UL, 0x5078f0a25ebb6778UL, - 0xd13c3ccbc382960fUL, 0x2e003258a7df84b1UL, - /* 69 */ 0x8ad1f39be6296a1cUL, 0xc1eeaa652a5fbfb2UL, - 0x33ee0673fd26f3cbUL, 0x59256173a69d2cccUL, - /* 70 */ 0x41ea07aa4e18fc41UL, 
0xd9fc19527c87a51eUL, - 0xbdaacb805831ca6fUL, 0x445b652dc916694fUL, - /* 71 */ 0xce92a3a7f2172315UL, 0x1edc282de11b9964UL, - 0xa1823aafe04c314aUL, 0x790a2d94437cf586UL, - /* 72 */ 0x71c447fb93f6e009UL, 0x8922a56722845276UL, - 0xbf70903b204f5169UL, 0x2f7a89891ba319feUL, - /* 73 */ 0x02a08eb577e2140cUL, 0xed9a4ed4427bdcf4UL, - 0x5253ec44e4323cd1UL, 0x3e88363c14e9355bUL, - /* 74 */ 0xaa66c14277110b8cUL, 0x1ae0391610a23390UL, - 0x2030bd12c93fc2a2UL, 0x3ee141579555c7abUL, - /* 75 */ 0x9214de3a6d6e7d41UL, 0x3ccdd88607f17efeUL, - 0x674f1288f8e11217UL, 0x5682250f329f93d0UL, - /* 76 */ 0x6cf00b136d2e396eUL, 0x6e4cf86f1014debfUL, - 0x5930b1b5bfcc4e83UL, 0x047069b48aba16b6UL, - /* 77 */ 0x0d4ce4ab69b20793UL, 0xb24db91a97d0fb9eUL, - 0xcdfa50f54e00d01dUL, 0x221b1085368bddb5UL, - /* 78 */ 0xe7e59468b1e3d8d2UL, 0x53c56563bd122f93UL, - 0xeee8a903e0663f09UL, 0x61efa662cbbe3d42UL, - /* 79 */ 0x2cf8ddddde6eab2aUL, 0x9bf80ad51435f231UL, - 0x5deadacec9f04973UL, 0x29275b5d41d29b27UL, - /* 80 */ 0xcfde0f0895ebf14fUL, 0xb9aab96b054905a7UL, - 0xcae80dd9a1c420fdUL, 0x0a63bf2f1673bbc7UL, - /* 81 */ 0x092f6e11958fbc8cUL, 0x672a81e804822fadUL, - 0xcac8351560d52517UL, 0x6f3f7722c8f192f8UL, - /* 82 */ 0xf8ba90ccc2e894b7UL, 0x2c7557a438ff9f0dUL, - 0x894d1d855ae52359UL, 0x68e122157b743d69UL, - /* 83 */ 0xd87e5570cfb919f3UL, 0x3f2cdecd95798db9UL, - 0x2121154710c0a2ceUL, 0x3c66a115246dc5b2UL, - /* 84 */ 0xcbedc562294ecb72UL, 0xba7143c36a280b16UL, - 0x9610c2efd4078b67UL, 0x6144735d946a4b1eUL, - /* 85 */ 0x536f111ed75b3350UL, 0x0211db8c2041d81bUL, - 0xf93cb1000e10413cUL, 0x149dfd3c039e8876UL, - /* 86 */ 0xd479dde46b63155bUL, 0xb66e15e93c837976UL, - 0xdafde43b1f13e038UL, 0x5fafda1a2e4b0b35UL, - /* 87 */ 0x3600bbdf17197581UL, 0x3972050bbe3cd2c2UL, - 0x5938906dbdd5be86UL, 0x34fce5e43f9b860fUL, - /* 88 */ 0x75a8a4cd42d14d02UL, 0x828dabc53441df65UL, - 0x33dcabedd2e131d3UL, 0x3ebad76fb814d25fUL, - /* 89 */ 0xd4906f566f70e10fUL, 0x5d12f7aa51690f5aUL, - 0x45adb16e76cefcf2UL, 0x01f768aead232999UL, - /* 90 */ 0x2b6cc77b6248febdUL, 0x3cd30628ec3aaffdUL, - 0xce1c0b80d4ef486aUL, 0x4c3bff2ea6f66c23UL, - /* 91 */ 0x3f2ec4094aeaeb5fUL, 0x61b19b286e372ca7UL, - 0x5eefa966de2a701dUL, 0x23b20565de55e3efUL, - /* 92 */ 0xe301ca5279d58557UL, 0x07b2d4ce27c2874fUL, - 0xa532cd8a9dcf1d67UL, 0x2a52fee23f2bff56UL, - /* 93 */ 0x8624efb37cd8663dUL, 0xbbc7ac20ffbd7594UL, - 0x57b85e9c82d37445UL, 0x7b3052cb86a6ec66UL, - /* 94 */ 0x3482f0ad2525e91eUL, 0x2cb68043d28edca0UL, - 0xaf4f6d052e1b003aUL, 0x185f8c2529781b0aUL, - /* 95 */ 0xaa41de5bd80ce0d6UL, 0x9407b2416853e9d6UL, - 0x563ec36e357f4c3aUL, 0x4cc4b8dd0e297bceUL, - /* 96 */ 0xa2fc1a52ffb8730eUL, 0x1811f16e67058e37UL, - 0x10f9a366cddf4ee1UL, 0x72f4a0c4a0b9f099UL, - /* 97 */ 0x8c16c06f663f4ea7UL, 0x693b3af74e970fbaUL, - 0x2102e7f1d69ec345UL, 0x0ba53cbc968a8089UL, - /* 98 */ 0xca3d9dc7fea15537UL, 0x4c6824bb51536493UL, - 0xb9886314844006b1UL, 0x40d2a72ab454cc60UL, - /* 99 */ 0x5936a1b712570975UL, 0x91b9d648debda657UL, - 0x3344094bb64330eaUL, 0x006ba10d12ee51d0UL, - /* 100 */ 0x19228468f5de5d58UL, 0x0eb12f4c38cc05b0UL, - 0xa1039f9dd5601990UL, 0x4502d4ce4fff0e0bUL, - /* 101 */ 0xeb2054106837c189UL, 0xd0f6544c6dd3b93cUL, - 0x40727064c416d74fUL, 0x6e15c6114b502ef0UL, - /* 102 */ 0x4df2a398cfb1a76bUL, 0x11256c7419f2f6b1UL, - 0x4a497962066e6043UL, 0x705b3aab41355b44UL, - /* 103 */ 0x365ef536d797b1d8UL, 0x00076bd622ddf0dbUL, - 0x3bbf33b0e0575a88UL, 0x3777aa05c8e4ca4dUL, - /* 104 */ 0x392745c85578db5fUL, 0x6fda4149dbae5ae2UL, - 0xb1f0b00b8adc9867UL, 0x09963437d36f1da3UL, - /* 105 */ 0x7e824e90a5dc3853UL, 
0xccb5f6641f135cbdUL, - 0x6736d86c87ce8fccUL, 0x625f3ce26604249fUL, - /* 106 */ 0xaf8ac8059502f63fUL, 0x0c05e70a2e351469UL, - 0x35292e9c764b6305UL, 0x1a394360c7e23ac3UL, - /* 107 */ 0xd5c6d53251183264UL, 0x62065abd43c2b74fUL, - 0xb5fbf5d03b973f9bUL, 0x13a3da3661206e5eUL, - /* 108 */ 0xc6bd5837725d94e5UL, 0x18e30912205016c5UL, - 0x2088ce1570033c68UL, 0x7fba1f495c837987UL, - /* 109 */ 0x5a8c7423f2f9079dUL, 0x1735157b34023fc5UL, - 0xe4f9b49ad2fab351UL, 0x6691ff72c878e33cUL, - /* 110 */ 0x122c2adedc5eff3eUL, 0xf8dd4bf1d8956cf4UL, - 0xeb86205d9e9e5bdaUL, 0x049b92b9d975c743UL, - /* 111 */ 0xa5379730b0f6c05aUL, 0x72a0ffacc6f3a553UL, - 0xb0032c34b20dcd6dUL, 0x470e9dbc88d5164aUL, - /* 112 */ 0xb19cf10ca237c047UL, 0xb65466711f6c81a2UL, - 0xb3321bd16dd80b43UL, 0x48c14f600c5fbe8eUL, - /* 113 */ 0x66451c264aa6c803UL, 0xb66e3904a4fa7da6UL, - 0xd45f19b0b3128395UL, 0x31602627c3c9bc10UL, - /* 114 */ 0x3120dc4832e4e10dUL, 0xeb20c46756c717f7UL, - 0x00f52e3f67280294UL, 0x566d4fc14730c509UL, - /* 115 */ 0x7e3a5d40fd837206UL, 0xc1e926dc7159547aUL, - 0x216730fba68d6095UL, 0x22e8c3843f69cea7UL, - /* 116 */ 0x33d074e8930e4b2bUL, 0xb6e4350e84d15816UL, - 0x5534c26ad6ba2365UL, 0x7773c12f89f1f3f3UL, - /* 117 */ 0x8cba404da57962aaUL, 0x5b9897a81999ce56UL, - 0x508e862f121692fcUL, 0x3a81907fa093c291UL, - /* 118 */ 0x0dded0ff4725a510UL, 0x10d8cc10673fc503UL, - 0x5b9d151c9f1f4e89UL, 0x32a5c1d5cb09a44cUL, - /* 119 */ 0x1e0aa442b90541fbUL, 0x5f85eb7cc1b485dbUL, - 0xbee595ce8a9df2e5UL, 0x25e496c722422236UL, - /* 120 */ 0x5edf3c46cd0fe5b9UL, 0x34e75a7ed2a43388UL, - 0xe488de11d761e352UL, 0x0e878a01a085545cUL, - /* 121 */ 0xba493c77e021bb04UL, 0x2b4d1843c7df899aUL, - 0x9ea37a487ae80d67UL, 0x67a9958011e41794UL, - /* 122 */ 0x4b58051a6697b065UL, 0x47e33f7d8d6ba6d4UL, - 0xbb4da8d483ca46c1UL, 0x68becaa181c2db0dUL, - /* 123 */ 0x8d8980e90b989aa5UL, 0xf95eb14a2c93c99bUL, - 0x51c6c7c4796e73a2UL, 0x6e228363b5efb569UL, - /* 124 */ 0xc6bbc0b02dd624c8UL, 0x777eb47dec8170eeUL, - 0x3cde15a004cfafa9UL, 0x1dc6bc087160bf9bUL, - /* 125 */ 0x2e07e043eec34002UL, 0x18e9fc677a68dc7fUL, - 0xd8da03188bd15b9aUL, 0x48fbc3bb00568253UL, - /* 126 */ 0x57547d4cfb654ce1UL, 0xd3565b82a058e2adUL, - 0xf63eaf0bbf154478UL, 0x47531ef114dfbb18UL, - /* 127 */ 0xe1ec630a4278c587UL, 0x5507d546ca8e83f3UL, - 0x85e135c63adc0c2bUL, 0x0aa7efa85682844eUL, - /* 128 */ 0x72691ba8b3e1f615UL, 0x32b4e9701fbe3ffaUL, - 0x97b6d92e39bb7868UL, 0x2cfe53dea02e39e8UL, - /* 129 */ 0x687392cd85cd52b0UL, 0x27ff66c910e29831UL, - 0x97134556a9832d06UL, 0x269bb0360a84f8a0UL, - /* 130 */ 0x706e55457643f85cUL, 0x3734a48c9b597d1bUL, - 0x7aee91e8c6efa472UL, 0x5cd6abc198a9d9e0UL, - /* 131 */ 0x0e04de06cb3ce41aUL, 0xd8c6eb893402e138UL, - 0x904659bb686e3772UL, 0x7215c371746ba8c8UL, - /* 132 */ 0xfd12a97eeae4a2d9UL, 0x9514b7516394f2c5UL, - 0x266fd5809208f294UL, 0x5c847085619a26b9UL, - /* 133 */ 0x52985410fed694eaUL, 0x3c905b934a2ed254UL, - 0x10bb47692d3be467UL, 0x063b3d2d69e5e9e1UL, - /* 134 */ 0x472726eedda57debUL, 0xefb6c4ae10f41891UL, - 0x2b1641917b307614UL, 0x117c554fc4f45b7cUL, - /* 135 */ 0xc07cf3118f9d8812UL, 0x01dbd82050017939UL, - 0xd7e803f4171b2827UL, 0x1015e87487d225eaUL, - /* 136 */ 0xc58de3fed23acc4dUL, 0x50db91c294a7be2dUL, - 0x0b94d43d1c9cf457UL, 0x6b1640fa6e37524aUL, - /* 137 */ 0x692f346c5fda0d09UL, 0x200b1c59fa4d3151UL, - 0xb8c46f760777a296UL, 0x4b38395f3ffdfbcfUL, - /* 138 */ 0x18d25e00be54d671UL, 0x60d50582bec8aba6UL, - 0x87ad8f263b78b982UL, 0x50fdf64e9cda0432UL, - /* 139 */ 0x90f567aac578dcf0UL, 0xef1e9b0ef2a3133bUL, - 0x0eebba9242d9de71UL, 0x15473c9bf03101c7UL, - /* 140 */ 
0x7c77e8ae56b78095UL, 0xb678e7666e6f078eUL, - 0x2da0b9615348ba1fUL, 0x7cf931c1ff733f0bUL, - /* 141 */ 0x26b357f50a0a366cUL, 0xe9708cf42b87d732UL, - 0xc13aeea5f91cb2c0UL, 0x35d90c991143bb4cUL, - /* 142 */ 0x47c1c404a9a0d9dcUL, 0x659e58451972d251UL, - 0x3875a8c473b38c31UL, 0x1fbd9ed379561f24UL, - /* 143 */ 0x11fabc6fd41ec28dUL, 0x7ef8dfe3cd2a2dcaUL, - 0x72e73b5d8c404595UL, 0x6135fa4954b72f27UL, - /* 144 */ 0xccfc32a2de24b69cUL, 0x3f55698c1f095d88UL, - 0xbe3350ed5ac3f929UL, 0x5e9bf806ca477eebUL, - /* 145 */ 0xe9ce8fb63c309f68UL, 0x5376f63565e1f9f4UL, - 0xd1afcfb35a6393f1UL, 0x6632a1ede5623506UL, - /* 146 */ 0x0b7d6c390c2ded4cUL, 0x56cb3281df04cb1fUL, - 0x66305a1249ecc3c7UL, 0x5d588b60a38ca72aUL, - /* 147 */ 0xa6ecbf78e8e5f42dUL, 0x86eeb44b3c8a3eecUL, - 0xec219c48fbd21604UL, 0x1aaf1af517c36731UL, - /* 148 */ 0xc306a2836769bde7UL, 0x208280622b1e2adbUL, - 0x8027f51ffbff94a6UL, 0x76cfa1ce1124f26bUL, - /* 149 */ 0x18eb00562422abb6UL, 0xf377c4d58f8c29c3UL, - 0x4dbbc207f531561aUL, 0x0253b7f082128a27UL, - /* 150 */ 0x3d1f091cb62c17e0UL, 0x4860e1abd64628a9UL, - 0x52d17436309d4253UL, 0x356f97e13efae576UL, - /* 151 */ 0xd351e11aa150535bUL, 0x3e6b45bb1dd878ccUL, - 0x0c776128bed92c98UL, 0x1d34ae93032885b8UL, - /* 152 */ 0x4ba0488ca85ba4c3UL, 0x985348c33c9ce6ceUL, - 0x66124c6f97bda770UL, 0x0f81a0290654124aUL, - /* 153 */ 0x9ed09ca6569b86fdUL, 0x811009fd18af9a2dUL, - 0xff08d03f93d8c20aUL, 0x52a148199faef26bUL, - /* 154 */ 0x3e03f9dc2d8d1b73UL, 0x4205801873961a70UL, - 0xc0d987f041a35970UL, 0x07aa1f15a1c0d549UL, - /* 155 */ 0xdfd46ce08cd27224UL, 0x6d0a024f934e4239UL, - 0x808a7a6399897b59UL, 0x0a4556e9e13d95a2UL, - /* 156 */ 0xd21a991fe9c13045UL, 0x9b0e8548fe7751b8UL, - 0x5da643cb4bf30035UL, 0x77db28d63940f721UL, - /* 157 */ 0xfc5eeb614adc9011UL, 0x5229419ae8c411ebUL, - 0x9ec3e7787d1dcf74UL, 0x340d053e216e4cb5UL, - /* 158 */ 0xcac7af39b48df2b4UL, 0xc0faec2871a10a94UL, - 0x140a69245ca575edUL, 0x0cf1c37134273a4cUL, - /* 159 */ 0xc8ee306ac224b8a5UL, 0x57eaee7ccb4930b0UL, - 0xa1e806bdaacbe74fUL, 0x7d9a62742eeb657dUL, - /* 160 */ 0x9eb6b6ef546c4830UL, 0x885cca1fddb36e2eUL, - 0xe6b9f383ef0d7105UL, 0x58654fef9d2e0412UL, - /* 161 */ 0xa905c4ffbe0e8e26UL, 0x942de5df9b31816eUL, - 0x497d723f802e88e1UL, 0x30684dea602f408dUL, - /* 162 */ 0x21e5a278a3e6cb34UL, 0xaefb6e6f5b151dc4UL, - 0xb30b8e049d77ca15UL, 0x28c3c9cf53b98981UL, - /* 163 */ 0x287fb721556cdd2aUL, 0x0d317ca897022274UL, - 0x7468c7423a543258UL, 0x4a7f11464eb5642fUL, - /* 164 */ 0xa237a4774d193aa6UL, 0xd865986ea92129a1UL, - 0x24c515ecf87c1a88UL, 0x604003575f39f5ebUL, - /* 165 */ 0x47b9f189570a9b27UL, 0x2b98cede465e4b78UL, - 0x026df551dbb85c20UL, 0x74fcd91047e21901UL, - /* 166 */ 0x13e2a90a23c1bfa3UL, 0x0cb0074e478519f6UL, - 0x5ff1cbbe3af6cf44UL, 0x67fe5438be812dbeUL, - /* 167 */ 0xd13cf64fa40f05b0UL, 0x054dfb2f32283787UL, - 0x4173915b7f0d2aeaUL, 0x482f144f1f610d4eUL, - /* 168 */ 0xf6210201b47f8234UL, 0x5d0ae1929e70b990UL, - 0xdcd7f455b049567cUL, 0x7e93d0f1f0916f01UL, - /* 169 */ 0xdd79cbf18a7db4faUL, 0xbe8391bf6f74c62fUL, - 0x027145d14b8291bdUL, 0x585a73ea2cbf1705UL, - /* 170 */ 0x485ca03e928a0db2UL, 0x10fc01a5742857e7UL, - 0x2f482edbd6d551a7UL, 0x0f0433b5048fdb8aUL, - /* 171 */ 0x60da2e8dd7dc6247UL, 0x88b4c9d38cd4819aUL, - 0x13033ac001f66697UL, 0x273b24fe3b367d75UL, - /* 172 */ 0xc6e8f66a31b3b9d4UL, 0x281514a494df49d5UL, - 0xd1726fdfc8b23da7UL, 0x4b3ae7d103dee548UL, - /* 173 */ 0xc6256e19ce4b9d7eUL, 0xff5c5cf186e3c61cUL, - 0xacc63ca34b8ec145UL, 0x74621888fee66574UL, - /* 174 */ 0x956f409645290a1eUL, 0xef0bf8e3263a962eUL, - 0xed6a50eb5ec2647bUL, 
0x0694283a9dca7502UL, - /* 175 */ 0x769b963643a2dcd1UL, 0x42b7c8ea09fc5353UL, - 0x4f002aee13397eabUL, 0x63005e2c19b7d63aUL, - /* 176 */ 0xca6736da63023beaUL, 0x966c7f6db12a99b7UL, - 0xace09390c537c5e1UL, 0x0b696063a1aa89eeUL, - /* 177 */ 0xebb03e97288c56e5UL, 0x432a9f9f938c8be8UL, - 0xa6a5a93d5b717f71UL, 0x1a5fb4c3e18f9d97UL, - /* 178 */ 0x1c94e7ad1c60cdceUL, 0xee202a43fc02c4a0UL, - 0x8dafe4d867c46a20UL, 0x0a10263c8ac27b58UL, - /* 179 */ 0xd0dea9dfe4432a4aUL, 0x856af87bbe9277c5UL, - 0xce8472acc212c71aUL, 0x6f151b6d9bbb1e91UL, - /* 180 */ 0x26776c527ceed56aUL, 0x7d211cb7fbf8faecUL, - 0x37ae66a6fd4609ccUL, 0x1f81b702d2770c42UL, - /* 181 */ 0x2fb0b057eac58392UL, 0xe1dd89fe29744e9dUL, - 0xc964f8eb17beb4f8UL, 0x29571073c9a2d41eUL, - /* 182 */ 0xa948a18981c0e254UL, 0x2df6369b65b22830UL, - 0xa33eb2d75fcfd3c6UL, 0x078cd6ec4199a01fUL, - /* 183 */ 0x4a584a41ad900d2fUL, 0x32142b78e2c74c52UL, - 0x68c4e8338431c978UL, 0x7f69ea9008689fc2UL, - /* 184 */ 0x52f2c81e46a38265UL, 0xfd78072d04a832fdUL, - 0x8cd7d5fa25359e94UL, 0x4de71b7454cc29d2UL, - /* 185 */ 0x42eb60ad1eda6ac9UL, 0x0aad37dfdbc09c3aUL, - 0x81004b71e33cc191UL, 0x44e6be345122803cUL, - /* 186 */ 0x03fe8388ba1920dbUL, 0xf5d57c32150db008UL, - 0x49c8c4281af60c29UL, 0x21edb518de701aeeUL, - /* 187 */ 0x7fb63e418f06dc99UL, 0xa4460d99c166d7b8UL, - 0x24dd5248ce520a83UL, 0x5ec3ad712b928358UL, - /* 188 */ 0x15022a5fbd17930fUL, 0xa4f64a77d82570e3UL, - 0x12bc8d6915783712UL, 0x498194c0fc620abbUL, - /* 189 */ 0x38a2d9d255686c82UL, 0x785c6bd9193e21f0UL, - 0xe4d5c81ab24a5484UL, 0x56307860b2e20989UL, - /* 190 */ 0x429d55f78b4d74c4UL, 0x22f1834643350131UL, - 0x1e60c24598c71fffUL, 0x59f2f014979983efUL, - /* 191 */ 0x46a47d56eb494a44UL, 0x3e22a854d636a18eUL, - 0xb346e15274491c3bUL, 0x2ceafd4e5390cde7UL, - /* 192 */ 0xba8a8538be0d6675UL, 0x4b9074bb50818e23UL, - 0xcbdab89085d304c3UL, 0x61a24fe0e56192c4UL, - /* 193 */ 0xcb7615e6db525bcbUL, 0xdd7d8c35a567e4caUL, - 0xe6b4153acafcdd69UL, 0x2d668e097f3c9766UL, - /* 194 */ 0xa57e7e265ce55ef0UL, 0x5d9f4e527cd4b967UL, - 0xfbc83606492fd1e5UL, 0x090d52beb7c3f7aeUL, - /* 195 */ 0x09b9515a1e7b4d7cUL, 0x1f266a2599da44c0UL, - 0xa1c49548e2c55504UL, 0x7ef04287126f15ccUL, - /* 196 */ 0xfed1659dbd30ef15UL, 0x8b4ab9eec4e0277bUL, - 0x884d6236a5df3291UL, 0x1fd96ea6bf5cf788UL, - /* 197 */ 0x42a161981f190d9aUL, 0x61d849507e6052c1UL, - 0x9fe113bf285a2cd5UL, 0x7c22d676dbad85d8UL, - /* 198 */ 0x82e770ed2bfbd27dUL, 0x4c05b2ece996f5a5UL, - 0xcd40a9c2b0900150UL, 0x5895319213d9bf64UL, - /* 199 */ 0xe7cc5d703fea2e08UL, 0xb50c491258e2188cUL, - 0xcce30baa48205bf0UL, 0x537c659ccfa32d62UL, - /* 200 */ 0x37b6623a98cfc088UL, 0xfe9bed1fa4d6aca4UL, - 0x04d29b8e56a8d1b0UL, 0x725f71c40b519575UL, - /* 201 */ 0x28c7f89cd0339ce6UL, 0x8367b14469ddc18bUL, - 0x883ada83a6a1652cUL, 0x585f1974034d6c17UL, - /* 202 */ 0x89cfb266f1b19188UL, 0xe63b4863e7c35217UL, - 0xd88c9da6b4c0526aUL, 0x3e035c9df0954635UL, - /* 203 */ 0xdd9d5412fb45de9dUL, 0xdd684532e4cff40dUL, - 0x4b5c999b151d671cUL, 0x2d8c2cc811e7f690UL, - /* 204 */ 0x7f54be1d90055d40UL, 0xa464c5df464aaf40UL, - 0x33979624f0e917beUL, 0x2c018dc527356b30UL, - /* 205 */ 0xa5415024e330b3d4UL, 0x73ff3d96691652d3UL, - 0x94ec42c4ef9b59f1UL, 0x0747201618d08e5aUL, - /* 206 */ 0x4d6ca48aca411c53UL, 0x66415f2fcfa66119UL, - 0x9c4dd40051e227ffUL, 0x59810bc09a02f7ebUL, - /* 207 */ 0x2a7eb171b3dc101dUL, 0x441c5ab99ffef68eUL, - 0x32025c9b93b359eaUL, 0x5e8ce0a71e9d112fUL, - /* 208 */ 0xbfcccb92429503fdUL, 0xd271ba752f095d55UL, - 0x345ead5e972d091eUL, 0x18c8df11a83103baUL, - /* 209 */ 0x90cd949a9aed0f4cUL, 0xc5d1f4cb6660e37eUL, - 
0xb8cac52d56c52e0bUL, 0x6e42e400c5808e0dUL, - /* 210 */ 0xa3b46966eeaefd23UL, 0x0c4f1f0be39ecdcaUL, - 0x189dc8c9d683a51dUL, 0x51f27f054c09351bUL, - /* 211 */ 0x4c487ccd2a320682UL, 0x587ea95bb3df1c96UL, - 0xc8ccf79e555cb8e8UL, 0x547dc829a206d73dUL, - /* 212 */ 0xb822a6cd80c39b06UL, 0xe96d54732000d4c6UL, - 0x28535b6f91463b4dUL, 0x228f4660e2486e1dUL, - /* 213 */ 0x98799538de8d3abfUL, 0x8cd8330045ebca6eUL, - 0x79952a008221e738UL, 0x4322e1a7535cd2bbUL, - /* 214 */ 0xb114c11819d1801cUL, 0x2016e4d84f3f5ec7UL, - 0xdd0e2df409260f4cUL, 0x5ec362c0ae5f7266UL, - /* 215 */ 0xc0462b18b8b2b4eeUL, 0x7cc8d950274d1afbUL, - 0xf25f7105436b02d2UL, 0x43bbf8dcbff9ccd3UL, - /* 216 */ 0xb6ad1767a039e9dfUL, 0xb0714da8f69d3583UL, - 0x5e55fa18b42931f5UL, 0x4ed5558f33c60961UL, - /* 217 */ 0x1fe37901c647a5ddUL, 0x593ddf1f8081d357UL, - 0x0249a4fd813fd7a6UL, 0x69acca274e9caf61UL, - /* 218 */ 0x047ba3ea330721c9UL, 0x83423fc20e7e1ea0UL, - 0x1df4c0af01314a60UL, 0x09a62dab89289527UL, - /* 219 */ 0xa5b325a49cc6cb00UL, 0xe94b5dc654b56cb6UL, - 0x3be28779adc994a0UL, 0x4296e8f8ba3a4aadUL, - /* 220 */ 0x328689761e451eabUL, 0x2e4d598bff59594aUL, - 0x49b96853d7a7084aUL, 0x4980a319601420a8UL, - /* 221 */ 0x9565b9e12f552c42UL, 0x8a5318db7100fe96UL, - 0x05c90b4d43add0d7UL, 0x538b4cd66a5d4edaUL, - /* 222 */ 0xf4e94fc3e89f039fUL, 0x592c9af26f618045UL, - 0x08a36eb5fd4b9550UL, 0x25fffaf6c2ed1419UL, - /* 223 */ 0x34434459cc79d354UL, 0xeeecbfb4b1d5476bUL, - 0xddeb34a061615d99UL, 0x5129cecceb64b773UL, - /* 224 */ 0xee43215894993520UL, 0x772f9c7cf14c0b3bUL, - 0xd2e2fce306bedad5UL, 0x715f42b546f06a97UL, - /* 225 */ 0x434ecdceda5b5f1aUL, 0x0da17115a49741a9UL, - 0x680bd77c73edad2eUL, 0x487c02354edd9041UL, - /* 226 */ 0xb8efeff3a70ed9c4UL, 0x56a32aa3e857e302UL, - 0xdf3a68bd48a2a5a0UL, 0x07f650b73176c444UL, - /* 227 */ 0xe38b9b1626e0ccb1UL, 0x79e053c18b09fb36UL, - 0x56d90319c9f94964UL, 0x1ca941e7ac9ff5c4UL, - /* 228 */ 0x49c4df29162fa0bbUL, 0x8488cf3282b33305UL, - 0x95dfda14cabb437dUL, 0x3391f78264d5ad86UL, - /* 229 */ 0x729ae06ae2b5095dUL, 0xd58a58d73259a946UL, - 0xe9834262d13921edUL, 0x27fedafaa54bb592UL, - /* 230 */ 0xa99dc5b829ad48bbUL, 0x5f025742499ee260UL, - 0x802c8ecd5d7513fdUL, 0x78ceb3ef3f6dd938UL, - /* 231 */ 0xc342f44f8a135d94UL, 0x7b9edb44828cdda3UL, - 0x9436d11a0537cfe7UL, 0x5064b164ec1ab4c8UL, - /* 232 */ 0x7020eccfd37eb2fcUL, 0x1f31ea3ed90d25fcUL, - 0x1b930d7bdfa1bb34UL, 0x5344467a48113044UL, - /* 233 */ 0x70073170f25e6dfbUL, 0xe385dc1a50114cc8UL, - 0x2348698ac8fc4f00UL, 0x2a77a55284dd40d8UL, - /* 234 */ 0xfe06afe0c98c6ce4UL, 0xc235df96dddfd6e4UL, - 0x1428d01e33bf1ed3UL, 0x785768ec9300bdafUL, - /* 235 */ 0x9702e57a91deb63bUL, 0x61bdb8bfe5ce8b80UL, - 0x645b426f3d1d58acUL, 0x4804a82227a557bcUL, - /* 236 */ 0x8e57048ab44d2601UL, 0x68d6501a4b3a6935UL, - 0xc39c9ec3f9e1c293UL, 0x4172f257d4de63e2UL, - /* 237 */ 0xd368b450330c6401UL, 0x040d3017418f2391UL, - 0x2c34bb6090b7d90dUL, 0x16f649228fdfd51fUL, - /* 238 */ 0xbea6818e2b928ef5UL, 0xe28ccf91cdc11e72UL, - 0x594aaa68e77a36cdUL, 0x313034806c7ffd0fUL, - /* 239 */ 0x8a9d27ac2249bd65UL, 0x19a3b464018e9512UL, - 0xc26ccff352b37ec7UL, 0x056f68341d797b21UL, - /* 240 */ 0x5e79d6757efd2327UL, 0xfabdbcb6553afe15UL, - 0xd3e7222c6eaf5a60UL, 0x7046c76d4dae743bUL, - /* 241 */ 0x660be872b18d4a55UL, 0x19992518574e1496UL, - 0xc103053a302bdcbbUL, 0x3ed8e9800b218e8eUL, - /* 242 */ 0x7b0b9239fa75e03eUL, 0xefe9fb684633c083UL, - 0x98a35fbe391a7793UL, 0x6065510fe2d0fe34UL, - /* 243 */ 0x55cb668548abad0cUL, 0xb4584548da87e527UL, - 0x2c43ecea0107c1ddUL, 0x526028809372de35UL, - /* 244 */ 0x3415c56af9213b1fUL, 
0x5bee1a4d017e98dbUL, - 0x13f6b105b5cf709bUL, 0x5ff20e3482b29ab6UL, - /* 245 */ 0x0aa29c75cc2e6c90UL, 0xfc7d73ca3a70e206UL, - 0x899fc38fc4b5c515UL, 0x250386b124ffc207UL, - /* 246 */ 0x54ea28d5ae3d2b56UL, 0x9913149dd6de60ceUL, - 0x16694fc58f06d6c1UL, 0x46b23975eb018fc7UL, - /* 247 */ 0x470a6a0fb4b7b4e2UL, 0x5d92475a8f7253deUL, - 0xabeee5b52fbd3adbUL, 0x7fa20801a0806968UL, - /* 248 */ 0x76f3faf19f7714d2UL, 0xb3e840c12f4660c3UL, - 0x0fb4cd8df212744eUL, 0x4b065a251d3a2dd2UL, - /* 249 */ 0x5cebde383d77cd4aUL, 0x6adf39df882c9cb1UL, - 0xa2dd242eb09af759UL, 0x3147c0e50e5f6422UL, - /* 250 */ 0x164ca5101d1350dbUL, 0xf8d13479c33fc962UL, - 0xe640ce4d13e5da08UL, 0x4bdee0c45061f8baUL, - /* 251 */ 0xd7c46dc1a4edb1c9UL, 0x5514d7b6437fd98aUL, - 0x58942f6bb2a1c00bUL, 0x2dffb2ab1d70710eUL, - /* 252 */ 0xccdfcf2fc18b6d68UL, 0xa8ebcba8b7806167UL, - 0x980697f95e2937e3UL, 0x02fbba1cd0126e8cUL -}; - -/* c is two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7] - * a is two 256-bit integers: a0[0:3] and a1[4:7] - * b is two 256-bit integers: b0[0:3] and b1[4:7] - */ -static void mul2_256x256_integer_adx(u64 *const c, const u64 *const a, - const u64 *const b) +static __always_inline u64 eq_mask(u64 a, u64 b) { - asm volatile( - "xorl %%r14d, %%r14d ;" - "movq (%1), %%rdx; " /* A[0] */ - "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ - "xorl %%r10d, %%r10d ;" - "movq %%r8, (%0) ;" - "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ - "adox %%r10, %%r15 ;" - "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ - "adox %%r8, %%rax ;" - "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ - "adox %%r10, %%rbx ;" - /******************************************/ - "adox %%r14, %%rcx ;" - - "movq 8(%1), %%rdx; " /* A[1] */ - "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ - "adox %%r15, %%r8 ;" - "movq %%r8, 8(%0) ;" - "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ - "adox %%r10, %%r9 ;" - "adcx %%r9, %%rax ;" - "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ - "adox %%r8, %%r11 ;" - "adcx %%r11, %%rbx ;" - "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ - "adox %%r10, %%r13 ;" - "adcx %%r13, %%rcx ;" - /******************************************/ - "adox %%r14, %%r15 ;" - "adcx %%r14, %%r15 ;" - - "movq 16(%1), %%rdx; " /* A[2] */ - "xorl %%r10d, %%r10d ;" - "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ - "adox %%rax, %%r8 ;" - "movq %%r8, 16(%0) ;" - "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ - "adox %%r10, %%r9 ;" - "adcx %%r9, %%rbx ;" - "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ - "adox %%r8, %%r11 ;" - "adcx %%r11, %%rcx ;" - "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ - "adox %%r10, %%r13 ;" - "adcx %%r13, %%r15 ;" - /******************************************/ - "adox %%r14, %%rax ;" - "adcx %%r14, %%rax ;" - - "movq 24(%1), %%rdx; " /* A[3] */ - "xorl %%r10d, %%r10d ;" - "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ - "adox %%rbx, %%r8 ;" - "movq %%r8, 24(%0) ;" - "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ - "adox %%r10, %%r9 ;" - "adcx %%r9, %%rcx ;" - "movq %%rcx, 32(%0) ;" - "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ - "adox %%r8, %%r11 ;" - "adcx %%r11, %%r15 ;" - "movq %%r15, 40(%0) ;" - "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ - "adox %%r10, %%r13 ;" - "adcx %%r13, %%rax ;" - "movq %%rax, 48(%0) ;" - /******************************************/ - "adox %%r14, %%rbx ;" - "adcx %%r14, %%rbx ;" - "movq %%rbx, 56(%0) ;" - - "movq 32(%1), %%rdx; " /* C[0] */ - "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */ - "xorl %%r10d, %%r10d ;" - "movq %%r8, 64(%0);" - "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ - "adox %%r10, %%r15 
;" - "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ - "adox %%r8, %%rax ;" - "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ - "adox %%r10, %%rbx ;" - /******************************************/ - "adox %%r14, %%rcx ;" - - "movq 40(%1), %%rdx; " /* C[1] */ - "xorl %%r10d, %%r10d ;" - "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ - "adox %%r15, %%r8 ;" - "movq %%r8, 72(%0);" - "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ - "adox %%r10, %%r9 ;" - "adcx %%r9, %%rax ;" - "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ - "adox %%r8, %%r11 ;" - "adcx %%r11, %%rbx ;" - "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */ - "adox %%r10, %%r13 ;" - "adcx %%r13, %%rcx ;" - /******************************************/ - "adox %%r14, %%r15 ;" - "adcx %%r14, %%r15 ;" - - "movq 48(%1), %%rdx; " /* C[2] */ - "xorl %%r10d, %%r10d ;" - "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ - "adox %%rax, %%r8 ;" - "movq %%r8, 80(%0);" - "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ - "adox %%r10, %%r9 ;" - "adcx %%r9, %%rbx ;" - "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ - "adox %%r8, %%r11 ;" - "adcx %%r11, %%rcx ;" - "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ - "adox %%r10, %%r13 ;" - "adcx %%r13, %%r15 ;" - /******************************************/ - "adox %%r14, %%rax ;" - "adcx %%r14, %%rax ;" - - "movq 56(%1), %%rdx; " /* C[3] */ - "xorl %%r10d, %%r10d ;" - "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ - "adox %%rbx, %%r8 ;" - "movq %%r8, 88(%0);" - "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ - "adox %%r10, %%r9 ;" - "adcx %%r9, %%rcx ;" - "movq %%rcx, 96(%0) ;" - "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ - "adox %%r8, %%r11 ;" - "adcx %%r11, %%r15 ;" - "movq %%r15, 104(%0) ;" - "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ - "adox %%r10, %%r13 ;" - "adcx %%r13, %%rax ;" - "movq %%rax, 112(%0) ;" - /******************************************/ - "adox %%r14, %%rbx ;" - "adcx %%r14, %%rbx ;" - "movq %%rbx, 120(%0) ;" - : - : "r"(c), "r"(a), "r"(b) - : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", - "%r10", "%r11", "%r13", "%r14", "%r15"); + u64 x = a ^ b; + u64 minus_x = ~x + (u64)1U; + u64 x_or_minus_x = x | minus_x; + u64 xnx = x_or_minus_x >> (u32)63U; + return xnx - (u64)1U; } -static void mul2_256x256_integer_bmi2(u64 *const c, const u64 *const a, - const u64 *const b) +static __always_inline u64 gte_mask(u64 a, u64 b) { - asm volatile( - "movq (%1), %%rdx; " /* A[0] */ - "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ - "movq %%r8, (%0) ;" - "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ - "addq %%r10, %%r15 ;" - "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ - "adcq %%r8, %%rax ;" - "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ - "adcq %%r10, %%rbx ;" - /******************************************/ - "adcq $0, %%rcx ;" - - "movq 8(%1), %%rdx; " /* A[1] */ - "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ - "addq %%r15, %%r8 ;" - "movq %%r8, 8(%0) ;" - "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ - "adcq %%r10, %%r9 ;" - "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ - "adcq %%r8, %%r11 ;" - "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ - "adcq %%r10, %%r13 ;" - /******************************************/ - "adcq $0, %%r15 ;" - - "addq %%r9, %%rax ;" - "adcq %%r11, %%rbx ;" - "adcq %%r13, %%rcx ;" - "adcq $0, %%r15 ;" - - "movq 16(%1), %%rdx; " /* A[2] */ - "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ - "addq %%rax, %%r8 ;" - "movq %%r8, 16(%0) ;" - "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ - "adcq %%r10, %%r9 ;" - "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ - "adcq %%r8, %%r11 ;" - "mulx 24(%2), %%r10, %%rax; 
" /* A[2]*B[3] */ - "adcq %%r10, %%r13 ;" - /******************************************/ - "adcq $0, %%rax ;" - - "addq %%r9, %%rbx ;" - "adcq %%r11, %%rcx ;" - "adcq %%r13, %%r15 ;" - "adcq $0, %%rax ;" - - "movq 24(%1), %%rdx; " /* A[3] */ - "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ - "addq %%rbx, %%r8 ;" - "movq %%r8, 24(%0) ;" - "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ - "adcq %%r10, %%r9 ;" - "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ - "adcq %%r8, %%r11 ;" - "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ - "adcq %%r10, %%r13 ;" - /******************************************/ - "adcq $0, %%rbx ;" - - "addq %%r9, %%rcx ;" - "movq %%rcx, 32(%0) ;" - "adcq %%r11, %%r15 ;" - "movq %%r15, 40(%0) ;" - "adcq %%r13, %%rax ;" - "movq %%rax, 48(%0) ;" - "adcq $0, %%rbx ;" - "movq %%rbx, 56(%0) ;" - - "movq 32(%1), %%rdx; " /* C[0] */ - "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */ - "movq %%r8, 64(%0) ;" - "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ - "addq %%r10, %%r15 ;" - "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ - "adcq %%r8, %%rax ;" - "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ - "adcq %%r10, %%rbx ;" - /******************************************/ - "adcq $0, %%rcx ;" - - "movq 40(%1), %%rdx; " /* C[1] */ - "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ - "addq %%r15, %%r8 ;" - "movq %%r8, 72(%0) ;" - "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ - "adcq %%r10, %%r9 ;" - "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ - "adcq %%r8, %%r11 ;" - "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */ - "adcq %%r10, %%r13 ;" - /******************************************/ - "adcq $0, %%r15 ;" - - "addq %%r9, %%rax ;" - "adcq %%r11, %%rbx ;" - "adcq %%r13, %%rcx ;" - "adcq $0, %%r15 ;" - - "movq 48(%1), %%rdx; " /* C[2] */ - "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ - "addq %%rax, %%r8 ;" - "movq %%r8, 80(%0) ;" - "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ - "adcq %%r10, %%r9 ;" - "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ - "adcq %%r8, %%r11 ;" - "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ - "adcq %%r10, %%r13 ;" - /******************************************/ - "adcq $0, %%rax ;" - - "addq %%r9, %%rbx ;" - "adcq %%r11, %%rcx ;" - "adcq %%r13, %%r15 ;" - "adcq $0, %%rax ;" - - "movq 56(%1), %%rdx; " /* C[3] */ - "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ - "addq %%rbx, %%r8 ;" - "movq %%r8, 88(%0) ;" - "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ - "adcq %%r10, %%r9 ;" - "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ - "adcq %%r8, %%r11 ;" - "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ - "adcq %%r10, %%r13 ;" - /******************************************/ - "adcq $0, %%rbx ;" - - "addq %%r9, %%rcx ;" - "movq %%rcx, 96(%0) ;" - "adcq %%r11, %%r15 ;" - "movq %%r15, 104(%0) ;" - "adcq %%r13, %%rax ;" - "movq %%rax, 112(%0) ;" - "adcq $0, %%rbx ;" - "movq %%rbx, 120(%0) ;" - : - : "r"(c), "r"(a), "r"(b) - : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", - "%r10", "%r11", "%r13", "%r15"); + u64 x = a; + u64 y = b; + u64 x_xor_y = x ^ y; + u64 x_sub_y = x - y; + u64 x_sub_y_xor_y = x_sub_y ^ y; + u64 q = x_xor_y | x_sub_y_xor_y; + u64 x_xor_q = x ^ q; + u64 x_xor_q_ = x_xor_q >> (u32)63U; + return x_xor_q_ - (u64)1U; } -static void sqr2_256x256_integer_adx(u64 *const c, const u64 *const a) +/* Computes the addition of four-element f1 with value in f2 + * and returns the carry (if any) */ +static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2) { - asm volatile( - "movq (%1), %%rdx ;" /* A[0] */ - "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ - "xorl %%r15d, %%r15d;" - "mulx 
16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ - "adcx %%r14, %%r9 ;" - "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ - "adcx %%rax, %%r10 ;" - "movq 24(%1), %%rdx ;" /* A[3] */ - "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */ - "adcx %%rcx, %%r11 ;" - "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ - "adcx %%rax, %%rbx ;" - "movq 8(%1), %%rdx ;" /* A[1] */ - "adcx %%r15, %%r13 ;" - "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ - "movq $0, %%r14 ;" - /******************************************/ - "adcx %%r15, %%r14 ;" - - "xorl %%r15d, %%r15d;" - "adox %%rax, %%r10 ;" - "adcx %%r8, %%r8 ;" - "adox %%rcx, %%r11 ;" - "adcx %%r9, %%r9 ;" - "adox %%r15, %%rbx ;" - "adcx %%r10, %%r10 ;" - "adox %%r15, %%r13 ;" - "adcx %%r11, %%r11 ;" - "adox %%r15, %%r14 ;" - "adcx %%rbx, %%rbx ;" - "adcx %%r13, %%r13 ;" - "adcx %%r14, %%r14 ;" - - "movq (%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ - /*******************/ - "movq %%rax, 0(%0) ;" - "addq %%rcx, %%r8 ;" - "movq %%r8, 8(%0) ;" - "movq 8(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ - "adcq %%rax, %%r9 ;" - "movq %%r9, 16(%0) ;" - "adcq %%rcx, %%r10 ;" - "movq %%r10, 24(%0) ;" - "movq 16(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ - "adcq %%rax, %%r11 ;" - "movq %%r11, 32(%0) ;" - "adcq %%rcx, %%rbx ;" - "movq %%rbx, 40(%0) ;" - "movq 24(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ - "adcq %%rax, %%r13 ;" - "movq %%r13, 48(%0) ;" - "adcq %%rcx, %%r14 ;" - "movq %%r14, 56(%0) ;" - - - "movq 32(%1), %%rdx ;" /* B[0] */ - "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */ - "xorl %%r15d, %%r15d;" - "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */ - "adcx %%r14, %%r9 ;" - "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */ - "adcx %%rax, %%r10 ;" - "movq 56(%1), %%rdx ;" /* B[3] */ - "mulx 40(%1), %%r11, %%rbx ;" /* B[1]*B[3] */ - "adcx %%rcx, %%r11 ;" - "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */ - "adcx %%rax, %%rbx ;" - "movq 40(%1), %%rdx ;" /* B[1] */ - "adcx %%r15, %%r13 ;" - "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */ - "movq $0, %%r14 ;" - /******************************************/ - "adcx %%r15, %%r14 ;" - - "xorl %%r15d, %%r15d;" - "adox %%rax, %%r10 ;" - "adcx %%r8, %%r8 ;" - "adox %%rcx, %%r11 ;" - "adcx %%r9, %%r9 ;" - "adox %%r15, %%rbx ;" - "adcx %%r10, %%r10 ;" - "adox %%r15, %%r13 ;" - "adcx %%r11, %%r11 ;" - "adox %%r15, %%r14 ;" - "adcx %%rbx, %%rbx ;" - "adcx %%r13, %%r13 ;" - "adcx %%r14, %%r14 ;" - - "movq 32(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */ - /*******************/ - "movq %%rax, 64(%0) ;" - "addq %%rcx, %%r8 ;" - "movq %%r8, 72(%0) ;" - "movq 40(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */ - "adcq %%rax, %%r9 ;" - "movq %%r9, 80(%0) ;" - "adcq %%rcx, %%r10 ;" - "movq %%r10, 88(%0) ;" - "movq 48(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */ - "adcq %%rax, %%r11 ;" - "movq %%r11, 96(%0) ;" - "adcq %%rcx, %%rbx ;" - "movq %%rbx, 104(%0) ;" - "movq 56(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */ - "adcq %%rax, %%r13 ;" - "movq %%r13, 112(%0) ;" - "adcq %%rcx, %%r14 ;" - "movq %%r14, 120(%0) ;" - : - : "r"(c), "r"(a) - : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", - "%r10", "%r11", "%r13", "%r14", "%r15"); -} + u64 carry_r; -static void sqr2_256x256_integer_bmi2(u64 *const c, const u64 *const a) -{ asm volatile( - "movq 8(%1), %%rdx ;" /* A[1] */ - "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */ - "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */ - "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */ - - "movq 16(%1), %%rdx ;" /* A[2] */ - 
"mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */ - "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */ - - "addq %%rax, %%r9 ;" - "adcq %%rdx, %%r10 ;" - "adcq %%rcx, %%r11 ;" - "adcq %%r14, %%r15 ;" - "adcq $0, %%r13 ;" - "movq $0, %%r14 ;" - "adcq $0, %%r14 ;" - - "movq (%1), %%rdx ;" /* A[0] */ - "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */ - - "addq %%rax, %%r10 ;" - "adcq %%rcx, %%r11 ;" - "adcq $0, %%r15 ;" - "adcq $0, %%r13 ;" - "adcq $0, %%r14 ;" - - "shldq $1, %%r13, %%r14 ;" - "shldq $1, %%r15, %%r13 ;" - "shldq $1, %%r11, %%r15 ;" - "shldq $1, %%r10, %%r11 ;" - "shldq $1, %%r9, %%r10 ;" - "shldq $1, %%r8, %%r9 ;" - "shlq $1, %%r8 ;" - - /*******************/ - "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */ - /*******************/ - "movq %%rax, 0(%0) ;" - "addq %%rcx, %%r8 ;" - "movq %%r8, 8(%0) ;" - "movq 8(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */ - "adcq %%rax, %%r9 ;" - "movq %%r9, 16(%0) ;" - "adcq %%rcx, %%r10 ;" - "movq %%r10, 24(%0) ;" - "movq 16(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */ - "adcq %%rax, %%r11 ;" - "movq %%r11, 32(%0) ;" - "adcq %%rcx, %%r15 ;" - "movq %%r15, 40(%0) ;" - "movq 24(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */ - "adcq %%rax, %%r13 ;" - "movq %%r13, 48(%0) ;" - "adcq %%rcx, %%r14 ;" - "movq %%r14, 56(%0) ;" - - "movq 40(%1), %%rdx ;" /* B[1] */ - "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */ - "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */ - "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */ - - "movq 48(%1), %%rdx ;" /* B[2] */ - "mulx 56(%1), %%r15, %%r13 ;" /* B[3]*B[2] */ - "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */ - - "addq %%rax, %%r9 ;" - "adcq %%rdx, %%r10 ;" - "adcq %%rcx, %%r11 ;" - "adcq %%r14, %%r15 ;" - "adcq $0, %%r13 ;" - "movq $0, %%r14 ;" - "adcq $0, %%r14 ;" - - "movq 32(%1), %%rdx ;" /* B[0] */ - "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */ - - "addq %%rax, %%r10 ;" - "adcq %%rcx, %%r11 ;" - "adcq $0, %%r15 ;" - "adcq $0, %%r13 ;" - "adcq $0, %%r14 ;" - - "shldq $1, %%r13, %%r14 ;" - "shldq $1, %%r15, %%r13 ;" - "shldq $1, %%r11, %%r15 ;" - "shldq $1, %%r10, %%r11 ;" - "shldq $1, %%r9, %%r10 ;" - "shldq $1, %%r8, %%r9 ;" - "shlq $1, %%r8 ;" - - /*******************/ - "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */ - /*******************/ - "movq %%rax, 64(%0) ;" - "addq %%rcx, %%r8 ;" - "movq %%r8, 72(%0) ;" - "movq 40(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */ - "adcq %%rax, %%r9 ;" - "movq %%r9, 80(%0) ;" - "adcq %%rcx, %%r10 ;" - "movq %%r10, 88(%0) ;" - "movq 48(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */ - "adcq %%rax, %%r11 ;" - "movq %%r11, 96(%0) ;" - "adcq %%rcx, %%r15 ;" - "movq %%r15, 104(%0) ;" - "movq 56(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */ - "adcq %%rax, %%r13 ;" - "movq %%r13, 112(%0) ;" - "adcq %%rcx, %%r14 ;" - "movq %%r14, 120(%0) ;" - : - : "r"(c), "r"(a) - : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", - "%r11", "%r13", "%r14", "%r15"); + /* Clear registers to propagate the carry bit */ + " xor %%r8d, %%r8d;" + " xor %%r9d, %%r9d;" + " xor %%r10d, %%r10d;" + " xor %%r11d, %%r11d;" + " xor %k1, %k1;" + + /* Begin addition chain */ + " addq 0(%3), %0;" + " movq %0, 0(%2);" + " adcxq 8(%3), %%r8;" + " movq %%r8, 8(%2);" + " adcxq 16(%3), %%r9;" + " movq %%r9, 16(%2);" + " adcxq 24(%3), %%r10;" + " movq %%r10, 24(%2);" + + /* Return the carry bit in a register */ + " adcx %%r11, %1;" + : "+&r"(f2), "=&r"(carry_r) + : "r"(out), "r"(f1) + : "%r8", "%r9", "%r10", "%r11", "memory", "cc"); + + return carry_r; } -static 
void red_eltfp25519_2w_adx(u64 *const c, const u64 *const a) +/* Computes the field addition of two field elements */ +static inline void fadd(u64 *out, const u64 *f1, const u64 *f2) { asm volatile( - "movl $38, %%edx; " /* 2*c = 38 = 2^256 */ - "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */ - "xorl %%ebx, %%ebx ;" - "adox (%1), %%r8 ;" - "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */ - "adcx %%r10, %%r9 ;" - "adox 8(%1), %%r9 ;" - "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */ - "adcx %%r11, %%r10 ;" - "adox 16(%1), %%r10 ;" - "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */ - "adcx %%rax, %%r11 ;" - "adox 24(%1), %%r11 ;" - /***************************************/ - "adcx %%rbx, %%rcx ;" - "adox %%rbx, %%rcx ;" - "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ - "adcx %%rcx, %%r8 ;" - "adcx %%rbx, %%r9 ;" - "movq %%r9, 8(%0) ;" - "adcx %%rbx, %%r10 ;" - "movq %%r10, 16(%0) ;" - "adcx %%rbx, %%r11 ;" - "movq %%r11, 24(%0) ;" - "mov $0, %%ecx ;" - "cmovc %%edx, %%ecx ;" - "addq %%rcx, %%r8 ;" - "movq %%r8, (%0) ;" - - "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */ - "xorl %%ebx, %%ebx ;" - "adox 64(%1), %%r8 ;" - "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */ - "adcx %%r10, %%r9 ;" - "adox 72(%1), %%r9 ;" - "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */ - "adcx %%r11, %%r10 ;" - "adox 80(%1), %%r10 ;" - "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */ - "adcx %%rax, %%r11 ;" - "adox 88(%1), %%r11 ;" - /****************************************/ - "adcx %%rbx, %%rcx ;" - "adox %%rbx, %%rcx ;" - "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ - "adcx %%rcx, %%r8 ;" - "adcx %%rbx, %%r9 ;" - "movq %%r9, 40(%0) ;" - "adcx %%rbx, %%r10 ;" - "movq %%r10, 48(%0) ;" - "adcx %%rbx, %%r11 ;" - "movq %%r11, 56(%0) ;" - "mov $0, %%ecx ;" - "cmovc %%edx, %%ecx ;" - "addq %%rcx, %%r8 ;" - "movq %%r8, 32(%0) ;" - : - : "r"(c), "r"(a) - : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", - "%r10", "%r11"); + /* Compute the raw addition of f1 + f2 */ + " movq 0(%0), %%r8;" + " addq 0(%2), %%r8;" + " movq 8(%0), %%r9;" + " adcxq 8(%2), %%r9;" + " movq 16(%0), %%r10;" + " adcxq 16(%2), %%r10;" + " movq 24(%0), %%r11;" + " adcxq 24(%2), %%r11;" + + /* Wrap the result back into the field */ + + /* Step 1: Compute carry*38 */ + " mov $0, %%rax;" + " mov $38, %0;" + " cmovc %0, %%rax;" + + /* Step 2: Add carry*38 to the original sum */ + " xor %%ecx, %%ecx;" + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 8(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 16(%1);" + " adcx %%rcx, %%r11;" + " movq %%r11, 24(%1);" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %0, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%1);" + : "+&r"(f2) + : "r"(out), "r"(f1) + : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"); } -static void red_eltfp25519_2w_bmi2(u64 *const c, const u64 *const a) +/* Computes the field subtraction of two field elements */ +static inline void fsub(u64 *out, const u64 *f1, const u64 *f2) { asm volatile( - "movl $38, %%edx ; " /* 2*c = 38 = 2^256 */ - "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ - "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ - "addq %%r10, %%r9 ;" - "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ - "adcq %%r11, %%r10 ;" - "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ - "adcq %%rax, %%r11 ;" - /***************************************/ - "adcq $0, %%rcx ;" - "addq (%1), %%r8 ;" - "adcq 8(%1), %%r9 ;" - "adcq 16(%1), %%r10 ;" - "adcq 24(%1), %%r11 ;" - "adcq $0, %%rcx ;" - "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ - "addq %%rcx, %%r8 ;" - "adcq 
$0, %%r9 ;" - "movq %%r9, 8(%0) ;" - "adcq $0, %%r10 ;" - "movq %%r10, 16(%0) ;" - "adcq $0, %%r11 ;" - "movq %%r11, 24(%0) ;" - "mov $0, %%ecx ;" - "cmovc %%edx, %%ecx ;" - "addq %%rcx, %%r8 ;" - "movq %%r8, (%0) ;" - - "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */ - "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */ - "addq %%r10, %%r9 ;" - "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */ - "adcq %%r11, %%r10 ;" - "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */ - "adcq %%rax, %%r11 ;" - /****************************************/ - "adcq $0, %%rcx ;" - "addq 64(%1), %%r8 ;" - "adcq 72(%1), %%r9 ;" - "adcq 80(%1), %%r10 ;" - "adcq 88(%1), %%r11 ;" - "adcq $0, %%rcx ;" - "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ - "addq %%rcx, %%r8 ;" - "adcq $0, %%r9 ;" - "movq %%r9, 40(%0) ;" - "adcq $0, %%r10 ;" - "movq %%r10, 48(%0) ;" - "adcq $0, %%r11 ;" - "movq %%r11, 56(%0) ;" - "mov $0, %%ecx ;" - "cmovc %%edx, %%ecx ;" - "addq %%rcx, %%r8 ;" - "movq %%r8, 32(%0) ;" + /* Compute the raw subtraction of f1-f2 */ + " movq 0(%1), %%r8;" + " subq 0(%2), %%r8;" + " movq 8(%1), %%r9;" + " sbbq 8(%2), %%r9;" + " movq 16(%1), %%r10;" + " sbbq 16(%2), %%r10;" + " movq 24(%1), %%r11;" + " sbbq 24(%2), %%r11;" + + /* Wrap the result back into the field */ + + /* Step 1: Compute carry*38 */ + " mov $0, %%rax;" + " mov $38, %%rcx;" + " cmovc %%rcx, %%rax;" + + /* Step 2: Subtract carry*38 from the original difference */ + " sub %%rax, %%r8;" + " sbb $0, %%r9;" + " sbb $0, %%r10;" + " sbb $0, %%r11;" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %%rcx, %%rax;" + " sub %%rax, %%r8;" + + /* Store the result */ + " movq %%r8, 0(%0);" + " movq %%r9, 8(%0);" + " movq %%r10, 16(%0);" + " movq %%r11, 24(%0);" : - : "r"(c), "r"(a) - : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", - "%r11"); + : "r"(out), "r"(f1), "r"(f2) + : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"); } -static void mul_256x256_integer_adx(u64 *const c, const u64 *const a, - const u64 *const b) +/* Computes a field multiplication: out <- f1 * f2 + * Uses the 8-element buffer tmp for intermediate results */ +static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) { asm volatile( - "movq (%1), %%rdx; " /* A[0] */ - "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */ - "xorl %%r10d, %%r10d ;" - "movq %%r8, (%0) ;" - "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */ - "adox %%r9, %%r10 ;" - "movq %%r10, 8(%0) ;" - "mulx 16(%2), %%r15, %%r13; " /* A[0]*B[2] */ - "adox %%r11, %%r15 ;" - "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */ - "adox %%r13, %%r14 ;" - "movq $0, %%rax ;" - /******************************************/ - "adox %%rdx, %%rax ;" - - "movq 8(%1), %%rdx; " /* A[1] */ - "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ - "xorl %%r10d, %%r10d ;" - "adcx 8(%0), %%r8 ;" - "movq %%r8, 8(%0) ;" - "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ - "adox %%r9, %%r10 ;" - "adcx %%r15, %%r10 ;" - "movq %%r10, 16(%0) ;" - "mulx 16(%2), %%r15, %%r13; " /* A[1]*B[2] */ - "adox %%r11, %%r15 ;" - "adcx %%r14, %%r15 ;" - "movq $0, %%r8 ;" - "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */ - "adox %%r13, %%r14 ;" - "adcx %%rax, %%r14 ;" - "movq $0, %%rax ;" - /******************************************/ - "adox %%rdx, %%rax ;" - "adcx %%r8, %%rax ;" - - "movq 16(%1), %%rdx; " /* A[2] */ - "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ - "xorl %%r10d, %%r10d ;" - "adcx 16(%0), %%r8 ;" - "movq %%r8, 16(%0) ;" - "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ - "adox %%r9, %%r10 ;" - "adcx %%r15, %%r10 ;" 
- "movq %%r10, 24(%0) ;" - "mulx 16(%2), %%r15, %%r13; " /* A[2]*B[2] */ - "adox %%r11, %%r15 ;" - "adcx %%r14, %%r15 ;" - "movq $0, %%r8 ;" - "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */ - "adox %%r13, %%r14 ;" - "adcx %%rax, %%r14 ;" - "movq $0, %%rax ;" - /******************************************/ - "adox %%rdx, %%rax ;" - "adcx %%r8, %%rax ;" - - "movq 24(%1), %%rdx; " /* A[3] */ - "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ - "xorl %%r10d, %%r10d ;" - "adcx 24(%0), %%r8 ;" - "movq %%r8, 24(%0) ;" - "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ - "adox %%r9, %%r10 ;" - "adcx %%r15, %%r10 ;" - "movq %%r10, 32(%0) ;" - "mulx 16(%2), %%r15, %%r13; " /* A[3]*B[2] */ - "adox %%r11, %%r15 ;" - "adcx %%r14, %%r15 ;" - "movq %%r15, 40(%0) ;" - "movq $0, %%r8 ;" - "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */ - "adox %%r13, %%r14 ;" - "adcx %%rax, %%r14 ;" - "movq %%r14, 48(%0) ;" - "movq $0, %%rax ;" - /******************************************/ - "adox %%rdx, %%rax ;" - "adcx %%r8, %%rax ;" - "movq %%rax, 56(%0) ;" - : - : "r"(c), "r"(a), "r"(b) - : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", - "%r13", "%r14", "%r15"); -} -static void mul_256x256_integer_bmi2(u64 *const c, const u64 *const a, - const u64 *const b) -{ - asm volatile( - "movq (%1), %%rdx; " /* A[0] */ - "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ - "movq %%r8, (%0) ;" - "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ - "addq %%r10, %%r15 ;" - "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ - "adcq %%r8, %%rax ;" - "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ - "adcq %%r10, %%rbx ;" - /******************************************/ - "adcq $0, %%rcx ;" - - "movq 8(%1), %%rdx; " /* A[1] */ - "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ - "addq %%r15, %%r8 ;" - "movq %%r8, 8(%0) ;" - "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ - "adcq %%r10, %%r9 ;" - "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ - "adcq %%r8, %%r11 ;" - "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ - "adcq %%r10, %%r13 ;" - /******************************************/ - "adcq $0, %%r15 ;" - - "addq %%r9, %%rax ;" - "adcq %%r11, %%rbx ;" - "adcq %%r13, %%rcx ;" - "adcq $0, %%r15 ;" - - "movq 16(%1), %%rdx; " /* A[2] */ - "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ - "addq %%rax, %%r8 ;" - "movq %%r8, 16(%0) ;" - "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ - "adcq %%r10, %%r9 ;" - "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ - "adcq %%r8, %%r11 ;" - "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ - "adcq %%r10, %%r13 ;" - /******************************************/ - "adcq $0, %%rax ;" - - "addq %%r9, %%rbx ;" - "adcq %%r11, %%rcx ;" - "adcq %%r13, %%r15 ;" - "adcq $0, %%rax ;" - - "movq 24(%1), %%rdx; " /* A[3] */ - "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ - "addq %%rbx, %%r8 ;" - "movq %%r8, 24(%0) ;" - "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ - "adcq %%r10, %%r9 ;" - "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ - "adcq %%r8, %%r11 ;" - "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ - "adcq %%r10, %%r13 ;" - /******************************************/ - "adcq $0, %%rbx ;" - - "addq %%r9, %%rcx ;" - "movq %%rcx, 32(%0) ;" - "adcq %%r11, %%r15 ;" - "movq %%r15, 40(%0) ;" - "adcq %%r13, %%rax ;" - "movq %%rax, 48(%0) ;" - "adcq $0, %%rbx ;" - "movq %%rbx, 56(%0) ;" - : - : "r"(c), "r"(a), "r"(b) - : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", - "%r10", "%r11", "%r13", "%r15"); + /* Compute the raw multiplication: tmp <- src1 * src2 */ + + /* Compute src1[0] * src2 */ + " movq 0(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, 
%%r10d;" + " movq %%r8, 0(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " movq %%r10, 8(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + + /* Compute src1[1] * src2 */ + " movq 8(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 8(%2), %%r8;" + " movq %%r8, 8(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 16(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + + /* Compute src1[2] * src2 */ + " movq 16(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 16(%2), %%r8;" + " movq %%r8, 16(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 24(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + + /* Compute src1[3] * src2 */ + " movq 24(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 24(%2), %%r8;" + " movq %%r8, 24(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 32(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " movq %%rbx, 40(%2);" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " movq %%r14, 48(%2);" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq %%rax, 56(%2);" + + /* Line up pointers */ + " mov %2, %0;" + " mov %3, %2;" + + /* Wrap the result back into the field */ + + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ + " mov $38, %%rdx;" + " mulxq 32(%0), %%r8, %%r13;" + " xor %k1, %k1;" + " adoxq 0(%0), %%r8;" + " mulxq 40(%0), %%r9, %%rbx;" + " adcx %%r13, %%r9;" + " adoxq 8(%0), %%r9;" + " mulxq 48(%0), %%r10, %%r13;" + " adcx %%rbx, %%r10;" + " adoxq 16(%0), %%r10;" + " mulxq 56(%0), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 24(%0), %%r11;" + " adcx %1, %%rax;" + " adox %1, %%rax;" + " imul %%rdx, %%rax;" + + /* Step 2: Fold the carry back into dst */ + " add %%rax, %%r8;" + " adcx %1, %%r9;" + " movq %%r9, 8(%2);" + " adcx %1, %%r10;" + " movq %%r10, 16(%2);" + " adcx %1, %%r11;" + " movq %%r11, 24(%2);" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%2);" + : "+&r"(f1), "+&r"(f2), "+&r"(tmp) + : "r"(out) + : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", + "%r14", "memory", "cc"); } -static void sqr_256x256_integer_adx(u64 *const c, const u64 *const a) +/* Computes two field multiplications: + * out[0] <- f1[0] * f2[0] + * out[1] <- f1[1] * f2[1] + * Uses the 16-element buffer tmp for intermediate results: */ +static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) { asm volatile( - "movq (%1), %%rdx ;" /* A[0] */ - "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ - "xorl %%r15d, %%r15d;" - "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ - "adcx %%r14, %%r9 ;" - "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ - "adcx %%rax, %%r10 ;" - 
"movq 24(%1), %%rdx ;" /* A[3] */ - "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */ - "adcx %%rcx, %%r11 ;" - "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ - "adcx %%rax, %%rbx ;" - "movq 8(%1), %%rdx ;" /* A[1] */ - "adcx %%r15, %%r13 ;" - "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ - "movq $0, %%r14 ;" - /******************************************/ - "adcx %%r15, %%r14 ;" - - "xorl %%r15d, %%r15d;" - "adox %%rax, %%r10 ;" - "adcx %%r8, %%r8 ;" - "adox %%rcx, %%r11 ;" - "adcx %%r9, %%r9 ;" - "adox %%r15, %%rbx ;" - "adcx %%r10, %%r10 ;" - "adox %%r15, %%r13 ;" - "adcx %%r11, %%r11 ;" - "adox %%r15, %%r14 ;" - "adcx %%rbx, %%rbx ;" - "adcx %%r13, %%r13 ;" - "adcx %%r14, %%r14 ;" - - "movq (%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ - /*******************/ - "movq %%rax, 0(%0) ;" - "addq %%rcx, %%r8 ;" - "movq %%r8, 8(%0) ;" - "movq 8(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ - "adcq %%rax, %%r9 ;" - "movq %%r9, 16(%0) ;" - "adcq %%rcx, %%r10 ;" - "movq %%r10, 24(%0) ;" - "movq 16(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ - "adcq %%rax, %%r11 ;" - "movq %%r11, 32(%0) ;" - "adcq %%rcx, %%rbx ;" - "movq %%rbx, 40(%0) ;" - "movq 24(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ - "adcq %%rax, %%r13 ;" - "movq %%r13, 48(%0) ;" - "adcq %%rcx, %%r14 ;" - "movq %%r14, 56(%0) ;" - : - : "r"(c), "r"(a) - : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", - "%r10", "%r11", "%r13", "%r14", "%r15"); -} -static void sqr_256x256_integer_bmi2(u64 *const c, const u64 *const a) -{ - asm volatile( - "movq 8(%1), %%rdx ;" /* A[1] */ - "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */ - "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */ - "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */ - - "movq 16(%1), %%rdx ;" /* A[2] */ - "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */ - "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */ - - "addq %%rax, %%r9 ;" - "adcq %%rdx, %%r10 ;" - "adcq %%rcx, %%r11 ;" - "adcq %%r14, %%r15 ;" - "adcq $0, %%r13 ;" - "movq $0, %%r14 ;" - "adcq $0, %%r14 ;" - - "movq (%1), %%rdx ;" /* A[0] */ - "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */ - - "addq %%rax, %%r10 ;" - "adcq %%rcx, %%r11 ;" - "adcq $0, %%r15 ;" - "adcq $0, %%r13 ;" - "adcq $0, %%r14 ;" - - "shldq $1, %%r13, %%r14 ;" - "shldq $1, %%r15, %%r13 ;" - "shldq $1, %%r11, %%r15 ;" - "shldq $1, %%r10, %%r11 ;" - "shldq $1, %%r9, %%r10 ;" - "shldq $1, %%r8, %%r9 ;" - "shlq $1, %%r8 ;" - - /*******************/ - "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ - /*******************/ - "movq %%rax, 0(%0) ;" - "addq %%rcx, %%r8 ;" - "movq %%r8, 8(%0) ;" - "movq 8(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ - "adcq %%rax, %%r9 ;" - "movq %%r9, 16(%0) ;" - "adcq %%rcx, %%r10 ;" - "movq %%r10, 24(%0) ;" - "movq 16(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ - "adcq %%rax, %%r11 ;" - "movq %%r11, 32(%0) ;" - "adcq %%rcx, %%r15 ;" - "movq %%r15, 40(%0) ;" - "movq 24(%1), %%rdx ;" - "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ - "adcq %%rax, %%r13 ;" - "movq %%r13, 48(%0) ;" - "adcq %%rcx, %%r14 ;" - "movq %%r14, 56(%0) ;" - : - : "r"(c), "r"(a) - : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", - "%r11", "%r13", "%r14", "%r15"); + /* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */ + + /* Compute src1[0] * src2 */ + " movq 0(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " movq %%r8, 0(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " movq %%r10, 8(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " 
mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + + /* Compute src1[1] * src2 */ + " movq 8(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 8(%2), %%r8;" + " movq %%r8, 8(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 16(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + + /* Compute src1[2] * src2 */ + " movq 16(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 16(%2), %%r8;" + " movq %%r8, 16(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 24(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + + /* Compute src1[3] * src2 */ + " movq 24(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 24(%2), %%r8;" + " movq %%r8, 24(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 32(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " movq %%rbx, 40(%2);" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " movq %%r14, 48(%2);" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq %%rax, 56(%2);" + + /* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */ + + /* Compute src1[0] * src2 */ + " movq 32(%0), %%rdx;" + " mulxq 32(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " movq %%r8, 64(%2);" + " mulxq 40(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " movq %%r10, 72(%2);" + " mulxq 48(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " mulxq 56(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + + /* Compute src1[1] * src2 */ + " movq 40(%0), %%rdx;" + " mulxq 32(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 72(%2), %%r8;" + " movq %%r8, 72(%2);" + " mulxq 40(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 80(%2);" + " mulxq 48(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 56(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + + /* Compute src1[2] * src2 */ + " movq 48(%0), %%rdx;" + " mulxq 32(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 80(%2), %%r8;" + " movq %%r8, 80(%2);" + " mulxq 40(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 88(%2);" + " mulxq 48(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 56(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + + /* Compute src1[3] * src2 */ + " movq 56(%0), %%rdx;" + " mulxq 32(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 88(%2), %%r8;" + " movq %%r8, 88(%2);" + " mulxq 40(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 96(%2);" + " mulxq 48(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " movq %%rbx, 
104(%2);" + " mov $0, %%r8;" + " mulxq 56(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " movq %%r14, 112(%2);" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq %%rax, 120(%2);" + + /* Line up pointers */ + " mov %2, %0;" + " mov %3, %2;" + + /* Wrap the results back into the field */ + + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ + " mov $38, %%rdx;" + " mulxq 32(%0), %%r8, %%r13;" + " xor %k1, %k1;" + " adoxq 0(%0), %%r8;" + " mulxq 40(%0), %%r9, %%rbx;" + " adcx %%r13, %%r9;" + " adoxq 8(%0), %%r9;" + " mulxq 48(%0), %%r10, %%r13;" + " adcx %%rbx, %%r10;" + " adoxq 16(%0), %%r10;" + " mulxq 56(%0), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 24(%0), %%r11;" + " adcx %1, %%rax;" + " adox %1, %%rax;" + " imul %%rdx, %%rax;" + + /* Step 2: Fold the carry back into dst */ + " add %%rax, %%r8;" + " adcx %1, %%r9;" + " movq %%r9, 8(%2);" + " adcx %1, %%r10;" + " movq %%r10, 16(%2);" + " adcx %1, %%r11;" + " movq %%r11, 24(%2);" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%2);" + + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ + " mov $38, %%rdx;" + " mulxq 96(%0), %%r8, %%r13;" + " xor %k1, %k1;" + " adoxq 64(%0), %%r8;" + " mulxq 104(%0), %%r9, %%rbx;" + " adcx %%r13, %%r9;" + " adoxq 72(%0), %%r9;" + " mulxq 112(%0), %%r10, %%r13;" + " adcx %%rbx, %%r10;" + " adoxq 80(%0), %%r10;" + " mulxq 120(%0), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 88(%0), %%r11;" + " adcx %1, %%rax;" + " adox %1, %%rax;" + " imul %%rdx, %%rax;" + + /* Step 2: Fold the carry back into dst */ + " add %%rax, %%r8;" + " adcx %1, %%r9;" + " movq %%r9, 40(%2);" + " adcx %1, %%r10;" + " movq %%r10, 48(%2);" + " adcx %1, %%r11;" + " movq %%r11, 56(%2);" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 32(%2);" + : "+&r"(f1), "+&r"(f2), "+&r"(tmp) + : "r"(out) + : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", + "%r14", "memory", "cc"); } -static void red_eltfp25519_1w_adx(u64 *const c, const u64 *const a) +/* Computes the field multiplication of four-element f1 with value in f2 + * Requires f2 to be smaller than 2^17 */ +static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2) { - asm volatile( - "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */ - "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ - "xorl %%ebx, %%ebx ;" - "adox (%1), %%r8 ;" - "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ - "adcx %%r10, %%r9 ;" - "adox 8(%1), %%r9 ;" - "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ - "adcx %%r11, %%r10 ;" - "adox 16(%1), %%r10 ;" - "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ - "adcx %%rax, %%r11 ;" - "adox 24(%1), %%r11 ;" - /***************************************/ - "adcx %%rbx, %%rcx ;" - "adox %%rbx, %%rcx ;" - "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ - "adcx %%rcx, %%r8 ;" - "adcx %%rbx, %%r9 ;" - "movq %%r9, 8(%0) ;" - "adcx %%rbx, %%r10 ;" - "movq %%r10, 16(%0) ;" - "adcx %%rbx, %%r11 ;" - "movq %%r11, 24(%0) ;" - "mov $0, %%ecx ;" - "cmovc %%edx, %%ecx ;" - "addq %%rcx, %%r8 ;" - "movq %%r8, (%0) ;" - : - : "r"(c), "r"(a) - : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", - "%r10", "%r11"); -} + register u64 f2_r asm("rdx") = f2; -static void red_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a) -{ asm volatile( - "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */ - "mulx 
32(%1), %%r8, %%r10 ;" /* c*C[4] */ - "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ - "addq %%r10, %%r9 ;" - "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ - "adcq %%r11, %%r10 ;" - "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ - "adcq %%rax, %%r11 ;" - /***************************************/ - "adcq $0, %%rcx ;" - "addq (%1), %%r8 ;" - "adcq 8(%1), %%r9 ;" - "adcq 16(%1), %%r10 ;" - "adcq 24(%1), %%r11 ;" - "adcq $0, %%rcx ;" - "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ - "addq %%rcx, %%r8 ;" - "adcq $0, %%r9 ;" - "movq %%r9, 8(%0) ;" - "adcq $0, %%r10 ;" - "movq %%r10, 16(%0) ;" - "adcq $0, %%r11 ;" - "movq %%r11, 24(%0) ;" - "mov $0, %%ecx ;" - "cmovc %%edx, %%ecx ;" - "addq %%rcx, %%r8 ;" - "movq %%r8, (%0) ;" - : - : "r"(c), "r"(a) - : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", - "%r11"); + /* Compute the raw multiplication of f1*f2 */ + " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */ + " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */ + " add %%rcx, %%r9;" + " mov $0, %%rcx;" + " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */ + " adcx %%rbx, %%r10;" + " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */ + " adcx %%r13, %%r11;" + " adcx %%rcx, %%rax;" + + /* Wrap the result back into the field */ + + /* Step 1: Compute carry*38 */ + " mov $38, %%rdx;" + " imul %%rdx, %%rax;" + + /* Step 2: Fold the carry back into dst */ + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 8(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 16(%1);" + " adcx %%rcx, %%r11;" + " movq %%r11, 24(%1);" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%1);" + : "+&r"(f2_r) + : "r"(out), "r"(f1) + : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13", + "memory", "cc"); } -static __always_inline void -add_eltfp25519_1w_adx(u64 *const c, const u64 *const a, const u64 *const b) +/* Computes p1 <- bit ? 
p2 : p1 in constant time */ +static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2) { asm volatile( - "mov $38, %%eax ;" - "xorl %%ecx, %%ecx ;" - "movq (%2), %%r8 ;" - "adcx (%1), %%r8 ;" - "movq 8(%2), %%r9 ;" - "adcx 8(%1), %%r9 ;" - "movq 16(%2), %%r10 ;" - "adcx 16(%1), %%r10 ;" - "movq 24(%2), %%r11 ;" - "adcx 24(%1), %%r11 ;" - "cmovc %%eax, %%ecx ;" - "xorl %%eax, %%eax ;" - "adcx %%rcx, %%r8 ;" - "adcx %%rax, %%r9 ;" - "movq %%r9, 8(%0) ;" - "adcx %%rax, %%r10 ;" - "movq %%r10, 16(%0) ;" - "adcx %%rax, %%r11 ;" - "movq %%r11, 24(%0) ;" - "mov $38, %%ecx ;" - "cmovc %%ecx, %%eax ;" - "addq %%rax, %%r8 ;" - "movq %%r8, (%0) ;" - : - : "r"(c), "r"(a), "r"(b) - : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); + /* Transfer bit into CF flag */ + " add $18446744073709551615, %0;" + + /* cswap p1[0], p2[0] */ + " movq 0(%1), %%r8;" + " movq 0(%2), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 0(%1);" + " movq %%r9, 0(%2);" + + /* cswap p1[1], p2[1] */ + " movq 8(%1), %%r8;" + " movq 8(%2), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 8(%1);" + " movq %%r9, 8(%2);" + + /* cswap p1[2], p2[2] */ + " movq 16(%1), %%r8;" + " movq 16(%2), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 16(%1);" + " movq %%r9, 16(%2);" + + /* cswap p1[3], p2[3] */ + " movq 24(%1), %%r8;" + " movq 24(%2), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 24(%1);" + " movq %%r9, 24(%2);" + + /* cswap p1[4], p2[4] */ + " movq 32(%1), %%r8;" + " movq 32(%2), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 32(%1);" + " movq %%r9, 32(%2);" + + /* cswap p1[5], p2[5] */ + " movq 40(%1), %%r8;" + " movq 40(%2), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 40(%1);" + " movq %%r9, 40(%2);" + + /* cswap p1[6], p2[6] */ + " movq 48(%1), %%r8;" + " movq 48(%2), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 48(%1);" + " movq %%r9, 48(%2);" + + /* cswap p1[7], p2[7] */ + " movq 56(%1), %%r8;" + " movq 56(%2), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 56(%1);" + " movq %%r9, 56(%2);" + : "+&r"(bit) + : "r"(p1), "r"(p2) + : "%r8", "%r9", "%r10", "memory", "cc"); } -static __always_inline void -add_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a, const u64 *const b) +/* Computes the square of a field element: out <- f * f + * Uses the 8-element buffer tmp for intermediate results */ +static inline void fsqr(u64 *out, const u64 *f, u64 *tmp) { asm volatile( - "mov $38, %%eax ;" - "movq (%2), %%r8 ;" - "addq (%1), %%r8 ;" - "movq 8(%2), %%r9 ;" - "adcq 8(%1), %%r9 ;" - "movq 16(%2), %%r10 ;" - "adcq 16(%1), %%r10 ;" - "movq 24(%2), %%r11 ;" - "adcq 24(%1), %%r11 ;" - "mov $0, %%ecx ;" - "cmovc %%eax, %%ecx ;" - "addq %%rcx, %%r8 ;" - "adcq $0, %%r9 ;" - "movq %%r9, 8(%0) ;" - "adcq $0, %%r10 ;" - "movq %%r10, 16(%0) ;" - "adcq $0, %%r11 ;" - "movq %%r11, 24(%0) ;" - "mov $0, %%ecx ;" - "cmovc %%eax, %%ecx ;" - "addq %%rcx, %%r8 ;" - "movq %%r8, (%0) ;" - : - : "r"(c), "r"(a), "r"(b) - : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); + /* Compute the raw multiplication: tmp <- f * f */ + + /* Step 1: Compute all partial products */ + " movq 0(%0), %%rdx;" /* f[0] */ + " mulxq 8(%0), %%r8, %%r14;" + " xor %%r15d, %%r15d;" /* f[1]*f[0] */ 
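
/*
 * Illustration, not part of the patch: cswap2() above moves the swap bit
 * into the carry flag and uses cmovc so the instruction flow and memory
 * access pattern are the same whether or not the swap happens.  The same
 * effect is commonly written in portable C with an all-zeros/all-ones mask
 * (hypothetical name below); whether such C stays branch-free depends on
 * the compiler, which is one reason the patch pins it down in assembly.
 */
#include <stdint.h>

static void cswap2_mask_sketch(uint64_t bit, uint64_t *p1, uint64_t *p2)
{
	uint64_t mask = 0 - bit;	/* bit is 0 or 1 -> mask is 0 or ~0 */
	int i;

	for (i = 0; i < 8; i++) {
		uint64_t x = mask & (p1[i] ^ p2[i]);

		p1[i] ^= x;
		p2[i] ^= x;
	}
}
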
+ " mulxq 16(%0), %%r9, %%r10;" + " adcx %%r14, %%r9;" /* f[2]*f[0] */ + " mulxq 24(%0), %%rax, %%rcx;" + " adcx %%rax, %%r10;" /* f[3]*f[0] */ + " movq 24(%0), %%rdx;" /* f[3] */ + " mulxq 8(%0), %%r11, %%rbx;" + " adcx %%rcx, %%r11;" /* f[1]*f[3] */ + " mulxq 16(%0), %%rax, %%r13;" + " adcx %%rax, %%rbx;" /* f[2]*f[3] */ + " movq 8(%0), %%rdx;" + " adcx %%r15, %%r13;" /* f1 */ + " mulxq 16(%0), %%rax, %%rcx;" + " mov $0, %%r14;" /* f[2]*f[1] */ + + /* Step 2: Compute two parallel carry chains */ + " xor %%r15d, %%r15d;" + " adox %%rax, %%r10;" + " adcx %%r8, %%r8;" + " adox %%rcx, %%r11;" + " adcx %%r9, %%r9;" + " adox %%r15, %%rbx;" + " adcx %%r10, %%r10;" + " adox %%r15, %%r13;" + " adcx %%r11, %%r11;" + " adox %%r15, %%r14;" + " adcx %%rbx, %%rbx;" + " adcx %%r13, %%r13;" + " adcx %%r14, %%r14;" + + /* Step 3: Compute intermediate squares */ + " movq 0(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ + " movq %%rax, 0(%1);" + " add %%rcx, %%r8;" + " movq %%r8, 8(%1);" + " movq 8(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ + " adcx %%rax, %%r9;" + " movq %%r9, 16(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 24(%1);" + " movq 16(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ + " adcx %%rax, %%r11;" + " movq %%r11, 32(%1);" + " adcx %%rcx, %%rbx;" + " movq %%rbx, 40(%1);" + " movq 24(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ + " adcx %%rax, %%r13;" + " movq %%r13, 48(%1);" + " adcx %%rcx, %%r14;" + " movq %%r14, 56(%1);" + + /* Line up pointers */ + " mov %1, %0;" + " mov %2, %1;" + + /* Wrap the result back into the field */ + + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ + " mov $38, %%rdx;" + " mulxq 32(%0), %%r8, %%r13;" + " xor %%ecx, %%ecx;" + " adoxq 0(%0), %%r8;" + " mulxq 40(%0), %%r9, %%rbx;" + " adcx %%r13, %%r9;" + " adoxq 8(%0), %%r9;" + " mulxq 48(%0), %%r10, %%r13;" + " adcx %%rbx, %%r10;" + " adoxq 16(%0), %%r10;" + " mulxq 56(%0), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 24(%0), %%r11;" + " adcx %%rcx, %%rax;" + " adox %%rcx, %%rax;" + " imul %%rdx, %%rax;" + + /* Step 2: Fold the carry back into dst */ + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 8(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 16(%1);" + " adcx %%rcx, %%r11;" + " movq %%r11, 24(%1);" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%1);" + : "+&r"(f), "+&r"(tmp) + : "r"(out) + : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", + "%r13", "%r14", "%r15", "memory", "cc"); } -static __always_inline void -sub_eltfp25519_1w(u64 *const c, const u64 *const a, const u64 *const b) +/* Computes two field squarings: + * out[0] <- f[0] * f[0] + * out[1] <- f[1] * f[1] + * Uses the 16-element buffer tmp for intermediate results */ +static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp) { asm volatile( - "mov $38, %%eax ;" - "movq (%1), %%r8 ;" - "subq (%2), %%r8 ;" - "movq 8(%1), %%r9 ;" - "sbbq 8(%2), %%r9 ;" - "movq 16(%1), %%r10 ;" - "sbbq 16(%2), %%r10 ;" - "movq 24(%1), %%r11 ;" - "sbbq 24(%2), %%r11 ;" - "mov $0, %%ecx ;" - "cmovc %%eax, %%ecx ;" - "subq %%rcx, %%r8 ;" - "sbbq $0, %%r9 ;" - "movq %%r9, 8(%0) ;" - "sbbq $0, %%r10 ;" - "movq %%r10, 16(%0) ;" - "sbbq $0, %%r11 ;" - "movq %%r11, 24(%0) ;" - "mov $0, %%ecx ;" - "cmovc %%eax, %%ecx ;" - "subq %%rcx, %%r8 ;" - "movq %%r8, (%0) ;" - : - : "r"(c), "r"(a), "r"(b) - : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); + /* Step 
1: Compute all partial products */ + " movq 0(%0), %%rdx;" /* f[0] */ + " mulxq 8(%0), %%r8, %%r14;" + " xor %%r15d, %%r15d;" /* f[1]*f[0] */ + " mulxq 16(%0), %%r9, %%r10;" + " adcx %%r14, %%r9;" /* f[2]*f[0] */ + " mulxq 24(%0), %%rax, %%rcx;" + " adcx %%rax, %%r10;" /* f[3]*f[0] */ + " movq 24(%0), %%rdx;" /* f[3] */ + " mulxq 8(%0), %%r11, %%rbx;" + " adcx %%rcx, %%r11;" /* f[1]*f[3] */ + " mulxq 16(%0), %%rax, %%r13;" + " adcx %%rax, %%rbx;" /* f[2]*f[3] */ + " movq 8(%0), %%rdx;" + " adcx %%r15, %%r13;" /* f1 */ + " mulxq 16(%0), %%rax, %%rcx;" + " mov $0, %%r14;" /* f[2]*f[1] */ + + /* Step 2: Compute two parallel carry chains */ + " xor %%r15d, %%r15d;" + " adox %%rax, %%r10;" + " adcx %%r8, %%r8;" + " adox %%rcx, %%r11;" + " adcx %%r9, %%r9;" + " adox %%r15, %%rbx;" + " adcx %%r10, %%r10;" + " adox %%r15, %%r13;" + " adcx %%r11, %%r11;" + " adox %%r15, %%r14;" + " adcx %%rbx, %%rbx;" + " adcx %%r13, %%r13;" + " adcx %%r14, %%r14;" + + /* Step 3: Compute intermediate squares */ + " movq 0(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ + " movq %%rax, 0(%1);" + " add %%rcx, %%r8;" + " movq %%r8, 8(%1);" + " movq 8(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ + " adcx %%rax, %%r9;" + " movq %%r9, 16(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 24(%1);" + " movq 16(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ + " adcx %%rax, %%r11;" + " movq %%r11, 32(%1);" + " adcx %%rcx, %%rbx;" + " movq %%rbx, 40(%1);" + " movq 24(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ + " adcx %%rax, %%r13;" + " movq %%r13, 48(%1);" + " adcx %%rcx, %%r14;" + " movq %%r14, 56(%1);" + + /* Step 1: Compute all partial products */ + " movq 32(%0), %%rdx;" /* f[0] */ + " mulxq 40(%0), %%r8, %%r14;" + " xor %%r15d, %%r15d;" /* f[1]*f[0] */ + " mulxq 48(%0), %%r9, %%r10;" + " adcx %%r14, %%r9;" /* f[2]*f[0] */ + " mulxq 56(%0), %%rax, %%rcx;" + " adcx %%rax, %%r10;" /* f[3]*f[0] */ + " movq 56(%0), %%rdx;" /* f[3] */ + " mulxq 40(%0), %%r11, %%rbx;" + " adcx %%rcx, %%r11;" /* f[1]*f[3] */ + " mulxq 48(%0), %%rax, %%r13;" + " adcx %%rax, %%rbx;" /* f[2]*f[3] */ + " movq 40(%0), %%rdx;" + " adcx %%r15, %%r13;" /* f1 */ + " mulxq 48(%0), %%rax, %%rcx;" + " mov $0, %%r14;" /* f[2]*f[1] */ + + /* Step 2: Compute two parallel carry chains */ + " xor %%r15d, %%r15d;" + " adox %%rax, %%r10;" + " adcx %%r8, %%r8;" + " adox %%rcx, %%r11;" + " adcx %%r9, %%r9;" + " adox %%r15, %%rbx;" + " adcx %%r10, %%r10;" + " adox %%r15, %%r13;" + " adcx %%r11, %%r11;" + " adox %%r15, %%r14;" + " adcx %%rbx, %%rbx;" + " adcx %%r13, %%r13;" + " adcx %%r14, %%r14;" + + /* Step 3: Compute intermediate squares */ + " movq 32(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ + " movq %%rax, 64(%1);" + " add %%rcx, %%r8;" + " movq %%r8, 72(%1);" + " movq 40(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ + " adcx %%rax, %%r9;" + " movq %%r9, 80(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 88(%1);" + " movq 48(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ + " adcx %%rax, %%r11;" + " movq %%r11, 96(%1);" + " adcx %%rcx, %%rbx;" + " movq %%rbx, 104(%1);" + " movq 56(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ + " adcx %%rax, %%r13;" + " movq %%r13, 112(%1);" + " adcx %%rcx, %%r14;" + " movq %%r14, 120(%1);" + + /* Line up pointers */ + " mov %1, %0;" + " mov %2, %1;" + + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ + " mov $38, %%rdx;" + " mulxq 32(%0), %%r8, %%r13;" + " xor %%ecx, %%ecx;" + " adoxq 0(%0), %%r8;" + " mulxq 40(%0), %%r9, %%rbx;" + " 
adcx %%r13, %%r9;" + " adoxq 8(%0), %%r9;" + " mulxq 48(%0), %%r10, %%r13;" + " adcx %%rbx, %%r10;" + " adoxq 16(%0), %%r10;" + " mulxq 56(%0), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 24(%0), %%r11;" + " adcx %%rcx, %%rax;" + " adox %%rcx, %%rax;" + " imul %%rdx, %%rax;" + + /* Step 2: Fold the carry back into dst */ + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 8(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 16(%1);" + " adcx %%rcx, %%r11;" + " movq %%r11, 24(%1);" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%1);" + + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ + " mov $38, %%rdx;" + " mulxq 96(%0), %%r8, %%r13;" + " xor %%ecx, %%ecx;" + " adoxq 64(%0), %%r8;" + " mulxq 104(%0), %%r9, %%rbx;" + " adcx %%r13, %%r9;" + " adoxq 72(%0), %%r9;" + " mulxq 112(%0), %%r10, %%r13;" + " adcx %%rbx, %%r10;" + " adoxq 80(%0), %%r10;" + " mulxq 120(%0), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 88(%0), %%r11;" + " adcx %%rcx, %%rax;" + " adox %%rcx, %%rax;" + " imul %%rdx, %%rax;" + + /* Step 2: Fold the carry back into dst */ + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 40(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 48(%1);" + " adcx %%rcx, %%r11;" + " movq %%r11, 56(%1);" + + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 32(%1);" + : "+&r"(f), "+&r"(tmp) + : "r"(out) + : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", + "%r13", "%r14", "%r15", "memory", "cc"); } -/* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666 */ -static __always_inline void -mul_a24_eltfp25519_1w(u64 *const c, const u64 *const a) +static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2) { - const u64 a24 = 121666; - asm volatile( - "movq %2, %%rdx ;" - "mulx (%1), %%r8, %%r10 ;" - "mulx 8(%1), %%r9, %%r11 ;" - "addq %%r10, %%r9 ;" - "mulx 16(%1), %%r10, %%rax ;" - "adcq %%r11, %%r10 ;" - "mulx 24(%1), %%r11, %%rcx ;" - "adcq %%rax, %%r11 ;" - /**************************/ - "adcq $0, %%rcx ;" - "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/ - "imul %%rdx, %%rcx ;" - "addq %%rcx, %%r8 ;" - "adcq $0, %%r9 ;" - "movq %%r9, 8(%0) ;" - "adcq $0, %%r10 ;" - "movq %%r10, 16(%0) ;" - "adcq $0, %%r11 ;" - "movq %%r11, 24(%0) ;" - "mov $0, %%ecx ;" - "cmovc %%edx, %%ecx ;" - "addq %%rcx, %%r8 ;" - "movq %%r8, (%0) ;" - : - : "r"(c), "r"(a), "r"(a24) - : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", - "%r11"); + u64 *nq = p01_tmp1; + u64 *nq_p1 = p01_tmp1 + (u32)8U; + u64 *tmp1 = p01_tmp1 + (u32)16U; + u64 *x1 = q; + u64 *x2 = nq; + u64 *z2 = nq + (u32)4U; + u64 *z3 = nq_p1 + (u32)4U; + u64 *a = tmp1; + u64 *b = tmp1 + (u32)4U; + u64 *ab = tmp1; + u64 *dc = tmp1 + (u32)8U; + u64 *x3; + u64 *z31; + u64 *d0; + u64 *c0; + u64 *a1; + u64 *b1; + u64 *d; + u64 *c; + u64 *ab1; + u64 *dc1; + fadd(a, x2, z2); + fsub(b, x2, z2); + x3 = nq_p1; + z31 = nq_p1 + (u32)4U; + d0 = dc; + c0 = dc + (u32)4U; + fadd(c0, x3, z31); + fsub(d0, x3, z31); + fmul2(dc, dc, ab, tmp2); + fadd(x3, d0, c0); + fsub(z31, d0, c0); + a1 = tmp1; + b1 = tmp1 + (u32)4U; + d = tmp1 + (u32)8U; + c = tmp1 + (u32)12U; + ab1 = tmp1; + dc1 = tmp1 + (u32)8U; + fsqr2(dc1, ab1, tmp2); + fsqr2(nq_p1, nq_p1, tmp2); + a1[0U] = c[0U]; + a1[1U] = c[1U]; + a1[2U] = c[2U]; + a1[3U] = c[3U]; + fsub(c, d, c); + fmul_scalar(b1, c, (u64)121665U); + 
fadd(b1, b1, d); + fmul2(nq, dc1, ab1, tmp2); + fmul(z3, z3, x1, tmp2); } -static void inv_eltfp25519_1w_adx(u64 *const c, const u64 *const a) +static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2) { - struct { - eltfp25519_1w_buffer buffer; - eltfp25519_1w x0, x1, x2; - } __aligned(32) m; - u64 *T[4]; - - T[0] = m.x0; - T[1] = c; /* x^(-1) */ - T[2] = m.x1; - T[3] = m.x2; - - copy_eltfp25519_1w(T[1], a); - sqrn_eltfp25519_1w_adx(T[1], 1); - copy_eltfp25519_1w(T[2], T[1]); - sqrn_eltfp25519_1w_adx(T[2], 2); - mul_eltfp25519_1w_adx(T[0], a, T[2]); - mul_eltfp25519_1w_adx(T[1], T[1], T[0]); - copy_eltfp25519_1w(T[2], T[1]); - sqrn_eltfp25519_1w_adx(T[2], 1); - mul_eltfp25519_1w_adx(T[0], T[0], T[2]); - copy_eltfp25519_1w(T[2], T[0]); - sqrn_eltfp25519_1w_adx(T[2], 5); - mul_eltfp25519_1w_adx(T[0], T[0], T[2]); - copy_eltfp25519_1w(T[2], T[0]); - sqrn_eltfp25519_1w_adx(T[2], 10); - mul_eltfp25519_1w_adx(T[2], T[2], T[0]); - copy_eltfp25519_1w(T[3], T[2]); - sqrn_eltfp25519_1w_adx(T[3], 20); - mul_eltfp25519_1w_adx(T[3], T[3], T[2]); - sqrn_eltfp25519_1w_adx(T[3], 10); - mul_eltfp25519_1w_adx(T[3], T[3], T[0]); - copy_eltfp25519_1w(T[0], T[3]); - sqrn_eltfp25519_1w_adx(T[0], 50); - mul_eltfp25519_1w_adx(T[0], T[0], T[3]); - copy_eltfp25519_1w(T[2], T[0]); - sqrn_eltfp25519_1w_adx(T[2], 100); - mul_eltfp25519_1w_adx(T[2], T[2], T[0]); - sqrn_eltfp25519_1w_adx(T[2], 50); - mul_eltfp25519_1w_adx(T[2], T[2], T[3]); - sqrn_eltfp25519_1w_adx(T[2], 5); - mul_eltfp25519_1w_adx(T[1], T[1], T[2]); - - memzero_explicit(&m, sizeof(m)); + u64 *x2 = nq; + u64 *z2 = nq + (u32)4U; + u64 *a = tmp1; + u64 *b = tmp1 + (u32)4U; + u64 *d = tmp1 + (u32)8U; + u64 *c = tmp1 + (u32)12U; + u64 *ab = tmp1; + u64 *dc = tmp1 + (u32)8U; + fadd(a, x2, z2); + fsub(b, x2, z2); + fsqr2(dc, ab, tmp2); + a[0U] = c[0U]; + a[1U] = c[1U]; + a[2U] = c[2U]; + a[3U] = c[3U]; + fsub(c, d, c); + fmul_scalar(b, c, (u64)121665U); + fadd(b, b, d); + fmul2(nq, dc, ab, tmp2); } -static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a) +static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1) { - struct { - eltfp25519_1w_buffer buffer; - eltfp25519_1w x0, x1, x2; - } __aligned(32) m; - u64 *T[5]; - - T[0] = m.x0; - T[1] = c; /* x^(-1) */ - T[2] = m.x1; - T[3] = m.x2; - - copy_eltfp25519_1w(T[1], a); - sqrn_eltfp25519_1w_bmi2(T[1], 1); - copy_eltfp25519_1w(T[2], T[1]); - sqrn_eltfp25519_1w_bmi2(T[2], 2); - mul_eltfp25519_1w_bmi2(T[0], a, T[2]); - mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]); - copy_eltfp25519_1w(T[2], T[1]); - sqrn_eltfp25519_1w_bmi2(T[2], 1); - mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]); - copy_eltfp25519_1w(T[2], T[0]); - sqrn_eltfp25519_1w_bmi2(T[2], 5); - mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]); - copy_eltfp25519_1w(T[2], T[0]); - sqrn_eltfp25519_1w_bmi2(T[2], 10); - mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]); - copy_eltfp25519_1w(T[3], T[2]); - sqrn_eltfp25519_1w_bmi2(T[3], 20); - mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]); - sqrn_eltfp25519_1w_bmi2(T[3], 10); - mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]); - copy_eltfp25519_1w(T[0], T[3]); - sqrn_eltfp25519_1w_bmi2(T[0], 50); - mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]); - copy_eltfp25519_1w(T[2], T[0]); - sqrn_eltfp25519_1w_bmi2(T[2], 100); - mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]); - sqrn_eltfp25519_1w_bmi2(T[2], 50); - mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]); - sqrn_eltfp25519_1w_bmi2(T[2], 5); - mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]); - - memzero_explicit(&m, sizeof(m)); + u64 tmp2[16U] = { 0U }; + u64 p01_tmp1_swap[33U] = { 0U }; + u64 *p0 = 
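
/*
 * Reading aid, not part of the patch: in RFC 7748 notation, with
 * A = x2 + z2, B = x2 - z2, AA = A^2, BB = B^2 and E = AA - BB,
 * point_double() above computes the x-only Montgomery doubling
 *
 *	x2' = AA * BB
 *	z2' = E * (AA + a24 * E),	a24 = 121665 = (486662 - 2) / 4
 *
 * and point_add_and_double() interleaves that with the differential
 * addition against the fixed base coordinate x1, where C = x3 + z3,
 * D = x3 - z3, DA = D * A and CB = C * B:
 *
 *	x3' = (DA + CB)^2
 *	z3' = x1 * (DA - CB)^2
 */
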
p01_tmp1_swap; + u64 *p01 = p01_tmp1_swap; + u64 *p03 = p01; + u64 *p11 = p01 + (u32)8U; + u64 *x0; + u64 *z0; + u64 *p01_tmp1; + u64 *p01_tmp11; + u64 *nq10; + u64 *nq_p11; + u64 *swap1; + u64 sw0; + u64 *nq1; + u64 *tmp1; + memcpy(p11, init1, (u32)8U * sizeof(init1[0U])); + x0 = p03; + z0 = p03 + (u32)4U; + x0[0U] = (u64)1U; + x0[1U] = (u64)0U; + x0[2U] = (u64)0U; + x0[3U] = (u64)0U; + z0[0U] = (u64)0U; + z0[1U] = (u64)0U; + z0[2U] = (u64)0U; + z0[3U] = (u64)0U; + p01_tmp1 = p01_tmp1_swap; + p01_tmp11 = p01_tmp1_swap; + nq10 = p01_tmp1_swap; + nq_p11 = p01_tmp1_swap + (u32)8U; + swap1 = p01_tmp1_swap + (u32)32U; + cswap2((u64)1U, nq10, nq_p11); + point_add_and_double(init1, p01_tmp11, tmp2); + swap1[0U] = (u64)1U; + { + u32 i; + for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) { + u64 *p01_tmp12 = p01_tmp1_swap; + u64 *swap2 = p01_tmp1_swap + (u32)32U; + u64 *nq2 = p01_tmp12; + u64 *nq_p12 = p01_tmp12 + (u32)8U; + u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U); + u64 sw = swap2[0U] ^ bit; + cswap2(sw, nq2, nq_p12); + point_add_and_double(init1, p01_tmp12, tmp2); + swap2[0U] = bit; + } + } + sw0 = swap1[0U]; + cswap2(sw0, nq10, nq_p11); + nq1 = p01_tmp1; + tmp1 = p01_tmp1 + (u32)16U; + point_double(nq1, tmp1, tmp2); + point_double(nq1, tmp1, tmp2); + point_double(nq1, tmp1, tmp2); + memcpy(out, p0, (u32)8U * sizeof(p0[0U])); + + memzero_explicit(tmp2, sizeof(tmp2)); + memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap)); } -/* Given c, a 256-bit number, fred_eltfp25519_1w updates c - * with a number such that 0 <= C < 2**255-19. - */ -static __always_inline void fred_eltfp25519_1w(u64 *const c) +static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1) { - u64 tmp0 = 38, tmp1 = 19; - asm volatile( - "btrq $63, %3 ;" /* Put bit 255 in carry flag and clear */ - "cmovncl %k5, %k4 ;" /* c[255] ? 38 : 19 */ - - /* Add either 19 or 38 to c */ - "addq %4, %0 ;" - "adcq $0, %1 ;" - "adcq $0, %2 ;" - "adcq $0, %3 ;" - - /* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */ - "movl $0, %k4 ;" - "cmovnsl %k5, %k4 ;" /* c[255] ? 
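
/*
 * Illustration, not part of the patch: each ladder iteration above reads
 * scalar bit k = 253 - i as key[k / 8] >> (k % 8) & 1 and swaps the two
 * working points only when that bit differs from the previous one
 * (sw = swap ^ bit), so exactly one conditional swap is done per step.
 * Bits 255/254 and 2..0 are never read: the forced initial swap-and-step
 * and the three point_double() calls after the loop stand in for them,
 * which effectively realizes the usual clamping of those bits.  A
 * hypothetical helper making the bit indexing explicit:
 */
#include <stdint.h>

static uint64_t scalar_bit(const uint8_t *key, unsigned int k)
{
	/* k-th bit of the little-endian 32-byte scalar */
	return (key[k / 8] >> (k % 8)) & 1;
}
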
0 : 19 */ - "btrq $63, %3 ;" /* Clear bit 255 */ - - /* Subtract 19 if necessary */ - "subq %4, %0 ;" - "sbbq $0, %1 ;" - "sbbq $0, %2 ;" - "sbbq $0, %3 ;" - - : "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "+r"(tmp0), - "+r"(tmp1) - : - : "memory", "cc"); + u32 i; + fsqr(o, inp, tmp); + for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U) + fsqr(o, o, tmp); } -static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py) +static void finv(u64 *o, const u64 *i, u64 *tmp) { - u64 temp; - asm volatile( - "test %9, %9 ;" - "movq %0, %8 ;" - "cmovnzq %4, %0 ;" - "cmovnzq %8, %4 ;" - "movq %1, %8 ;" - "cmovnzq %5, %1 ;" - "cmovnzq %8, %5 ;" - "movq %2, %8 ;" - "cmovnzq %6, %2 ;" - "cmovnzq %8, %6 ;" - "movq %3, %8 ;" - "cmovnzq %7, %3 ;" - "cmovnzq %8, %7 ;" - : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]), - "+r"(py[0]), "+r"(py[1]), "+r"(py[2]), "+r"(py[3]), - "=r"(temp) - : "r"(bit) - : "cc" - ); + u64 t1[16U] = { 0U }; + u64 *a0 = t1; + u64 *b = t1 + (u32)4U; + u64 *c = t1 + (u32)8U; + u64 *t00 = t1 + (u32)12U; + u64 *tmp1 = tmp; + u64 *a; + u64 *t0; + fsquare_times(a0, i, tmp1, (u32)1U); + fsquare_times(t00, a0, tmp1, (u32)2U); + fmul(b, t00, i, tmp); + fmul(a0, b, a0, tmp); + fsquare_times(t00, a0, tmp1, (u32)1U); + fmul(b, t00, b, tmp); + fsquare_times(t00, b, tmp1, (u32)5U); + fmul(b, t00, b, tmp); + fsquare_times(t00, b, tmp1, (u32)10U); + fmul(c, t00, b, tmp); + fsquare_times(t00, c, tmp1, (u32)20U); + fmul(t00, t00, c, tmp); + fsquare_times(t00, t00, tmp1, (u32)10U); + fmul(b, t00, b, tmp); + fsquare_times(t00, b, tmp1, (u32)50U); + fmul(c, t00, b, tmp); + fsquare_times(t00, c, tmp1, (u32)100U); + fmul(t00, t00, c, tmp); + fsquare_times(t00, t00, tmp1, (u32)50U); + fmul(t00, t00, b, tmp); + fsquare_times(t00, t00, tmp1, (u32)5U); + a = t1; + t0 = t1 + (u32)12U; + fmul(o, t0, a, tmp); } -static __always_inline void cselect(u8 bit, u64 *const px, const u64 *const py) +static void store_felem(u64 *b, u64 *f) { - asm volatile( - "test %4, %4 ;" - "cmovnzq %5, %0 ;" - "cmovnzq %6, %1 ;" - "cmovnzq %7, %2 ;" - "cmovnzq %8, %3 ;" - : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]) - : "r"(bit), "rm"(py[0]), "rm"(py[1]), "rm"(py[2]), "rm"(py[3]) - : "cc" - ); + u64 f30 = f[3U]; + u64 top_bit0 = f30 >> (u32)63U; + u64 f31; + u64 top_bit; + u64 f0; + u64 f1; + u64 f2; + u64 f3; + u64 m0; + u64 m1; + u64 m2; + u64 m3; + u64 mask; + u64 f0_; + u64 f1_; + u64 f2_; + u64 f3_; + u64 o0; + u64 o1; + u64 o2; + u64 o3; + f[3U] = f30 & (u64)0x7fffffffffffffffU; + add_scalar(f, f, (u64)19U * top_bit0); + f31 = f[3U]; + top_bit = f31 >> (u32)63U; + f[3U] = f31 & (u64)0x7fffffffffffffffU; + add_scalar(f, f, (u64)19U * top_bit); + f0 = f[0U]; + f1 = f[1U]; + f2 = f[2U]; + f3 = f[3U]; + m0 = gte_mask(f0, (u64)0xffffffffffffffedU); + m1 = eq_mask(f1, (u64)0xffffffffffffffffU); + m2 = eq_mask(f2, (u64)0xffffffffffffffffU); + m3 = eq_mask(f3, (u64)0x7fffffffffffffffU); + mask = ((m0 & m1) & m2) & m3; + f0_ = f0 - (mask & (u64)0xffffffffffffffedU); + f1_ = f1 - (mask & (u64)0xffffffffffffffffU); + f2_ = f2 - (mask & (u64)0xffffffffffffffffU); + f3_ = f3 - (mask & (u64)0x7fffffffffffffffU); + o0 = f0_; + o1 = f1_; + o2 = f2_; + o3 = f3_; + b[0U] = o0; + b[1U] = o1; + b[2U] = o2; + b[3U] = o3; } -static void curve25519_adx(u8 shared[CURVE25519_KEY_SIZE], - const u8 private_key[CURVE25519_KEY_SIZE], - const u8 session_key[CURVE25519_KEY_SIZE]) +static void encode_point(u8 *o, const u64 *i) { - struct { - u64 buffer[4 * NUM_WORDS_ELTFP25519]; - u64 coordinates[4 * NUM_WORDS_ELTFP25519]; - u64 
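
/*
 * Illustration, not part of the patch: finv() above computes 1/z as
 * z^(p - 2) = z^(2^255 - 21) with p = 2^255 - 19 (Fermat's little theorem),
 * using a fixed chain of 254 squarings (the fsquare_times() calls) and 11
 * multiplications.  The same idea on a toy scale, hypothetical name; valid
 * for prime moduli m < 2^32 so the products below fit in 64 bits.
 */
#include <stdint.h>

static uint64_t toy_fermat_inverse(uint64_t a, uint64_t m)
{
	uint64_t e = m - 2;	/* a^(m-2) == 1/a (mod m) for prime m */
	uint64_t r = 1;

	a %= m;
	while (e) {
		if (e & 1)
			r = (r * a) % m;	/* multiply step */
		a = (a * a) % m;		/* squaring step */
		e >>= 1;
	}
	return r;
}
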
workspace[6 * NUM_WORDS_ELTFP25519]; - u8 session[CURVE25519_KEY_SIZE]; - u8 private[CURVE25519_KEY_SIZE]; - } __aligned(32) m; - - int i = 0, j = 0; - u64 prev = 0; - u64 *const X1 = (u64 *)m.session; - u64 *const key = (u64 *)m.private; - u64 *const Px = m.coordinates + 0; - u64 *const Pz = m.coordinates + 4; - u64 *const Qx = m.coordinates + 8; - u64 *const Qz = m.coordinates + 12; - u64 *const X2 = Qx; - u64 *const Z2 = Qz; - u64 *const X3 = Px; - u64 *const Z3 = Pz; - u64 *const X2Z2 = Qx; - u64 *const X3Z3 = Px; - - u64 *const A = m.workspace + 0; - u64 *const B = m.workspace + 4; - u64 *const D = m.workspace + 8; - u64 *const C = m.workspace + 12; - u64 *const DA = m.workspace + 16; - u64 *const CB = m.workspace + 20; - u64 *const AB = A; - u64 *const DC = D; - u64 *const DACB = DA; - - memcpy(m.private, private_key, sizeof(m.private)); - memcpy(m.session, session_key, sizeof(m.session)); - - curve25519_clamp_secret(m.private); - - /* As in the draft: - * When receiving such an array, implementations of curve25519 - * MUST mask the most-significant bit in the final byte. This - * is done to preserve compatibility with point formats which - * reserve the sign bit for use in other protocols and to - * increase resistance to implementation fingerprinting - */ - m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1; - - copy_eltfp25519_1w(Px, X1); - setzero_eltfp25519_1w(Pz); - setzero_eltfp25519_1w(Qx); - setzero_eltfp25519_1w(Qz); - - Pz[0] = 1; - Qx[0] = 1; - - /* main-loop */ - prev = 0; - j = 62; - for (i = 3; i >= 0; --i) { - while (j >= 0) { - u64 bit = (key[i] >> j) & 0x1; - u64 swap = bit ^ prev; - prev = bit; - - add_eltfp25519_1w_adx(A, X2, Z2); /* A = (X2+Z2) */ - sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */ - add_eltfp25519_1w_adx(C, X3, Z3); /* C = (X3+Z3) */ - sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */ - mul_eltfp25519_2w_adx(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */ - - cselect(swap, A, C); - cselect(swap, B, D); - - sqr_eltfp25519_2w_adx(AB); /* [AA|BB] = [A^2|B^2] */ - add_eltfp25519_1w_adx(X3, DA, CB); /* X3 = (DA+CB) */ - sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */ - sqr_eltfp25519_2w_adx(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */ - - copy_eltfp25519_1w(X2, B); /* X2 = B^2 */ - sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */ - - mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */ - add_eltfp25519_1w_adx(B, B, X2); /* B = a24*E+B */ - mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */ - mul_eltfp25519_1w_adx(Z3, Z3, X1); /* Z3 = Z3*X1 */ - --j; - } - j = 63; - } - - inv_eltfp25519_1w_adx(A, Qz); - mul_eltfp25519_1w_adx((u64 *)shared, Qx, A); - fred_eltfp25519_1w((u64 *)shared); - - memzero_explicit(&m, sizeof(m)); + const u64 *x = i; + const u64 *z = i + (u32)4U; + u64 tmp[4U] = { 0U }; + u64 tmp_w[16U] = { 0U }; + finv(tmp, z, tmp_w); + fmul(tmp, tmp, x, tmp_w); + store_felem((u64 *)o, tmp); } -static void curve25519_adx_base(u8 session_key[CURVE25519_KEY_SIZE], - const u8 private_key[CURVE25519_KEY_SIZE]) +static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub) { - struct { - u64 buffer[4 * NUM_WORDS_ELTFP25519]; - u64 coordinates[4 * NUM_WORDS_ELTFP25519]; - u64 workspace[4 * NUM_WORDS_ELTFP25519]; - u8 private[CURVE25519_KEY_SIZE]; - } __aligned(32) m; - - const int ite[4] = { 64, 64, 64, 63 }; - const int q = 3; - u64 swap = 1; - - int i = 0, j = 0, k = 0; - u64 *const key = (u64 *)m.private; - u64 *const Ur1 = m.coordinates + 0; - u64 *const Zr1 = m.coordinates + 4; - u64 *const Ur2 = m.coordinates + 8; - u64 
*const Zr2 = m.coordinates + 12; - - u64 *const UZr1 = m.coordinates + 0; - u64 *const ZUr2 = m.coordinates + 8; - - u64 *const A = m.workspace + 0; - u64 *const B = m.workspace + 4; - u64 *const C = m.workspace + 8; - u64 *const D = m.workspace + 12; - - u64 *const AB = m.workspace + 0; - u64 *const CD = m.workspace + 8; - - const u64 *const P = table_ladder_8k; - - memcpy(m.private, private_key, sizeof(m.private)); - - curve25519_clamp_secret(m.private); - - setzero_eltfp25519_1w(Ur1); - setzero_eltfp25519_1w(Zr1); - setzero_eltfp25519_1w(Zr2); - Ur1[0] = 1; - Zr1[0] = 1; - Zr2[0] = 1; - - /* G-S */ - Ur2[3] = 0x1eaecdeee27cab34UL; - Ur2[2] = 0xadc7a0b9235d48e2UL; - Ur2[1] = 0xbbf095ae14b2edf8UL; - Ur2[0] = 0x7e94e1fec82faabdUL; - - /* main-loop */ - j = q; - for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) { - while (j < ite[i]) { - u64 bit = (key[i] >> j) & 0x1; - k = (64 * i + j - q); - swap = swap ^ bit; - cswap(swap, Ur1, Ur2); - cswap(swap, Zr1, Zr2); - swap = bit; - /* Addition */ - sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ - add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */ - mul_eltfp25519_1w_adx(C, &P[4 * k], B); /* C = M0-B */ - sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */ - add_eltfp25519_1w_adx(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */ - sqr_eltfp25519_2w_adx(AB); /* A = A^2 | B = B^2 */ - mul_eltfp25519_2w_adx(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */ - ++j; + u64 init1[8U] = { 0U }; + u64 tmp[4U] = { 0U }; + u64 tmp3; + u64 *x; + u64 *z; + { + u32 i; + for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) { + u64 *os = tmp; + const u8 *bj = pub + i * (u32)8U; + u64 u = *(u64 *)bj; + u64 r = u; + u64 x0 = r; + os[i] = x0; } - j = 0; } - - /* Doubling */ - for (i = 0; i < q; ++i) { - add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */ - sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ - sqr_eltfp25519_2w_adx(AB); /* A = A**2 B = B**2 */ - copy_eltfp25519_1w(C, B); /* C = B */ - sub_eltfp25519_1w(B, A, B); /* B = A-B */ - mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */ - add_eltfp25519_1w_adx(D, D, C); /* D = D+C */ - mul_eltfp25519_2w_adx(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */ - } - - /* Convert to affine coordinates */ - inv_eltfp25519_1w_adx(A, Zr1); - mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A); - fred_eltfp25519_1w((u64 *)session_key); - - memzero_explicit(&m, sizeof(m)); + tmp3 = tmp[3U]; + tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU; + x = init1; + z = init1 + (u32)4U; + z[0U] = (u64)1U; + z[1U] = (u64)0U; + z[2U] = (u64)0U; + z[3U] = (u64)0U; + x[0U] = tmp[0U]; + x[1U] = tmp[1U]; + x[2U] = tmp[2U]; + x[3U] = tmp[3U]; + montgomery_ladder(init1, priv, init1); + encode_point(out, init1); } -static void curve25519_bmi2(u8 shared[CURVE25519_KEY_SIZE], - const u8 private_key[CURVE25519_KEY_SIZE], - const u8 session_key[CURVE25519_KEY_SIZE]) -{ - struct { - u64 buffer[4 * NUM_WORDS_ELTFP25519]; - u64 coordinates[4 * NUM_WORDS_ELTFP25519]; - u64 workspace[6 * NUM_WORDS_ELTFP25519]; - u8 session[CURVE25519_KEY_SIZE]; - u8 private[CURVE25519_KEY_SIZE]; - } __aligned(32) m; - - int i = 0, j = 0; - u64 prev = 0; - u64 *const X1 = (u64 *)m.session; - u64 *const key = (u64 *)m.private; - u64 *const Px = m.coordinates + 0; - u64 *const Pz = m.coordinates + 4; - u64 *const Qx = m.coordinates + 8; - u64 *const Qz = m.coordinates + 12; - u64 *const X2 = Qx; - u64 *const Z2 = Qz; - u64 *const X3 = Px; - u64 *const Z3 = Pz; - u64 *const X2Z2 = Qx; - u64 *const X3Z3 = Px; - - u64 *const A = m.workspace + 0; - u64 *const B = m.workspace + 4; - u64 
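
/*
 * Illustration, not part of the patch: curve25519_ever64() above loads the
 * public value with u64 u = *(u64 *)bj, which is fine on x86 (unaligned
 * little-endian loads), and then clears bit 255 of the u-coordinate as
 * RFC 7748 requires.  A portable equivalent of that load, hypothetical name:
 */
#include <stdint.h>

static uint64_t load_le64_sketch(const uint8_t *p)
{
	uint64_t r = 0;
	int i;

	for (i = 7; i >= 0; i--)
		r = (r << 8) | p[i];
	return r;
}
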
*const D = m.workspace + 8; - u64 *const C = m.workspace + 12; - u64 *const DA = m.workspace + 16; - u64 *const CB = m.workspace + 20; - u64 *const AB = A; - u64 *const DC = D; - u64 *const DACB = DA; - - memcpy(m.private, private_key, sizeof(m.private)); - memcpy(m.session, session_key, sizeof(m.session)); - - curve25519_clamp_secret(m.private); - - /* As in the draft: - * When receiving such an array, implementations of curve25519 - * MUST mask the most-significant bit in the final byte. This - * is done to preserve compatibility with point formats which - * reserve the sign bit for use in other protocols and to - * increase resistance to implementation fingerprinting - */ - m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1; - - copy_eltfp25519_1w(Px, X1); - setzero_eltfp25519_1w(Pz); - setzero_eltfp25519_1w(Qx); - setzero_eltfp25519_1w(Qz); - - Pz[0] = 1; - Qx[0] = 1; - - /* main-loop */ - prev = 0; - j = 62; - for (i = 3; i >= 0; --i) { - while (j >= 0) { - u64 bit = (key[i] >> j) & 0x1; - u64 swap = bit ^ prev; - prev = bit; - - add_eltfp25519_1w_bmi2(A, X2, Z2); /* A = (X2+Z2) */ - sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */ - add_eltfp25519_1w_bmi2(C, X3, Z3); /* C = (X3+Z3) */ - sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */ - mul_eltfp25519_2w_bmi2(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */ - - cselect(swap, A, C); - cselect(swap, B, D); - - sqr_eltfp25519_2w_bmi2(AB); /* [AA|BB] = [A^2|B^2] */ - add_eltfp25519_1w_bmi2(X3, DA, CB); /* X3 = (DA+CB) */ - sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */ - sqr_eltfp25519_2w_bmi2(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */ - - copy_eltfp25519_1w(X2, B); /* X2 = B^2 */ - sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */ - - mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */ - add_eltfp25519_1w_bmi2(B, B, X2); /* B = a24*E+B */ - mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */ - mul_eltfp25519_1w_bmi2(Z3, Z3, X1); /* Z3 = Z3*X1 */ - --j; - } - j = 63; - } - - inv_eltfp25519_1w_bmi2(A, Qz); - mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A); - fred_eltfp25519_1w((u64 *)shared); +/* The below constants were generated using this sage script: + * + * #!/usr/bin/env sage + * import sys + * from sage.all import * + * def limbs(n): + * n = int(n) + * l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64) + * return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l + * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0]) + * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0] + * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s)) + * print("static const u64 table_ladder[] = {") + * p = ec.lift_x(9) + * for i in range(252): + * l = (p[0] + p[2]) / (p[0] - p[2]) + * print(("\t%s" + ("," if i != 251 else "")) % limbs(l)) + * p = p * 2 + * print("};") + * + */ - memzero_explicit(&m, sizeof(m)); -} +static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL }; + +static const u64 table_ladder[] = { + 0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL, + 0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL, + 0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL, + 0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL, + 0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL, + 0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 
0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL, + 0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL, + 0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL, + 0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL, + 0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL, + 0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL, + 0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL, + 0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL, + 0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL, + 0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL, + 0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL, + 0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL, + 0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL, + 0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL, + 0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL, + 0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL, + 0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL, + 0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL, + 0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL, + 0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL, + 0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL, + 0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL, + 0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL, + 0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL, + 0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL, + 0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL, + 0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL, + 0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL, + 0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL, + 0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL, + 0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL, + 0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL, + 0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL, + 0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL, + 0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL, + 0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL, + 0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL, + 0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL, + 0x7a8efb58896928f4ULL, 
0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL, + 0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL, + 0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL, + 0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL, + 0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL, + 0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL, + 0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL, + 0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL, + 0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL, + 0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL, + 0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL, + 0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL, + 0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL, + 0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL, + 0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL, + 0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL, + 0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL, + 0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL, + 0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL, + 0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL, + 0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL, + 0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL, + 0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL, + 0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL, + 0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL, + 0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL, + 0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL, + 0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL, + 0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL, + 0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL, + 0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL, + 0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL, + 0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL, + 0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL, + 0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL, + 0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL, + 0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL, + 0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL, + 
0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL, + 0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL, + 0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL, + 0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL, + 0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL, + 0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL, + 0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL, + 0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL, + 0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL, + 0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL, + 0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL, + 0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL, + 0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL, + 0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL, + 0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL, + 0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL, + 0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL, + 0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL, + 0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL, + 0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL, + 0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL, + 0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL, + 0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL, + 0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL, + 0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL, + 0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL, + 0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL, + 0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL, + 0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL, + 0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL, + 0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL, + 0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL, + 0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL, + 0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL, + 0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL, + 0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL, + 0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL, + 0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 
0x25e496c722422236ULL, + 0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL, + 0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL, + 0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL, + 0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL, + 0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL, + 0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL, + 0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL, + 0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL, + 0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL, + 0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL, + 0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL, + 0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL, + 0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL, + 0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL, + 0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL, + 0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL, + 0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL, + 0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL, + 0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL, + 0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL, + 0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL, + 0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL, + 0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL, + 0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL, + 0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL, + 0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL, + 0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL, + 0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL, + 0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL, + 0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL, + 0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL, + 0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL, + 0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL, + 0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL, + 0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL, + 0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL, + 0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL, + 0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 
0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL, + 0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL, + 0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL, + 0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL, + 0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL, + 0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL, + 0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL, + 0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL, + 0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL, + 0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL, + 0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL, + 0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL, + 0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL, + 0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL, + 0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL, + 0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL, + 0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL, + 0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL, + 0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL, + 0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL, + 0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL, + 0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL, + 0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL, + 0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL, + 0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL, + 0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL, + 0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL, + 0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL, + 0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL, + 0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL, + 0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL, + 0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL, + 0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL, + 0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL, + 0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL, + 0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL, + 0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL, + 0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL, + 0x09b9515a1e7b4d7cULL, 
0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL, + 0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL, + 0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL, + 0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL, + 0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL, + 0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL, + 0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL, + 0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL, + 0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL, + 0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL, + 0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL, + 0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL, + 0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL, + 0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL, + 0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL, + 0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL, + 0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL, + 0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL, + 0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL, + 0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL, + 0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL, + 0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL, + 0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL, + 0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL, + 0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL, + 0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL, + 0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL, + 0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL, + 0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL, + 0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL, + 0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL, + 0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL, + 0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL, + 0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL, + 0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL, + 0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL, + 0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL, + 0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL, + 
0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL, + 0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL, + 0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL, + 0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL, + 0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL, + 0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL, + 0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL, + 0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL, + 0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL, + 0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL, + 0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL, + 0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL, + 0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL, + 0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL, + 0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL, + 0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL, + 0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL, + 0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL, + 0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL, + 0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL +}; -static void curve25519_bmi2_base(u8 session_key[CURVE25519_KEY_SIZE], - const u8 private_key[CURVE25519_KEY_SIZE]) +static void curve25519_ever64_base(u8 *out, const u8 *priv) { - struct { - u64 buffer[4 * NUM_WORDS_ELTFP25519]; - u64 coordinates[4 * NUM_WORDS_ELTFP25519]; - u64 workspace[4 * NUM_WORDS_ELTFP25519]; - u8 private[CURVE25519_KEY_SIZE]; - } __aligned(32) m; - - const int ite[4] = { 64, 64, 64, 63 }; - const int q = 3; u64 swap = 1; - - int i = 0, j = 0, k = 0; - u64 *const key = (u64 *)m.private; - u64 *const Ur1 = m.coordinates + 0; - u64 *const Zr1 = m.coordinates + 4; - u64 *const Ur2 = m.coordinates + 8; - u64 *const Zr2 = m.coordinates + 12; - - u64 *const UZr1 = m.coordinates + 0; - u64 *const ZUr2 = m.coordinates + 8; - - u64 *const A = m.workspace + 0; - u64 *const B = m.workspace + 4; - u64 *const C = m.workspace + 8; - u64 *const D = m.workspace + 12; - - u64 *const AB = m.workspace + 0; - u64 *const CD = m.workspace + 8; - - const u64 *const P = table_ladder_8k; - - memcpy(m.private, private_key, sizeof(m.private)); - - curve25519_clamp_secret(m.private); - - setzero_eltfp25519_1w(Ur1); - setzero_eltfp25519_1w(Zr1); - setzero_eltfp25519_1w(Zr2); - Ur1[0] = 1; - Zr1[0] = 1; - Zr2[0] = 1; - - /* G-S */ - Ur2[3] = 0x1eaecdeee27cab34UL; - Ur2[2] = 0xadc7a0b9235d48e2UL; - Ur2[1] = 0xbbf095ae14b2edf8UL; - Ur2[0] = 0x7e94e1fec82faabdUL; - - /* main-loop */ - j = q; - for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) { - while (j < ite[i]) { - u64 bit = (key[i] >> j) & 0x1; - k = (64 * i + j - q); + int i, j, k; + u64 tmp[16 + 32 + 4]; + u64 *x1 = &tmp[0]; + u64 *z1 = &tmp[4]; + u64 *x2 = &tmp[8]; + u64 *z2 = &tmp[12]; + u64 *xz1 = 
&tmp[0]; + u64 *xz2 = &tmp[8]; + u64 *a = &tmp[0 + 16]; + u64 *b = &tmp[4 + 16]; + u64 *c = &tmp[8 + 16]; + u64 *ab = &tmp[0 + 16]; + u64 *abcd = &tmp[0 + 16]; + u64 *ef = &tmp[16 + 16]; + u64 *efgh = &tmp[16 + 16]; + u64 *key = &tmp[0 + 16 + 32]; + + memcpy(key, priv, 32); + ((u8 *)key)[0] &= 248; + ((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64; + + x1[0] = 1, x1[1] = x1[2] = x1[3] = 0; + z1[0] = 1, z1[1] = z1[2] = z1[3] = 0; + z2[0] = 1, z2[1] = z2[2] = z2[3] = 0; + memcpy(x2, p_minus_s, sizeof(p_minus_s)); + + j = 3; + for (i = 0; i < 4; ++i) { + while (j < (const int[]){ 64, 64, 64, 63 }[i]) { + u64 bit = (key[i] >> j) & 1; + k = (64 * i + j - 3); swap = swap ^ bit; - cswap(swap, Ur1, Ur2); - cswap(swap, Zr1, Zr2); + cswap2(swap, xz1, xz2); swap = bit; - /* Addition */ - sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ - add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */ - mul_eltfp25519_1w_bmi2(C, &P[4 * k], B);/* C = M0-B */ - sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */ - add_eltfp25519_1w_bmi2(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */ - sqr_eltfp25519_2w_bmi2(AB); /* A = A^2 | B = B^2 */ - mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */ + fsub(b, x1, z1); + fadd(a, x1, z1); + fmul(c, &table_ladder[4 * k], b, ef); + fsub(b, a, c); + fadd(a, a, c); + fsqr2(ab, ab, efgh); + fmul2(xz1, xz2, ab, efgh); ++j; } j = 0; } - /* Doubling */ - for (i = 0; i < q; ++i) { - add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */ - sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ - sqr_eltfp25519_2w_bmi2(AB); /* A = A**2 B = B**2 */ - copy_eltfp25519_1w(C, B); /* C = B */ - sub_eltfp25519_1w(B, A, B); /* B = A-B */ - mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */ - add_eltfp25519_1w_bmi2(D, D, C); /* D = D+C */ - mul_eltfp25519_2w_bmi2(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */ - } - - /* Convert to affine coordinates */ - inv_eltfp25519_1w_bmi2(A, Zr1); - mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A); - fred_eltfp25519_1w((u64 *)session_key); + point_double(xz1, abcd, efgh); + point_double(xz1, abcd, efgh); + point_double(xz1, abcd, efgh); + encode_point(out, xz1); - memzero_explicit(&m, sizeof(m)); + memzero_explicit(tmp, sizeof(tmp)); } +static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx); + void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE], const u8 basepoint[CURVE25519_KEY_SIZE]) { - if (static_branch_likely(&curve25519_use_adx)) - curve25519_adx(mypublic, secret, basepoint); - else if (static_branch_likely(&curve25519_use_bmi2)) - curve25519_bmi2(mypublic, secret, basepoint); + if (static_branch_likely(&curve25519_use_bmi2_adx)) + curve25519_ever64(mypublic, secret, basepoint); else curve25519_generic(mypublic, secret, basepoint); } @@ -2355,10 +1605,8 @@ EXPORT_SYMBOL(curve25519_arch); void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE]) { - if (static_branch_likely(&curve25519_use_adx)) - curve25519_adx_base(pub, secret); - else if (static_branch_likely(&curve25519_use_bmi2)) - curve25519_bmi2_base(pub, secret); + if (static_branch_likely(&curve25519_use_bmi2_adx)) + curve25519_ever64_base(pub, secret); else curve25519_generic(pub, secret, curve25519_base_point); } @@ -2449,12 +1697,11 @@ static struct kpp_alg curve25519_alg = { .max_size = curve25519_max_size, }; + static int __init curve25519_mod_init(void) { - if (boot_cpu_has(X86_FEATURE_BMI2)) - static_branch_enable(&curve25519_use_bmi2); - else if (boot_cpu_has(X86_FEATURE_ADX)) 
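/*
 * For reference, a portable sketch of two scalar-handling details the
 * curve25519_ever64_base() ladder above relies on: RFC 7748 clamping of the
 * private key (the two byte masks applied to `key`), and a branch-free
 * conditional swap equivalent in spirit to cswap2() on the (x, z) coordinate
 * pairs.  The helper names below are illustrative only, not kernel symbols.
 */
#include <stdint.h>

static void clamp_scalar(uint8_t k[32])
{
	k[0] &= 248;			/* clear the low 3 bits */
	k[31] = (k[31] & 127) | 64;	/* clear bit 255, set bit 254 */
}

/* Swap a[] and b[] iff bit == 1, without a data-dependent branch. */
static void cond_swap(uint64_t bit, uint64_t a[4], uint64_t b[4])
{
	uint64_t mask = 0 - bit;	/* all-ones when bit == 1, else 0 */
	int i;

	for (i = 0; i < 4; i++) {
		uint64_t t = mask & (a[i] ^ b[i]);

		a[i] ^= t;
		b[i] ^= t;
	}
}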
- static_branch_enable(&curve25519_use_adx); + if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX)) + static_branch_enable(&curve25519_use_bmi2_adx); else return 0; return IS_REACHABLE(CONFIG_CRYPTO_KPP) ? @@ -2464,7 +1711,7 @@ static int __init curve25519_mod_init(void) static void __exit curve25519_mod_exit(void) { if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && - (boot_cpu_has(X86_FEATURE_BMI2) || boot_cpu_has(X86_FEATURE_ADX))) + static_branch_likely(&curve25519_use_bmi2_adx)) crypto_unregister_kpp(&curve25519_alg); } @@ -2474,3 +1721,4 @@ module_exit(curve25519_mod_exit); MODULE_ALIAS_CRYPTO("curve25519"); MODULE_ALIAS_CRYPTO("curve25519-x86"); MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S index fac0fdc3f25d..f4c760f4cade 100644 --- a/arch/x86/crypto/des3_ede-asm_64.S +++ b/arch/x86/crypto/des3_ede-asm_64.S @@ -243,7 +243,7 @@ SYM_FUNC_START(des3_ede_x86_64_crypt_blk) popq %r12; popq %rbx; - ret; + RET; SYM_FUNC_END(des3_ede_x86_64_crypt_blk) /*********************************************************************** @@ -528,7 +528,7 @@ SYM_FUNC_START(des3_ede_x86_64_crypt_blk_3way) popq %r12; popq %rbx; - ret; + RET; SYM_FUNC_END(des3_ede_x86_64_crypt_blk_3way) .section .rodata, "a", @progbits diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index 89830e531350..abb8b1fe123b 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -6,8 +6,6 @@ * * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> */ #include <crypto/algapi.h> @@ -47,14 +45,6 @@ static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst, des3_ede_x86_64_crypt_blk(dec_ctx, dst, src); } -static inline void des3_ede_enc_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst, - const u8 *src) -{ - u32 *enc_ctx = ctx->enc.expkey; - - des3_ede_x86_64_crypt_blk_3way(enc_ctx, dst, src); -} - static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst, const u8 *src) { @@ -166,7 +156,7 @@ static int cbc_encrypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk.nbytes)) { + while (walk.nbytes) { nbytes = __cbc_encrypt(ctx, &walk); err = skcipher_walk_done(&walk, nbytes); } @@ -245,7 +235,7 @@ static int cbc_decrypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk.nbytes)) { + while (walk.nbytes) { nbytes = __cbc_decrypt(ctx, &walk); err = skcipher_walk_done(&walk, nbytes); } @@ -253,94 +243,6 @@ static int cbc_decrypt(struct skcipher_request *req) return err; } -static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx, - struct skcipher_walk *walk) -{ - u8 *ctrblk = walk->iv; - u8 keystream[DES3_EDE_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - des3_ede_enc_blk(ctx, keystream, ctrblk); - crypto_xor_cpy(dst, keystream, src, nbytes); - - crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE); -} - -static unsigned int __ctr_crypt(struct des3_ede_x86_ctx *ctx, - struct skcipher_walk *walk) -{ - unsigned int bsize = DES3_EDE_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - __be64 *src = (__be64 *)walk->src.virt.addr; - __be64 *dst = (__be64 *)walk->dst.virt.addr; - u64 ctrblk = 
be64_to_cpu(*(__be64 *)walk->iv); - __be64 ctrblocks[3]; - - /* Process four block batch */ - if (nbytes >= bsize * 3) { - do { - /* create ctrblks for parallel encrypt */ - ctrblocks[0] = cpu_to_be64(ctrblk++); - ctrblocks[1] = cpu_to_be64(ctrblk++); - ctrblocks[2] = cpu_to_be64(ctrblk++); - - des3_ede_enc_blk_3way(ctx, (u8 *)ctrblocks, - (u8 *)ctrblocks); - - dst[0] = src[0] ^ ctrblocks[0]; - dst[1] = src[1] ^ ctrblocks[1]; - dst[2] = src[2] ^ ctrblocks[2]; - - src += 3; - dst += 3; - } while ((nbytes -= bsize * 3) >= bsize * 3); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - ctrblocks[0] = cpu_to_be64(ctrblk++); - - des3_ede_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); - - dst[0] = src[0] ^ ctrblocks[0]; - - src += 1; - dst += 1; - } while ((nbytes -= bsize) >= bsize); - -done: - *(__be64 *)walk->iv = cpu_to_be64(ctrblk); - return nbytes; -} - -static int ctr_crypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes) >= DES3_EDE_BLOCK_SIZE) { - nbytes = __ctr_crypt(ctx, &walk); - err = skcipher_walk_done(&walk, nbytes); - } - - if (nbytes) { - ctr_crypt_final(ctx, &walk); - err = skcipher_walk_done(&walk, 0); - } - - return err; -} - static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { @@ -428,20 +330,6 @@ static struct skcipher_alg des3_ede_skciphers[] = { .setkey = des3_ede_x86_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "ctr(des3_ede)", - .base.cra_driver_name = "ctr-des3_ede-asm", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = DES3_EDE_KEY_SIZE, - .max_keysize = DES3_EDE_KEY_SIZE, - .ivsize = DES3_EDE_BLOCK_SIZE, - .chunksize = DES3_EDE_BLOCK_SIZE, - .setkey = des3_ede_x86_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, } }; diff --git a/arch/x86/crypto/ecb_cbc_helpers.h b/arch/x86/crypto/ecb_cbc_helpers.h new file mode 100644 index 000000000000..eaa15c7b29d6 --- /dev/null +++ b/arch/x86/crypto/ecb_cbc_helpers.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _CRYPTO_ECB_CBC_HELPER_H +#define _CRYPTO_ECB_CBC_HELPER_H + +#include <crypto/internal/skcipher.h> +#include <asm/fpu/api.h> + +/* + * Mode helpers to instantiate parameterized skcipher ECB/CBC modes without + * having to rely on indirect calls and retpolines. 
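/*
 * Rough usage sketch for the ECB/CBC walk helpers defined just below in
 * ecb_cbc_helpers.h: a cipher's glue file expands the macros around its own
 * assembly routines, listing the widest batch first and the single-block
 * fallback last.  The mycipher_* names and block counts are placeholders,
 * not real kernel symbols.
 */
static int ecb_encrypt(struct skcipher_request *req)
{
	ECB_WALK_START(req, MYCIPHER_BLOCK_SIZE, MYCIPHER_PARALLEL_BLOCKS);
	ECB_BLOCK(MYCIPHER_PARALLEL_BLOCKS, mycipher_ecb_enc_16way);
	ECB_BLOCK(1, mycipher_enc_blk);
	ECB_WALK_END();
}

static int cbc_decrypt(struct skcipher_request *req)
{
	CBC_WALK_START(req, MYCIPHER_BLOCK_SIZE, MYCIPHER_PARALLEL_BLOCKS);
	CBC_DEC_BLOCK(MYCIPHER_PARALLEL_BLOCKS, mycipher_cbc_dec_16way);
	CBC_DEC_BLOCK(1, mycipher_dec_blk);
	CBC_WALK_END();
}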
+ */ + +#define ECB_WALK_START(req, bsize, fpu_blocks) do { \ + void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); \ + const int __bsize = (bsize); \ + struct skcipher_walk walk; \ + int err = skcipher_walk_virt(&walk, (req), false); \ + while (walk.nbytes > 0) { \ + unsigned int nbytes = walk.nbytes; \ + bool do_fpu = (fpu_blocks) != -1 && \ + nbytes >= (fpu_blocks) * __bsize; \ + const u8 *src = walk.src.virt.addr; \ + u8 *dst = walk.dst.virt.addr; \ + u8 __maybe_unused buf[(bsize)]; \ + if (do_fpu) kernel_fpu_begin() + +#define CBC_WALK_START(req, bsize, fpu_blocks) \ + ECB_WALK_START(req, bsize, fpu_blocks) + +#define ECB_WALK_ADVANCE(blocks) do { \ + dst += (blocks) * __bsize; \ + src += (blocks) * __bsize; \ + nbytes -= (blocks) * __bsize; \ +} while (0) + +#define ECB_BLOCK(blocks, func) do { \ + while (nbytes >= (blocks) * __bsize) { \ + (func)(ctx, dst, src); \ + ECB_WALK_ADVANCE(blocks); \ + } \ +} while (0) + +#define CBC_ENC_BLOCK(func) do { \ + const u8 *__iv = walk.iv; \ + while (nbytes >= __bsize) { \ + crypto_xor_cpy(dst, src, __iv, __bsize); \ + (func)(ctx, dst, dst); \ + __iv = dst; \ + ECB_WALK_ADVANCE(1); \ + } \ + memcpy(walk.iv, __iv, __bsize); \ +} while (0) + +#define CBC_DEC_BLOCK(blocks, func) do { \ + while (nbytes >= (blocks) * __bsize) { \ + const u8 *__iv = src + ((blocks) - 1) * __bsize; \ + if (dst == src) \ + __iv = memcpy(buf, __iv, __bsize); \ + (func)(ctx, dst, src); \ + crypto_xor(dst, walk.iv, __bsize); \ + memcpy(walk.iv, __iv, __bsize); \ + ECB_WALK_ADVANCE(blocks); \ + } \ +} while (0) + +#define ECB_WALK_END() \ + if (do_fpu) kernel_fpu_end(); \ + err = skcipher_walk_done(&walk, nbytes); \ + } \ + return err; \ +} while (0) + +#define CBC_WALK_END() ECB_WALK_END() + +#endif diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index bb9735fbb865..2bf871899920 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -14,7 +14,6 @@ */ #include <linux/linkage.h> -#include <asm/inst.h> #include <asm/frame.h> .section .rodata.cst16.bswap_mask, "aM", @progbits, 16 @@ -51,9 +50,9 @@ SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble) pxor DATA, T2 pxor SHASH, T3 - PCLMULQDQ 0x00 SHASH DATA # DATA = a0 * b0 - PCLMULQDQ 0x11 SHASH T1 # T1 = a1 * b1 - PCLMULQDQ 0x00 T3 T2 # T2 = (a1 + a0) * (b1 + b0) + pclmulqdq $0x00, SHASH, DATA # DATA = a0 * b0 + pclmulqdq $0x11, SHASH, T1 # T1 = a1 * b1 + pclmulqdq $0x00, T3, T2 # T2 = (a1 + a0) * (b1 + b0) pxor DATA, T2 pxor T1, T2 # T2 = a0 * b1 + a1 * b0 @@ -86,7 +85,7 @@ SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble) psrlq $1, T2 pxor T2, T1 pxor T1, DATA - ret + RET SYM_FUNC_END(__clmul_gf128mul_ble) /* void clmul_ghash_mul(char *dst, const u128 *shash) */ @@ -95,12 +94,12 @@ SYM_FUNC_START(clmul_ghash_mul) movups (%rdi), DATA movups (%rsi), SHASH movaps .Lbswap_mask, BSWAP - PSHUFB_XMM BSWAP DATA + pshufb BSWAP, DATA call __clmul_gf128mul_ble - PSHUFB_XMM BSWAP DATA + pshufb BSWAP, DATA movups DATA, (%rdi) FRAME_END - ret + RET SYM_FUNC_END(clmul_ghash_mul) /* @@ -114,20 +113,20 @@ SYM_FUNC_START(clmul_ghash_update) movaps .Lbswap_mask, BSWAP movups (%rdi), DATA movups (%rcx), SHASH - PSHUFB_XMM BSWAP DATA + pshufb BSWAP, DATA .align 4 .Lupdate_loop: movups (%rsi), IN1 - PSHUFB_XMM BSWAP IN1 + pshufb BSWAP, IN1 pxor IN1, DATA call __clmul_gf128mul_ble sub $16, %rdx add $16, %rsi cmp $16, %rdx jge .Lupdate_loop - PSHUFB_XMM BSWAP DATA + pshufb BSWAP, DATA movups DATA, (%rdi) .Lupdate_just_ret: FRAME_END - ret + RET 
SYM_FUNC_END(clmul_ghash_update) diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index a4b728518e28..1f1a95f3dd0c 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -313,7 +313,7 @@ static struct ahash_alg ghash_async_alg = { }; static const struct x86_cpu_id pcmul_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), /* Pickle-Mickle-Duck */ + X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), /* Pickle-Mickle-Duck */ {} }; MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id); diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S index d08fc575ef7f..3da385271227 100644 --- a/arch/x86/crypto/glue_helper-asm-avx.S +++ b/arch/x86/crypto/glue_helper-asm-avx.S @@ -34,107 +34,3 @@ vpxor (5*16)(src), x6, x6; \ vpxor (6*16)(src), x7, x7; \ store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); - -#define inc_le128(x, minus_one, tmp) \ - vpcmpeqq minus_one, x, tmp; \ - vpsubq minus_one, x, x; \ - vpslldq $8, tmp, tmp; \ - vpsubq tmp, x, x; - -#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \ - vpcmpeqd t0, t0, t0; \ - vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \ - vmovdqa bswap, t1; \ - \ - /* load IV and byteswap */ \ - vmovdqu (iv), x7; \ - vpshufb t1, x7, x0; \ - \ - /* construct IVs */ \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x1; \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x2; \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x3; \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x4; \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x5; \ - inc_le128(x7, t0, t2); \ - vpshufb t1, x7, x6; \ - inc_le128(x7, t0, t2); \ - vmovdqa x7, t2; \ - vpshufb t1, x7, x7; \ - inc_le128(t2, t0, t1); \ - vmovdqu t2, (iv); - -#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ - vpxor (0*16)(src), x0, x0; \ - vpxor (1*16)(src), x1, x1; \ - vpxor (2*16)(src), x2, x2; \ - vpxor (3*16)(src), x3, x3; \ - vpxor (4*16)(src), x4, x4; \ - vpxor (5*16)(src), x5, x5; \ - vpxor (6*16)(src), x6, x6; \ - vpxor (7*16)(src), x7, x7; \ - store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); - -#define gf128mul_x_ble(iv, mask, tmp) \ - vpsrad $31, iv, tmp; \ - vpaddq iv, iv, iv; \ - vpshufd $0x13, tmp, tmp; \ - vpand mask, tmp, tmp; \ - vpxor tmp, iv, iv; - -#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \ - t1, xts_gf128mul_and_shl1_mask) \ - vmovdqa xts_gf128mul_and_shl1_mask, t0; \ - \ - /* load IV */ \ - vmovdqu (iv), tiv; \ - vpxor (0*16)(src), tiv, x0; \ - vmovdqu tiv, (0*16)(dst); \ - \ - /* construct and store IVs, also xor with source */ \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (1*16)(src), tiv, x1; \ - vmovdqu tiv, (1*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (2*16)(src), tiv, x2; \ - vmovdqu tiv, (2*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (3*16)(src), tiv, x3; \ - vmovdqu tiv, (3*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (4*16)(src), tiv, x4; \ - vmovdqu tiv, (4*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (5*16)(src), tiv, x5; \ - vmovdqu tiv, (5*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (6*16)(src), tiv, x6; \ - vmovdqu tiv, (6*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vpxor (7*16)(src), tiv, x7; \ - vmovdqu tiv, (7*16)(dst); \ - \ - gf128mul_x_ble(tiv, t0, t1); \ - vmovdqu tiv, (iv); - -#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ - vpxor (0*16)(dst), x0, x0; \ - vpxor (1*16)(dst), x1, x1; \ - vpxor (2*16)(dst), x2, x2; 
\ - vpxor (3*16)(dst), x3, x3; \ - vpxor (4*16)(dst), x4, x4; \ - vpxor (5*16)(dst), x5, x5; \ - vpxor (6*16)(dst), x6, x6; \ - vpxor (7*16)(dst), x7, x7; \ - store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/arch/x86/crypto/glue_helper-asm-avx2.S b/arch/x86/crypto/glue_helper-asm-avx2.S index d84508c85c13..c77e9049431f 100644 --- a/arch/x86/crypto/glue_helper-asm-avx2.S +++ b/arch/x86/crypto/glue_helper-asm-avx2.S @@ -37,139 +37,3 @@ vpxor (5*32+16)(src), x6, x6; \ vpxor (6*32+16)(src), x7, x7; \ store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); - -#define inc_le128(x, minus_one, tmp) \ - vpcmpeqq minus_one, x, tmp; \ - vpsubq minus_one, x, x; \ - vpslldq $8, tmp, tmp; \ - vpsubq tmp, x, x; - -#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ - vpcmpeqq minus_one, x, tmp1; \ - vpcmpeqq minus_two, x, tmp2; \ - vpsubq minus_two, x, x; \ - vpor tmp2, tmp1, tmp1; \ - vpslldq $8, tmp1, tmp1; \ - vpsubq tmp1, x, x; - -#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \ - t1x, t2, t2x, t3, t3x, t4, t5) \ - vpcmpeqd t0, t0, t0; \ - vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \ - vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\ - \ - /* load IV and byteswap */ \ - vmovdqu (iv), t2x; \ - vmovdqa t2x, t3x; \ - inc_le128(t2x, t0x, t1x); \ - vbroadcasti128 bswap, t1; \ - vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \ - vpshufb t1, t2, x0; \ - \ - /* construct IVs */ \ - add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \ - vpshufb t1, t2, x1; \ - add2_le128(t2, t0, t4, t3, t5); \ - vpshufb t1, t2, x2; \ - add2_le128(t2, t0, t4, t3, t5); \ - vpshufb t1, t2, x3; \ - add2_le128(t2, t0, t4, t3, t5); \ - vpshufb t1, t2, x4; \ - add2_le128(t2, t0, t4, t3, t5); \ - vpshufb t1, t2, x5; \ - add2_le128(t2, t0, t4, t3, t5); \ - vpshufb t1, t2, x6; \ - add2_le128(t2, t0, t4, t3, t5); \ - vpshufb t1, t2, x7; \ - vextracti128 $1, t2, t2x; \ - inc_le128(t2x, t0x, t3x); \ - vmovdqu t2x, (iv); - -#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ - vpxor (0*32)(src), x0, x0; \ - vpxor (1*32)(src), x1, x1; \ - vpxor (2*32)(src), x2, x2; \ - vpxor (3*32)(src), x3, x3; \ - vpxor (4*32)(src), x4, x4; \ - vpxor (5*32)(src), x5, x5; \ - vpxor (6*32)(src), x6, x6; \ - vpxor (7*32)(src), x7, x7; \ - store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); - -#define gf128mul_x_ble(iv, mask, tmp) \ - vpsrad $31, iv, tmp; \ - vpaddq iv, iv, iv; \ - vpshufd $0x13, tmp, tmp; \ - vpand mask, tmp, tmp; \ - vpxor tmp, iv, iv; - -#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \ - vpsrad $31, iv, tmp0; \ - vpaddq iv, iv, tmp1; \ - vpsllq $2, iv, iv; \ - vpshufd $0x13, tmp0, tmp0; \ - vpsrad $31, tmp1, tmp1; \ - vpand mask2, tmp0, tmp0; \ - vpshufd $0x13, tmp1, tmp1; \ - vpxor tmp0, iv, iv; \ - vpand mask1, tmp1, tmp1; \ - vpxor tmp1, iv, iv; - -#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \ - tivx, t0, t0x, t1, t1x, t2, t2x, t3, \ - xts_gf128mul_and_shl1_mask_0, \ - xts_gf128mul_and_shl1_mask_1) \ - vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \ - \ - /* load IV and construct second IV */ \ - vmovdqu (iv), tivx; \ - vmovdqa tivx, t0x; \ - gf128mul_x_ble(tivx, t1x, t2x); \ - vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \ - vinserti128 $1, tivx, t0, tiv; \ - vpxor (0*32)(src), tiv, x0; \ - vmovdqu tiv, (0*32)(dst); \ - \ - /* construct and store IVs, also xor with source */ \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (1*32)(src), tiv, x1; \ - vmovdqu tiv, (1*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ 
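/*
 * Plain-C model (sketch) of the gf128mul_x_ble step used by the XTS macros
 * being removed here: multiply the 128-bit tweak by x in GF(2^128), using
 * the little-endian block convention and reducing modulo
 * x^128 + x^7 + x^2 + x + 1.  For brevity this assumes a little-endian host.
 */
#include <stdint.h>
#include <string.h>

static void xts_next_tweak(uint8_t t[16])
{
	uint64_t lo, hi, carry;

	memcpy(&lo, t, 8);
	memcpy(&hi, t + 8, 8);
	carry = hi >> 63;		/* bit 127 about to shift out */
	hi = (hi << 1) | (lo >> 63);
	lo = (lo << 1) ^ (carry * 0x87);
	memcpy(t, &lo, 8);
	memcpy(t + 8, &hi, 8);
}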
- vpxor (2*32)(src), tiv, x2; \ - vmovdqu tiv, (2*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (3*32)(src), tiv, x3; \ - vmovdqu tiv, (3*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (4*32)(src), tiv, x4; \ - vmovdqu tiv, (4*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (5*32)(src), tiv, x5; \ - vmovdqu tiv, (5*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (6*32)(src), tiv, x6; \ - vmovdqu tiv, (6*32)(dst); \ - \ - gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ - vpxor (7*32)(src), tiv, x7; \ - vmovdqu tiv, (7*32)(dst); \ - \ - vextracti128 $1, tiv, tivx; \ - gf128mul_x_ble(tivx, t1x, t2x); \ - vmovdqu tivx, (iv); - -#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ - vpxor (0*32)(dst), x0, x0; \ - vpxor (1*32)(dst), x1, x1; \ - vpxor (2*32)(dst), x2, x2; \ - vpxor (3*32)(dst), x3, x3; \ - vpxor (4*32)(dst), x4, x4; \ - vpxor (5*32)(dst), x5, x5; \ - vpxor (6*32)(dst), x6, x6; \ - vpxor (7*32)(dst), x7, x7; \ - store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c deleted file mode 100644 index d3d91a0abf88..000000000000 --- a/arch/x86/crypto/glue_helper.c +++ /dev/null @@ -1,381 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Shared glue code for 128bit block ciphers - * - * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> - * - * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: - * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> - */ - -#include <linux/module.h> -#include <crypto/b128ops.h> -#include <crypto/gf128mul.h> -#include <crypto/internal/skcipher.h> -#include <crypto/scatterwalk.h> -#include <crypto/xts.h> -#include <asm/crypto/glue_helper.h> - -int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req) -{ - void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); - const unsigned int bsize = 128 / 8; - struct skcipher_walk walk; - bool fpu_enabled = false; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - const u8 *src = walk.src.virt.addr; - u8 *dst = walk.dst.virt.addr; - unsigned int func_bytes; - unsigned int i; - - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - &walk, fpu_enabled, nbytes); - for (i = 0; i < gctx->num_funcs; i++) { - func_bytes = bsize * gctx->funcs[i].num_blocks; - - if (nbytes < func_bytes) - continue; - - /* Process multi-block batch */ - do { - gctx->funcs[i].fn_u.ecb(ctx, dst, src); - src += func_bytes; - dst += func_bytes; - nbytes -= func_bytes; - } while (nbytes >= func_bytes); - - if (nbytes < bsize) - break; - } - err = skcipher_walk_done(&walk, nbytes); - } - - glue_fpu_end(fpu_enabled); - return err; -} -EXPORT_SYMBOL_GPL(glue_ecb_req_128bit); - -int glue_cbc_encrypt_req_128bit(const common_glue_func_t fn, - struct skcipher_request *req) -{ - void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); - const unsigned int bsize = 128 / 8; - struct skcipher_walk walk; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - const u128 *src = (u128 *)walk.src.virt.addr; - u128 *dst = (u128 *)walk.dst.virt.addr; - u128 *iv = (u128 *)walk.iv; - - do { - u128_xor(dst, src, iv); - fn(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - src++; - dst++; - nbytes -= 
bsize; - } while (nbytes >= bsize); - - *(u128 *)walk.iv = *iv; - err = skcipher_walk_done(&walk, nbytes); - } - return err; -} -EXPORT_SYMBOL_GPL(glue_cbc_encrypt_req_128bit); - -int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req) -{ - void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); - const unsigned int bsize = 128 / 8; - struct skcipher_walk walk; - bool fpu_enabled = false; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes)) { - const u128 *src = walk.src.virt.addr; - u128 *dst = walk.dst.virt.addr; - unsigned int func_bytes, num_blocks; - unsigned int i; - u128 last_iv; - - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - &walk, fpu_enabled, nbytes); - /* Start of the last block. */ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - for (i = 0; i < gctx->num_funcs; i++) { - num_blocks = gctx->funcs[i].num_blocks; - func_bytes = bsize * num_blocks; - - if (nbytes < func_bytes) - continue; - - /* Process multi-block batch */ - do { - src -= num_blocks - 1; - dst -= num_blocks - 1; - - gctx->funcs[i].fn_u.cbc(ctx, (u8 *)dst, - (const u8 *)src); - - nbytes -= func_bytes; - if (nbytes < bsize) - goto done; - - u128_xor(dst, dst, --src); - dst--; - } while (nbytes >= func_bytes); - } -done: - u128_xor(dst, dst, (u128 *)walk.iv); - *(u128 *)walk.iv = last_iv; - err = skcipher_walk_done(&walk, nbytes); - } - - glue_fpu_end(fpu_enabled); - return err; -} -EXPORT_SYMBOL_GPL(glue_cbc_decrypt_req_128bit); - -int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req) -{ - void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); - const unsigned int bsize = 128 / 8; - struct skcipher_walk walk; - bool fpu_enabled = false; - unsigned int nbytes; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes) >= bsize) { - const u128 *src = walk.src.virt.addr; - u128 *dst = walk.dst.virt.addr; - unsigned int func_bytes, num_blocks; - unsigned int i; - le128 ctrblk; - - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - &walk, fpu_enabled, nbytes); - - be128_to_le128(&ctrblk, (be128 *)walk.iv); - - for (i = 0; i < gctx->num_funcs; i++) { - num_blocks = gctx->funcs[i].num_blocks; - func_bytes = bsize * num_blocks; - - if (nbytes < func_bytes) - continue; - - /* Process multi-block batch */ - do { - gctx->funcs[i].fn_u.ctr(ctx, (u8 *)dst, - (const u8 *)src, - &ctrblk); - src += num_blocks; - dst += num_blocks; - nbytes -= func_bytes; - } while (nbytes >= func_bytes); - - if (nbytes < bsize) - break; - } - - le128_to_be128((be128 *)walk.iv, &ctrblk); - err = skcipher_walk_done(&walk, nbytes); - } - - glue_fpu_end(fpu_enabled); - - if (nbytes) { - le128 ctrblk; - u128 tmp; - - be128_to_le128(&ctrblk, (be128 *)walk.iv); - memcpy(&tmp, walk.src.virt.addr, nbytes); - gctx->funcs[gctx->num_funcs - 1].fn_u.ctr(ctx, (u8 *)&tmp, - (const u8 *)&tmp, - &ctrblk); - memcpy(walk.dst.virt.addr, &tmp, nbytes); - le128_to_be128((be128 *)walk.iv, &ctrblk); - - err = skcipher_walk_done(&walk, 0); - } - - return err; -} -EXPORT_SYMBOL_GPL(glue_ctr_req_128bit); - -static unsigned int __glue_xts_req_128bit(const struct common_glue_ctx *gctx, - void *ctx, - struct skcipher_walk *walk) -{ - const unsigned int bsize = 128 / 8; - unsigned int nbytes = walk->nbytes; - u128 *src = walk->src.virt.addr; - u128 *dst = walk->dst.virt.addr; - unsigned int num_blocks, func_bytes; - unsigned 
int i; - - /* Process multi-block batch */ - for (i = 0; i < gctx->num_funcs; i++) { - num_blocks = gctx->funcs[i].num_blocks; - func_bytes = bsize * num_blocks; - - if (nbytes >= func_bytes) { - do { - gctx->funcs[i].fn_u.xts(ctx, (u8 *)dst, - (const u8 *)src, - walk->iv); - - src += num_blocks; - dst += num_blocks; - nbytes -= func_bytes; - } while (nbytes >= func_bytes); - - if (nbytes < bsize) - goto done; - } - } - -done: - return nbytes; -} - -int glue_xts_req_128bit(const struct common_glue_ctx *gctx, - struct skcipher_request *req, - common_glue_func_t tweak_fn, void *tweak_ctx, - void *crypt_ctx, bool decrypt) -{ - const bool cts = (req->cryptlen % XTS_BLOCK_SIZE); - const unsigned int bsize = 128 / 8; - struct skcipher_request subreq; - struct skcipher_walk walk; - bool fpu_enabled = false; - unsigned int nbytes, tail; - int err; - - if (req->cryptlen < XTS_BLOCK_SIZE) - return -EINVAL; - - if (unlikely(cts)) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - - tail = req->cryptlen % XTS_BLOCK_SIZE + XTS_BLOCK_SIZE; - - skcipher_request_set_tfm(&subreq, tfm); - skcipher_request_set_callback(&subreq, - crypto_skcipher_get_flags(tfm), - NULL, NULL); - skcipher_request_set_crypt(&subreq, req->src, req->dst, - req->cryptlen - tail, req->iv); - req = &subreq; - } - - err = skcipher_walk_virt(&walk, req, false); - nbytes = walk.nbytes; - if (err) - return err; - - /* set minimum length to bsize, for tweak_fn */ - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - &walk, fpu_enabled, - nbytes < bsize ? bsize : nbytes); - - /* calculate first value of T */ - tweak_fn(tweak_ctx, walk.iv, walk.iv); - - while (nbytes) { - nbytes = __glue_xts_req_128bit(gctx, crypt_ctx, &walk); - - err = skcipher_walk_done(&walk, nbytes); - nbytes = walk.nbytes; - } - - if (unlikely(cts)) { - u8 *next_tweak, *final_tweak = req->iv; - struct scatterlist *src, *dst; - struct scatterlist s[2], d[2]; - le128 b[2]; - - dst = src = scatterwalk_ffwd(s, req->src, req->cryptlen); - if (req->dst != req->src) - dst = scatterwalk_ffwd(d, req->dst, req->cryptlen); - - if (decrypt) { - next_tweak = memcpy(b, req->iv, XTS_BLOCK_SIZE); - gf128mul_x_ble(b, b); - } else { - next_tweak = req->iv; - } - - skcipher_request_set_crypt(&subreq, src, dst, XTS_BLOCK_SIZE, - next_tweak); - - err = skcipher_walk_virt(&walk, req, false) ?: - skcipher_walk_done(&walk, - __glue_xts_req_128bit(gctx, crypt_ctx, &walk)); - if (err) - goto out; - - scatterwalk_map_and_copy(b, dst, 0, XTS_BLOCK_SIZE, 0); - memcpy(b + 1, b, tail - XTS_BLOCK_SIZE); - scatterwalk_map_and_copy(b, src, XTS_BLOCK_SIZE, - tail - XTS_BLOCK_SIZE, 0); - scatterwalk_map_and_copy(b, dst, 0, tail, 1); - - skcipher_request_set_crypt(&subreq, dst, dst, XTS_BLOCK_SIZE, - final_tweak); - - err = skcipher_walk_virt(&walk, req, false) ?: - skcipher_walk_done(&walk, - __glue_xts_req_128bit(gctx, crypt_ctx, &walk)); - } - -out: - glue_fpu_end(fpu_enabled); - - return err; -} -EXPORT_SYMBOL_GPL(glue_xts_req_128bit); - -void glue_xts_crypt_128bit_one(const void *ctx, u8 *dst, const u8 *src, - le128 *iv, common_glue_func_t fn) -{ - le128 ivblk = *iv; - - /* generate next IV */ - gf128mul_x_ble(iv, &ivblk); - - /* CC <- T xor C */ - u128_xor((u128 *)dst, (const u128 *)src, (u128 *)&ivblk); - - /* PP <- D(Key2,CC) */ - fn(ctx, dst, dst); - - /* P <- T xor PP */ - u128_xor((u128 *)dst, (u128 *)dst, (u128 *)&ivblk); -} -EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one); - -MODULE_LICENSE("GPL"); diff --git a/arch/x86/crypto/nh-avx2-x86_64.S 
b/arch/x86/crypto/nh-avx2-x86_64.S index b22c7b936272..6a0b15e7196a 100644 --- a/arch/x86/crypto/nh-avx2-x86_64.S +++ b/arch/x86/crypto/nh-avx2-x86_64.S @@ -153,5 +153,5 @@ SYM_FUNC_START(nh_avx2) vpaddq T1, T0, T0 vpaddq T4, T0, T0 vmovdqu T0, (HASH) - ret + RET SYM_FUNC_END(nh_avx2) diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S index d7ae22dd6683..34c567bbcb4f 100644 --- a/arch/x86/crypto/nh-sse2-x86_64.S +++ b/arch/x86/crypto/nh-sse2-x86_64.S @@ -119,5 +119,5 @@ SYM_FUNC_START(nh_sse2) paddq PASS2_SUMS, T1 movdqu T0, 0x00(HASH) movdqu T1, 0x10(HASH) - ret + RET SYM_FUNC_END(nh_sse2) diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c index f7567cbd35b6..8ea5ab0f1ca7 100644 --- a/arch/x86/crypto/nhpoly1305-avx2-glue.c +++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c @@ -10,6 +10,7 @@ #include <crypto/internal/simd.h> #include <crypto/nhpoly1305.h> #include <linux/module.h> +#include <linux/sizes.h> #include <asm/simd.h> asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len, @@ -29,7 +30,7 @@ static int nhpoly1305_avx2_update(struct shash_desc *desc, return crypto_nhpoly1305_update(desc, src, srclen); do { - unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE); + unsigned int n = min_t(unsigned int, srclen, SZ_4K); kernel_fpu_begin(); crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2); diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c index a661ede3b5cf..2b353d42ed13 100644 --- a/arch/x86/crypto/nhpoly1305-sse2-glue.c +++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c @@ -10,6 +10,7 @@ #include <crypto/internal/simd.h> #include <crypto/nhpoly1305.h> #include <linux/module.h> +#include <linux/sizes.h> #include <asm/simd.h> asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len, @@ -29,7 +30,7 @@ static int nhpoly1305_sse2_update(struct shash_desc *desc, return crypto_nhpoly1305_update(desc, src, srclen); do { - unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE); + unsigned int n = min_t(unsigned int, srclen, SZ_4K); kernel_fpu_begin(); crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2); diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl index 7a6b5380a46f..2077ce7a5647 100644 --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl @@ -246,12 +246,12 @@ $code.=<<___ if (!$kernel); ___ &declare_function("poly1305_init_x86_64", 32, 3); $code.=<<___; - xor %rax,%rax + xor %eax,%eax mov %rax,0($ctx) # initialize hash value mov %rax,8($ctx) mov %rax,16($ctx) - cmp \$0,$inp + test $inp,$inp je .Lno_key ___ $code.=<<___ if (!$kernel); @@ -297,7 +297,7 @@ ___ $code.=<<___; mov \$1,%eax .Lno_key: - ret + RET ___ &end_function("poly1305_init_x86_64"); @@ -373,7 +373,7 @@ $code.=<<___; .cfi_adjust_cfa_offset -48 .Lno_data: .Lblocks_epilogue: - ret + RET .cfi_endproc ___ &end_function("poly1305_blocks_x86_64"); @@ -399,15 +399,11 @@ $code.=<<___; mov %rax,0($mac) # write result mov %rcx,8($mac) - ret + RET ___ &end_function("poly1305_emit_x86_64"); if ($avx) { -if($kernel) { - $code .= "#ifdef CONFIG_AS_AVX\n"; -} - ######################################################################## # Layout of opaque area is following. 
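/*
 * The PAGE_SIZE -> SZ_4K change in the NHPoly1305 glue above (and in the
 * Poly1305 glue below) makes the bound on each kernel_fpu_begin()/
 * kernel_fpu_end() section an explicit 4 KiB of input rather than tying it
 * to PAGE_SIZE, since SIMD use disables preemption.  The shape of that loop,
 * schematically -- process_simd() stands in for the real assembly helper:
 */
static void update_in_4k_chunks(struct shash_desc *desc, const u8 *src,
				unsigned int srclen)
{
	do {
		unsigned int n = min_t(unsigned int, srclen, SZ_4K);

		kernel_fpu_begin();
		process_simd(desc, src, n);	/* placeholder for nh_avx2() etc. */
		kernel_fpu_end();

		src += n;
		srclen -= n;
	} while (srclen);
}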
# @@ -433,7 +429,7 @@ ___ &poly1305_iteration(); $code.=<<___; pop $ctx - ret + RET .size __poly1305_block,.-__poly1305_block .type __poly1305_init_avx,\@abi-omnipotent @@ -598,7 +594,7 @@ __poly1305_init_avx: lea -48-64($ctx),$ctx # size [de-]optimization pop %rbp - ret + RET .size __poly1305_init_avx,.-__poly1305_init_avx ___ @@ -751,7 +747,7 @@ $code.=<<___; .cfi_restore %rbp .Lno_data_avx: .Lblocks_avx_epilogue: - ret + RET .cfi_endproc .align 32 @@ -1456,7 +1452,7 @@ $code.=<<___ if (!$win64); ___ $code.=<<___; vzeroupper - ret + RET .cfi_endproc ___ &end_function("poly1305_blocks_avx"); @@ -1512,20 +1508,12 @@ $code.=<<___; mov %rax,0($mac) # write result mov %rcx,8($mac) - ret + RET ___ &end_function("poly1305_emit_avx"); -if ($kernel) { - $code .= "#endif\n"; -} - if ($avx>1) { -if ($kernel) { - $code .= "#ifdef CONFIG_AS_AVX2\n"; -} - my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = map("%ymm$_",(0..15)); my $S4=$MASK; @@ -1687,7 +1675,7 @@ $code.=<<___; .cfi_restore %rbp .Lno_data_avx2$suffix: .Lblocks_avx2_epilogue$suffix: - ret + RET .cfi_endproc .align 32 @@ -2213,7 +2201,7 @@ $code.=<<___ if (!$win64); ___ $code.=<<___; vzeroupper - ret + RET .cfi_endproc ___ if($avx > 2 && $avx512) { @@ -2804,7 +2792,7 @@ $code.=<<___ if (!$win64); .cfi_def_cfa_register %rsp ___ $code.=<<___; - ret + RET .cfi_endproc ___ @@ -2816,10 +2804,6 @@ ___ poly1305_blocks_avxN(0); &end_function("poly1305_blocks_avx2"); -if($kernel) { - $code .= "#endif\n"; -} - ####################################################################### if ($avx>2) { # On entry we have input length divisible by 64. But since inner loop @@ -2869,7 +2853,7 @@ $code.=<<___; .type poly1305_init_base2_44,\@function,3 .align 32 poly1305_init_base2_44: - xor %rax,%rax + xor %eax,%eax mov %rax,0($ctx) # initialize hash value mov %rax,8($ctx) mov %rax,16($ctx) @@ -2909,7 +2893,7 @@ $code.=<<___ if ($flavour =~ /elf32/); ___ $code.=<<___; mov \$1,%eax - ret + RET .size poly1305_init_base2_44,.-poly1305_init_base2_44 ___ { @@ -3026,7 +3010,7 @@ poly1305_blocks_vpmadd52: jnz .Lblocks_vpmadd52_4x .Lno_data_vpmadd52: - ret + RET .size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 ___ } @@ -3467,7 +3451,7 @@ poly1305_blocks_vpmadd52_4x: vzeroall .Lno_data_vpmadd52_4x: - ret + RET .size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x ___ } @@ -3840,7 +3824,7 @@ $code.=<<___; vzeroall .Lno_data_vpmadd52_8x: - ret + RET .size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x ___ } @@ -3877,7 +3861,7 @@ poly1305_emit_base2_44: mov %rax,0($mac) # write result mov %rcx,8($mac) - ret + RET .size poly1305_emit_base2_44,.-poly1305_emit_base2_44 ___ } } } @@ -3932,7 +3916,7 @@ xor128_encrypt_n_pad: .Ldone_enc: mov $otp,%rax - ret + RET .size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad .globl xor128_decrypt_n_pad @@ -3963,7 +3947,7 @@ xor128_decrypt_n_pad: mov \$16,$len sub %r10,$len xor %eax,%eax - xor %r11,%r11 + xor %r11d,%r11d .Loop_dec_byte: mov ($inp,$otp),%r11b mov ($otp),%al @@ -3983,7 +3967,7 @@ xor128_decrypt_n_pad: .Ldone_dec: mov $otp,%rax - ret + RET .size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad ___ } @@ -4101,7 +4085,7 @@ avx_handler: .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi - xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry @@ -4125,7 +4109,7 @@ avx_handler: pop %rbx pop %rdi pop %rsi - ret + RET .size 
avx_handler,.-avx_handler .section .pdata diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 79bb58737d52..1dfb8af48a3c 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -11,11 +11,12 @@ #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/sizes.h> #include <asm/intel-family.h> #include <asm/simd.h> asmlinkage void poly1305_init_x86_64(void *ctx, - const u8 key[POLY1305_KEY_SIZE]); + const u8 key[POLY1305_BLOCK_SIZE]); asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, const size_t len, const u32 padbit); asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], @@ -80,7 +81,7 @@ static void convert_to_base2_64(void *ctx) state->is_base2_26 = 0; } -static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE]) +static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE]) { poly1305_init_x86_64(ctx, key); } @@ -91,10 +92,10 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, struct poly1305_arch_internal *state = ctx; /* SIMD disables preemption, so relax after processing each page. */ - BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE || - PAGE_SIZE % POLY1305_BLOCK_SIZE); + BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || + SZ_4K % POLY1305_BLOCK_SIZE); - if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) || + if (!static_branch_likely(&poly1305_use_avx) || (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || !crypto_simd_usable()) { convert_to_base2_64(ctx); @@ -102,34 +103,33 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, return; } - for (;;) { - const size_t bytes = min_t(size_t, len, PAGE_SIZE); + do { + const size_t bytes = min_t(size_t, len, SZ_4K); kernel_fpu_begin(); if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) poly1305_blocks_avx512(ctx, inp, bytes, padbit); - else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2)) + else if (static_branch_likely(&poly1305_use_avx2)) poly1305_blocks_avx2(ctx, inp, bytes, padbit); else poly1305_blocks_avx(ctx, inp, bytes, padbit); kernel_fpu_end(); + len -= bytes; - if (!len) - break; inp += bytes; - } + } while (len); } static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]) { - if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx)) + if (!static_branch_likely(&poly1305_use_avx)) poly1305_emit_x86_64(ctx, mac, nonce); else poly1305_emit_avx(ctx, mac, nonce); } -void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) { poly1305_simd_init(&dctx->h, key); dctx->s[0] = get_unaligned_le32(&key[16]); @@ -158,8 +158,6 @@ static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, dctx->s[1] = get_unaligned_le32(&inp[4]); dctx->s[2] = get_unaligned_le32(&inp[8]); dctx->s[3] = get_unaligned_le32(&inp[12]); - inp += POLY1305_BLOCK_SIZE; - len -= POLY1305_BLOCK_SIZE; acc += POLY1305_BLOCK_SIZE; dctx->sset = true; } @@ -212,7 +210,7 @@ void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) } poly1305_simd_emit(&dctx->h, dst, dctx->s); - *dctx = (struct poly1305_desc_ctx){}; + memzero_explicit(dctx, sizeof(*dctx)); } EXPORT_SYMBOL(poly1305_final_arch); @@ -261,11 +259,10 @@ static struct shash_alg alg = { static int __init poly1305_simd_mod_init(void) { - if 
(IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) && + if (boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) static_branch_enable(&poly1305_use_avx); - if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && + if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) static_branch_enable(&poly1305_use_avx2); if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) && diff --git a/arch/x86/crypto/polyval-clmulni_asm.S b/arch/x86/crypto/polyval-clmulni_asm.S new file mode 100644 index 000000000000..a6ebe4e7dd2b --- /dev/null +++ b/arch/x86/crypto/polyval-clmulni_asm.S @@ -0,0 +1,321 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2021 Google LLC + */ +/* + * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI + * instructions. It works on 8 blocks at a time, by precomputing the first 8 + * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation + * allows us to split finite field multiplication into two steps. + * + * In the first step, we consider h^i, m_i as normal polynomials of degree less + * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication + * is simply polynomial multiplication. + * + * In the second step, we compute the reduction of p(x) modulo the finite field + * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1. + * + * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where + * multiplication is finite field multiplication. The advantage is that the + * two-step process only requires 1 finite field reduction for every 8 + * polynomial multiplications. Further parallelism is gained by interleaving the + * multiplications and polynomial reductions. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define STRIDE_BLOCKS 8 + +#define GSTAR %xmm7 +#define PL %xmm8 +#define PH %xmm9 +#define TMP_XMM %xmm11 +#define LO %xmm12 +#define HI %xmm13 +#define MI %xmm14 +#define SUM %xmm15 + +#define KEY_POWERS %rdi +#define MSG %rsi +#define BLOCKS_LEFT %rdx +#define ACCUMULATOR %rcx +#define TMP %rax + +.section .rodata.cst16.gstar, "aM", @progbits, 16 +.align 16 + +.Lgstar: + .quad 0xc200000000000000, 0xc200000000000000 + +.text + +/* + * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length + * count pointed to by MSG and KEY_POWERS. + */ +.macro schoolbook1 count + .set i, 0 + .rept (\count) + schoolbook1_iteration i 0 + .set i, (i +1) + .endr +.endm + +/* + * Computes the product of two 128-bit polynomials at the memory locations + * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of + * the 256-bit product into LO, MI, HI. + * + * Given: + * X = [X_1 : X_0] + * Y = [Y_1 : Y_0] + * + * We compute: + * LO += X_0 * Y_0 + * MI += X_0 * Y_1 + X_1 * Y_0 + * HI += X_1 * Y_1 + * + * Later, the 256-bit result can be extracted as: + * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0] + * This step is done when computing the polynomial reduction for efficiency + * reasons. + * + * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an + * extra multiplication of SUM and h^8. 
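/*
 * Scalar model (sketch) of the schoolbook step described in the comments
 * above: each 128-bit operand is split into 64-bit halves, four carry-less
 * products are accumulated into LO/MI/HI, and the 256-bit result is
 * assembled as [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0].  clmul64() is a
 * plain-C stand-in for one PCLMULQDQ lane.
 */
#include <stdint.h>

static void clmul64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
	uint64_t l = 0, h = 0;
	int i;

	for (i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			l ^= a << i;
			h ^= i ? a >> (64 - i) : 0;
		}
	}
	*lo = l;
	*hi = h;
}

/* p[0..3] = x * y as a 256-bit carry-less product, least significant word first */
static void schoolbook128(const uint64_t x[2], const uint64_t y[2], uint64_t p[4])
{
	uint64_t lo0, lo1, hi0, hi1, m0, m1, t0, t1;

	clmul64(x[0], y[0], &lo0, &lo1);	/* LO = X_0 * Y_0 */
	clmul64(x[1], y[1], &hi0, &hi1);	/* HI = X_1 * Y_1 */
	clmul64(x[0], y[1], &m0, &m1);		/* MI = X_0 * Y_1 ... */
	clmul64(x[1], y[0], &t0, &t1);		/* ... + X_1 * Y_0 */
	m0 ^= t0;
	m1 ^= t1;

	p[0] = lo0;
	p[1] = lo1 ^ m0;
	p[2] = hi0 ^ m1;
	p[3] = hi1;
}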
+ */ +.macro schoolbook1_iteration i xor_sum + movups (16*\i)(MSG), %xmm0 + .if (\i == 0 && \xor_sum == 1) + pxor SUM, %xmm0 + .endif + vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2 + vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1 + vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3 + vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4 + vpxor %xmm2, MI, MI + vpxor %xmm1, LO, LO + vpxor %xmm4, HI, HI + vpxor %xmm3, MI, MI +.endm + +/* + * Performs the same computation as schoolbook1_iteration, except we expect the + * arguments to already be loaded into xmm0 and xmm1 and we set the result + * registers LO, MI, and HI directly rather than XOR'ing into them. + */ +.macro schoolbook1_noload + vpclmulqdq $0x01, %xmm0, %xmm1, MI + vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2 + vpclmulqdq $0x00, %xmm0, %xmm1, LO + vpclmulqdq $0x11, %xmm0, %xmm1, HI + vpxor %xmm2, MI, MI +.endm + +/* + * Computes the 256-bit polynomial represented by LO, HI, MI. Stores + * the result in PL, PH. + * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0] + */ +.macro schoolbook2 + vpslldq $8, MI, PL + vpsrldq $8, MI, PH + pxor LO, PL + pxor HI, PH +.endm + +/* + * Computes the 128-bit reduction of PH : PL. Stores the result in dest. + * + * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) = + * x^128 + x^127 + x^126 + x^121 + 1. + * + * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the + * product of two 128-bit polynomials in Montgomery form. We need to reduce it + * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor + * of x^128, this product has two extra factors of x^128. To get it back into + * Montgomery form, we need to remove one of these factors by dividing by x^128. + * + * To accomplish both of these goals, we add multiples of g(x) that cancel out + * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low + * bits are zero, the polynomial division by x^128 can be done by right shifting. + * + * Since the only nonzero term in the low 64 bits of g(x) is the constant term, + * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can + * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 + + * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to + * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T + * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191. + * + * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits + * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1 + * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) * + * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 : + * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0). + * + * So our final computation is: + * T = T_1 : T_0 = g*(x) * P_0 + * V = V_1 : V_0 = g*(x) * (P_1 + T_0) + * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0 + * + * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0 + * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 : + * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V. 
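/*
 * The reduction formula spelled out above, transcribed to scalar C (using
 * clmul64() from the earlier schoolbook sketch).  p[] holds the 256-bit
 * product PH : PL = p[3] : p[2] : p[1] : p[0]; out[] receives
 * p(x) / x^128 mod g(x).
 */
static void montgomery_reduce(const uint64_t p[4], uint64_t out[2])
{
	const uint64_t gstar = 0xc200000000000000ULL;	/* bits 64..127 of g(x) */
	uint64_t t0, t1, v0, v1;

	clmul64(p[0], gstar, &t0, &t1);		/* T = g*(x) * P_0 */
	clmul64(p[1] ^ t0, gstar, &v0, &v1);	/* V = g*(x) * (P_1 + T_0) */

	out[0] = p[2] ^ p[0] ^ t1 ^ v0;		/* P_2 + P_0 + T_1 + V_0 */
	out[1] = p[3] ^ p[1] ^ t0 ^ v1;		/* P_3 + P_1 + T_0 + V_1 */
}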
+ */ +.macro montgomery_reduction dest + vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x) + pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1 + pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1 + pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1 + pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)] + vpxor TMP_XMM, PH, \dest +.endm + +/* + * Compute schoolbook multiplication for 8 blocks + * m_0h^8 + ... + m_7h^1 + * + * If reduce is set, also computes the montgomery reduction of the + * previous full_stride call and XORs with the first message block. + * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1. + * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0. + */ +.macro full_stride reduce + pxor LO, LO + pxor HI, HI + pxor MI, MI + + schoolbook1_iteration 7 0 + .if \reduce + vpclmulqdq $0x00, PL, GSTAR, TMP_XMM + .endif + + schoolbook1_iteration 6 0 + .if \reduce + pshufd $0b01001110, TMP_XMM, TMP_XMM + .endif + + schoolbook1_iteration 5 0 + .if \reduce + pxor PL, TMP_XMM + .endif + + schoolbook1_iteration 4 0 + .if \reduce + pxor TMP_XMM, PH + .endif + + schoolbook1_iteration 3 0 + .if \reduce + pclmulqdq $0x11, GSTAR, TMP_XMM + .endif + + schoolbook1_iteration 2 0 + .if \reduce + vpxor TMP_XMM, PH, SUM + .endif + + schoolbook1_iteration 1 0 + + schoolbook1_iteration 0 1 + + addq $(8*16), MSG + schoolbook2 +.endm + +/* + * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS + */ +.macro partial_stride + mov BLOCKS_LEFT, TMP + shlq $4, TMP + addq $(16*STRIDE_BLOCKS), KEY_POWERS + subq TMP, KEY_POWERS + + movups (MSG), %xmm0 + pxor SUM, %xmm0 + movaps (KEY_POWERS), %xmm1 + schoolbook1_noload + dec BLOCKS_LEFT + addq $16, MSG + addq $16, KEY_POWERS + + test $4, BLOCKS_LEFT + jz .Lpartial4BlocksDone + schoolbook1 4 + addq $(4*16), MSG + addq $(4*16), KEY_POWERS +.Lpartial4BlocksDone: + test $2, BLOCKS_LEFT + jz .Lpartial2BlocksDone + schoolbook1 2 + addq $(2*16), MSG + addq $(2*16), KEY_POWERS +.Lpartial2BlocksDone: + test $1, BLOCKS_LEFT + jz .LpartialDone + schoolbook1 1 +.LpartialDone: + schoolbook2 + montgomery_reduction SUM +.endm + +/* + * Perform montgomery multiplication in GF(2^128) and store result in op1. + * + * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 + * If op1, op2 are in montgomery form, this computes the montgomery + * form of op1*op2. + * + * void clmul_polyval_mul(u8 *op1, const u8 *op2); + */ +SYM_FUNC_START(clmul_polyval_mul) + FRAME_BEGIN + vmovdqa .Lgstar(%rip), GSTAR + movups (%rdi), %xmm0 + movups (%rsi), %xmm1 + schoolbook1_noload + schoolbook2 + montgomery_reduction SUM + movups SUM, (%rdi) + FRAME_END + RET +SYM_FUNC_END(clmul_polyval_mul) + +/* + * Perform polynomial evaluation as specified by POLYVAL. This computes: + * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} + * where n=nblocks, h is the hash key, and m_i are the message blocks. + * + * rdi - pointer to precomputed key powers h^8 ... 
h^1 + * rsi - pointer to message blocks + * rdx - number of blocks to hash + * rcx - pointer to the accumulator + * + * void clmul_polyval_update(const struct polyval_tfm_ctx *keys, + * const u8 *in, size_t nblocks, u8 *accumulator); + */ +SYM_FUNC_START(clmul_polyval_update) + FRAME_BEGIN + vmovdqa .Lgstar(%rip), GSTAR + movups (ACCUMULATOR), SUM + subq $STRIDE_BLOCKS, BLOCKS_LEFT + js .LstrideLoopExit + full_stride 0 + subq $STRIDE_BLOCKS, BLOCKS_LEFT + js .LstrideLoopExitReduce +.LstrideLoop: + full_stride 1 + subq $STRIDE_BLOCKS, BLOCKS_LEFT + jns .LstrideLoop +.LstrideLoopExitReduce: + montgomery_reduction SUM +.LstrideLoopExit: + add $STRIDE_BLOCKS, BLOCKS_LEFT + jz .LskipPartial + partial_stride +.LskipPartial: + movups SUM, (ACCUMULATOR) + FRAME_END + RET +SYM_FUNC_END(clmul_polyval_update) diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c new file mode 100644 index 000000000000..8fa58b0f3cb3 --- /dev/null +++ b/arch/x86/crypto/polyval-clmulni_glue.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Glue code for POLYVAL using PCMULQDQ-NI + * + * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi> + * Copyright (c) 2009 Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * Copyright 2021 Google LLC + */ + +/* + * Glue code based on ghash-clmulni-intel_glue.c. + * + * This implementation of POLYVAL uses montgomery multiplication + * accelerated by PCLMULQDQ-NI to implement the finite field + * operations. + */ + +#include <crypto/algapi.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <crypto/polyval.h> +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <asm/cpu_device_id.h> +#include <asm/simd.h> + +#define POLYVAL_ALIGN 16 +#define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN) +#define POLYVAL_ALIGN_EXTRA ((POLYVAL_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1)) +#define POLYVAL_CTX_SIZE (sizeof(struct polyval_tfm_ctx) + POLYVAL_ALIGN_EXTRA) +#define NUM_KEY_POWERS 8 + +struct polyval_tfm_ctx { + /* + * These powers must be in the order h^8, ..., h^1. 
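+	 * That is, key_powers[0] holds h^8 and key_powers[NUM_KEY_POWERS - 1]
+	 * holds h^1, which is the order in which polyval_x86_setkey() below
+	 * fills the table.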
+ */ + u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE] POLYVAL_ALIGN_ATTR; +}; + +struct polyval_desc_ctx { + u8 buffer[POLYVAL_BLOCK_SIZE]; + u32 bytes; +}; + +asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys, + const u8 *in, size_t nblocks, u8 *accumulator); +asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2); + +static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm) +{ + return PTR_ALIGN(crypto_shash_ctx(tfm), POLYVAL_ALIGN); +} + +static void internal_polyval_update(const struct polyval_tfm_ctx *keys, + const u8 *in, size_t nblocks, u8 *accumulator) +{ + if (likely(crypto_simd_usable())) { + kernel_fpu_begin(); + clmul_polyval_update(keys, in, nblocks, accumulator); + kernel_fpu_end(); + } else { + polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in, + nblocks, accumulator); + } +} + +static void internal_polyval_mul(u8 *op1, const u8 *op2) +{ + if (likely(crypto_simd_usable())) { + kernel_fpu_begin(); + clmul_polyval_mul(op1, op2); + kernel_fpu_end(); + } else { + polyval_mul_non4k(op1, op2); + } +} + +static int polyval_x86_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(tfm); + int i; + + if (keylen != POLYVAL_BLOCK_SIZE) + return -EINVAL; + + memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE); + + for (i = NUM_KEY_POWERS-2; i >= 0; i--) { + memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE); + internal_polyval_mul(tctx->key_powers[i], + tctx->key_powers[i+1]); + } + + return 0; +} + +static int polyval_x86_init(struct shash_desc *desc) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + + memset(dctx, 0, sizeof(*dctx)); + + return 0; +} + +static int polyval_x86_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); + u8 *pos; + unsigned int nblocks; + unsigned int n; + + if (dctx->bytes) { + n = min(srclen, dctx->bytes); + pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes; + + dctx->bytes -= n; + srclen -= n; + + while (n--) + *pos++ ^= *src++; + + if (!dctx->bytes) + internal_polyval_mul(dctx->buffer, + tctx->key_powers[NUM_KEY_POWERS-1]); + } + + while (srclen >= POLYVAL_BLOCK_SIZE) { + /* Allow rescheduling every 4K bytes. 
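+		 * Each internal_polyval_update() call is one
+		 * kernel_fpu_begin()/kernel_fpu_end() section, so capping it
+		 * at 4096 bytes of input keeps the time spent with
+		 * preemption disabled bounded.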
*/ + nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; + internal_polyval_update(tctx, src, nblocks, dctx->buffer); + srclen -= nblocks * POLYVAL_BLOCK_SIZE; + src += nblocks * POLYVAL_BLOCK_SIZE; + } + + if (srclen) { + dctx->bytes = POLYVAL_BLOCK_SIZE - srclen; + pos = dctx->buffer; + while (srclen--) + *pos++ ^= *src++; + } + + return 0; +} + +static int polyval_x86_final(struct shash_desc *desc, u8 *dst) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); + + if (dctx->bytes) { + internal_polyval_mul(dctx->buffer, + tctx->key_powers[NUM_KEY_POWERS-1]); + } + + memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE); + + return 0; +} + +static struct shash_alg polyval_alg = { + .digestsize = POLYVAL_DIGEST_SIZE, + .init = polyval_x86_init, + .update = polyval_x86_update, + .final = polyval_x86_final, + .setkey = polyval_x86_setkey, + .descsize = sizeof(struct polyval_desc_ctx), + .base = { + .cra_name = "polyval", + .cra_driver_name = "polyval-clmulni", + .cra_priority = 200, + .cra_blocksize = POLYVAL_BLOCK_SIZE, + .cra_ctxsize = POLYVAL_CTX_SIZE, + .cra_module = THIS_MODULE, + }, +}; + +__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = { + X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id); + +static int __init polyval_clmulni_mod_init(void) +{ + if (!x86_match_cpu(pcmul_cpu_id)) + return -ENODEV; + + if (!boot_cpu_has(X86_FEATURE_AVX)) + return -ENODEV; + + return crypto_register_shash(&polyval_alg); +} + +static void __exit polyval_clmulni_mod_exit(void) +{ + crypto_unregister_shash(&polyval_alg); +} + +module_init(polyval_clmulni_mod_init); +module_exit(polyval_clmulni_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI"); +MODULE_ALIAS_CRYPTO("polyval"); +MODULE_ALIAS_CRYPTO("polyval-clmulni"); diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index ba9e4c1e7f5c..82f2313f512b 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -18,10 +18,6 @@ .align 16 .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16 -.align 16 -.Lxts_gf128mul_and_shl1_mask: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .text @@ -605,7 +601,7 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx) write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); - ret; + RET; SYM_FUNC_END(__serpent_enc_blk8_avx) .align 8 @@ -659,7 +655,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx) write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2); write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); - ret; + RET; SYM_FUNC_END(__serpent_dec_blk8_avx) SYM_FUNC_START(serpent_ecb_enc_8way_avx) @@ -677,7 +673,7 @@ SYM_FUNC_START(serpent_ecb_enc_8way_avx) store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); FRAME_END - ret; + RET; SYM_FUNC_END(serpent_ecb_enc_8way_avx) SYM_FUNC_START(serpent_ecb_dec_8way_avx) @@ -695,7 +691,7 @@ SYM_FUNC_START(serpent_ecb_dec_8way_avx) store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); FRAME_END - ret; + RET; SYM_FUNC_END(serpent_ecb_dec_8way_avx) SYM_FUNC_START(serpent_cbc_dec_8way_avx) @@ -713,69 +709,5 @@ SYM_FUNC_START(serpent_cbc_dec_8way_avx) store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); FRAME_END - ret; + RET; 
SYM_FUNC_END(serpent_cbc_dec_8way_avx) - -SYM_FUNC_START(serpent_ctr_8way_avx) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - - load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, - RD2, RK0, RK1, RK2); - - call __serpent_enc_blk8_avx; - - store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - FRAME_END - ret; -SYM_FUNC_END(serpent_ctr_8way_avx) - -SYM_FUNC_START(serpent_xts_enc_8way_avx) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, - RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask); - - call __serpent_enc_blk8_avx; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - FRAME_END - ret; -SYM_FUNC_END(serpent_xts_enc_8way_avx) - -SYM_FUNC_START(serpent_xts_dec_8way_avx) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, - RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask); - - call __serpent_dec_blk8_avx; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); - - FRAME_END - ret; -SYM_FUNC_END(serpent_xts_dec_8way_avx) diff --git a/arch/x86/crypto/serpent-avx.h b/arch/x86/crypto/serpent-avx.h new file mode 100644 index 000000000000..23f3361a0e72 --- /dev/null +++ b/arch/x86/crypto/serpent-avx.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_X86_SERPENT_AVX_H +#define ASM_X86_SERPENT_AVX_H + +#include <crypto/b128ops.h> +#include <crypto/serpent.h> +#include <linux/types.h> + +struct crypto_skcipher; + +#define SERPENT_PARALLEL_BLOCKS 8 + +asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst, + const u8 *src); +asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst, + const u8 *src); + +asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst, + const u8 *src); + +#endif diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S index c9648aeae705..8ea34c9b9316 100644 --- a/arch/x86/crypto/serpent-avx2-asm_64.S +++ b/arch/x86/crypto/serpent-avx2-asm_64.S @@ -20,16 +20,6 @@ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.section .rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16 -.align 16 -.Lxts_gf128mul_and_shl1_mask_0: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 - -.section .rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16 -.align 16 -.Lxts_gf128mul_and_shl1_mask_1: - .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 - .text #define CTX %rdi @@ -611,7 +601,7 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk16) write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); - ret; + RET; SYM_FUNC_END(__serpent_enc_blk16) .align 8 @@ -665,7 +655,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk16) write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2); write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); - ret; + RET; SYM_FUNC_END(__serpent_dec_blk16) SYM_FUNC_START(serpent_ecb_enc_16way) @@ -687,7 +677,7 @@ SYM_FUNC_START(serpent_ecb_enc_16way) vzeroupper; FRAME_END - ret; + RET; SYM_FUNC_END(serpent_ecb_enc_16way) 
SYM_FUNC_START(serpent_ecb_dec_16way) @@ -709,7 +699,7 @@ SYM_FUNC_START(serpent_ecb_dec_16way) vzeroupper; FRAME_END - ret; + RET; SYM_FUNC_END(serpent_ecb_dec_16way) SYM_FUNC_START(serpent_cbc_dec_16way) @@ -732,82 +722,5 @@ SYM_FUNC_START(serpent_cbc_dec_16way) vzeroupper; FRAME_END - ret; + RET; SYM_FUNC_END(serpent_cbc_dec_16way) - -SYM_FUNC_START(serpent_ctr_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - - vzeroupper; - - load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, - RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, - tp); - - call __serpent_enc_blk16; - - store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - vzeroupper; - - FRAME_END - ret; -SYM_FUNC_END(serpent_ctr_16way) - -SYM_FUNC_START(serpent_xts_enc_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - vzeroupper; - - load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, - RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, - .Lxts_gf128mul_and_shl1_mask_0, - .Lxts_gf128mul_and_shl1_mask_1); - - call __serpent_enc_blk16; - - store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - vzeroupper; - - FRAME_END - ret; -SYM_FUNC_END(serpent_xts_enc_16way) - -SYM_FUNC_START(serpent_xts_dec_16way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - vzeroupper; - - load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, - RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, - .Lxts_gf128mul_and_shl1_mask_0, - .Lxts_gf128mul_and_shl1_mask_1); - - call __serpent_dec_blk16; - - store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); - - vzeroupper; - - FRAME_END - ret; -SYM_FUNC_END(serpent_xts_dec_16way) diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S index 6379b99cb722..8ccb03ad7cef 100644 --- a/arch/x86/crypto/serpent-sse2-i586-asm_32.S +++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S @@ -553,12 +553,12 @@ SYM_FUNC_START(__serpent_enc_blk_4way) write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); - ret; + RET; .L__enc_xor4: xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); - ret; + RET; SYM_FUNC_END(__serpent_enc_blk_4way) SYM_FUNC_START(serpent_dec_blk_4way) @@ -612,5 +612,5 @@ SYM_FUNC_START(serpent_dec_blk_4way) movl arg_dst(%esp), %eax; write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA); - ret; + RET; SYM_FUNC_END(serpent_dec_blk_4way) diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S index efb6dc17dc90..e0998a011d1d 100644 --- a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S @@ -675,13 +675,13 @@ SYM_FUNC_START(__serpent_enc_blk_8way) write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); - ret; + RET; .L__enc_xor8: xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); - ret; + RET; SYM_FUNC_END(__serpent_enc_blk_8way) SYM_FUNC_START(serpent_dec_blk_8way) @@ -735,5 +735,5 @@ SYM_FUNC_START(serpent_dec_blk_8way) write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2); write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); - ret; + RET; SYM_FUNC_END(serpent_dec_blk_8way) diff --git 
a/arch/x86/crypto/serpent-sse2.h b/arch/x86/crypto/serpent-sse2.h new file mode 100644 index 000000000000..860ca248914b --- /dev/null +++ b/arch/x86/crypto/serpent-sse2.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_X86_SERPENT_SSE2_H +#define ASM_X86_SERPENT_SSE2_H + +#include <linux/crypto.h> +#include <crypto/serpent.h> + +#ifdef CONFIG_X86_32 + +#define SERPENT_PARALLEL_BLOCKS 4 + +asmlinkage void __serpent_enc_blk_4way(const struct serpent_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void serpent_dec_blk_4way(const struct serpent_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void serpent_enc_blk_xway(const void *ctx, u8 *dst, const u8 *src) +{ + __serpent_enc_blk_4way(ctx, dst, src, false); +} + +static inline void serpent_enc_blk_xway_xor(const struct serpent_ctx *ctx, + u8 *dst, const u8 *src) +{ + __serpent_enc_blk_4way(ctx, dst, src, true); +} + +static inline void serpent_dec_blk_xway(const void *ctx, u8 *dst, const u8 *src) +{ + serpent_dec_blk_4way(ctx, dst, src); +} + +#else + +#define SERPENT_PARALLEL_BLOCKS 8 + +asmlinkage void __serpent_enc_blk_8way(const struct serpent_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void serpent_dec_blk_8way(const struct serpent_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void serpent_enc_blk_xway(const void *ctx, u8 *dst, const u8 *src) +{ + __serpent_enc_blk_8way(ctx, dst, src, false); +} + +static inline void serpent_enc_blk_xway_xor(const struct serpent_ctx *ctx, + u8 *dst, const u8 *src) +{ + __serpent_enc_blk_8way(ctx, dst, src, true); +} + +static inline void serpent_dec_blk_xway(const void *ctx, u8 *dst, const u8 *src) +{ + serpent_dec_blk_8way(ctx, dst, src); +} + +#endif + +#endif diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index f973ace44ad3..347e97f4b713 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -12,9 +12,9 @@ #include <crypto/algapi.h> #include <crypto/internal/simd.h> #include <crypto/serpent.h> -#include <crypto/xts.h> -#include <asm/crypto/glue_helper.h> -#include <asm/crypto/serpent-avx.h> + +#include "serpent-avx.h" +#include "ecb_cbc_helpers.h" #define SERPENT_AVX2_PARALLEL_BLOCKS 16 @@ -23,158 +23,44 @@ asmlinkage void serpent_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void serpent_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void serpent_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void serpent_ctr_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -asmlinkage void serpent_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -asmlinkage void serpent_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); } -static const struct common_glue_ctx serpent_enc = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ecb = serpent_ecb_enc_16way } - }, { - .num_blocks = 8, - .fn_u = { .ecb = serpent_ecb_enc_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_encrypt } - } } -}; - -static const struct common_glue_ctx serpent_ctr = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ctr = serpent_ctr_16way } - }, { - .num_blocks = 8, - .fn_u = { .ctr = serpent_ctr_8way_avx } - }, { - .num_blocks = 1, - 
.fn_u = { .ctr = __serpent_crypt_ctr } - } } -}; - -static const struct common_glue_ctx serpent_enc_xts = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .xts = serpent_xts_enc_16way } - }, { - .num_blocks = 8, - .fn_u = { .xts = serpent_xts_enc_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .xts = serpent_xts_enc } - } } -}; - -static const struct common_glue_ctx serpent_dec = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .ecb = serpent_ecb_dec_16way } - }, { - .num_blocks = 8, - .fn_u = { .ecb = serpent_ecb_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_decrypt } - } } -}; - -static const struct common_glue_ctx serpent_dec_cbc = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .cbc = serpent_cbc_dec_16way } - }, { - .num_blocks = 8, - .fn_u = { .cbc = serpent_cbc_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .cbc = __serpent_decrypt } - } } -}; - -static const struct common_glue_ctx serpent_dec_xts = { - .num_funcs = 3, - .fpu_blocks_limit = 8, - - .funcs = { { - .num_blocks = 16, - .fn_u = { .xts = serpent_xts_dec_16way } - }, { - .num_blocks = 8, - .fn_u = { .xts = serpent_xts_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .xts = serpent_xts_dec } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_enc, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_enc_16way); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_enc_8way_avx); + ECB_BLOCK(1, __serpent_encrypt); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_dec, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_dec_16way); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_dec_8way_avx); + ECB_BLOCK(1, __serpent_decrypt); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(__serpent_encrypt); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&serpent_ctr, req); -} - -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&serpent_enc_xts, req, - __serpent_encrypt, &ctx->tweak_ctx, - &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&serpent_dec_xts, req, - __serpent_encrypt, &ctx->tweak_ctx, - &ctx->crypt_ctx, true); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_cbc_dec_16way); + CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_cbc_dec_8way_avx); + CBC_DEC_BLOCK(1, __serpent_decrypt); + CBC_WALK_END(); } static struct skcipher_alg serpent_algs[] = { @@ -205,41 +91,12 @@ static struct skcipher_alg serpent_algs[] = { .setkey = serpent_setkey_skcipher, .encrypt = cbc_encrypt, 
.decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(serpent)", - .base.cra_driver_name = "__ctr-serpent-avx2", - .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct serpent_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .chunksize = SERPENT_BLOCK_SIZE, - .setkey = serpent_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(serpent)", - .base.cra_driver_name = "__xts-serpent-avx2", - .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = SERPENT_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct serpent_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * SERPENT_MIN_KEY_SIZE, - .max_keysize = 2 * SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = xts_serpent_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, }, }; static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; -static int __init init(void) +static int __init serpent_avx2_init(void) { const char *feature_name; @@ -258,14 +115,14 @@ static int __init init(void) serpent_simd_algs); } -static void __exit fini(void) +static void __exit serpent_avx2_fini(void) { simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), serpent_simd_algs); } -module_init(init); -module_exit(fini); +module_init(serpent_avx2_init); +module_exit(serpent_avx2_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized"); diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 7806d1cbe854..6c248e1ea4ef 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -15,9 +15,9 @@ #include <crypto/algapi.h> #include <crypto/internal/simd.h> #include <crypto/serpent.h> -#include <crypto/xts.h> -#include <asm/crypto/glue_helper.h> -#include <asm/crypto/serpent-avx.h> + +#include "serpent-avx.h" +#include "ecb_cbc_helpers.h" /* 8-way parallel cipher functions */ asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst, @@ -32,191 +32,41 @@ asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx); -asmlinkage void serpent_ctr_8way_avx(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx); - -asmlinkage void serpent_xts_enc_8way_avx(const void *ctx, u8 *dst, - const u8 *src, le128 *iv); -EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx); - -asmlinkage void serpent_xts_dec_8way_avx(const void *ctx, u8 *dst, - const u8 *src, le128 *iv); -EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx); - -void __serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblk; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - le128_to_be128(&ctrblk, iv); - le128_inc(iv); - - __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); - u128_xor(dst, src, (u128 *)&ctrblk); -} -EXPORT_SYMBOL_GPL(__serpent_crypt_ctr); - -void serpent_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_encrypt); -} -EXPORT_SYMBOL_GPL(serpent_xts_enc); - -void serpent_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_decrypt); -} -EXPORT_SYMBOL_GPL(serpent_xts_dec); - static int serpent_setkey_skcipher(struct 
crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); } -int xts_serpent_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen) -{ - struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = xts_verify_key(tfm, key, keylen); - if (err) - return err; - - /* first half of xts-key is for crypt */ - err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); -} -EXPORT_SYMBOL_GPL(xts_serpent_setkey); - -static const struct common_glue_ctx serpent_enc = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = serpent_ecb_enc_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_encrypt } - } } -}; - -static const struct common_glue_ctx serpent_ctr = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ctr = serpent_ctr_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .ctr = __serpent_crypt_ctr } - } } -}; - -static const struct common_glue_ctx serpent_enc_xts = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .xts = serpent_xts_enc_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .xts = serpent_xts_enc } - } } -}; - -static const struct common_glue_ctx serpent_dec = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = serpent_ecb_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_decrypt } - } } -}; - -static const struct common_glue_ctx serpent_dec_cbc = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .cbc = serpent_cbc_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .cbc = __serpent_decrypt } - } } -}; - -static const struct common_glue_ctx serpent_dec_xts = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .xts = serpent_xts_dec_8way_avx } - }, { - .num_blocks = 1, - .fn_u = { .xts = serpent_xts_dec } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_enc, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_enc_8way_avx); + ECB_BLOCK(1, __serpent_encrypt); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_dec, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_dec_8way_avx); + ECB_BLOCK(1, __serpent_decrypt); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(__serpent_encrypt); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&serpent_ctr, req); -} - -static int xts_encrypt(struct skcipher_request *req) -{ - struct 
crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&serpent_enc_xts, req, - __serpent_encrypt, &ctx->tweak_ctx, - &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&serpent_dec_xts, req, - __serpent_encrypt, &ctx->tweak_ctx, - &ctx->crypt_ctx, true); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_cbc_dec_8way_avx); + CBC_DEC_BLOCK(1, __serpent_decrypt); + CBC_WALK_END(); } static struct skcipher_alg serpent_algs[] = { @@ -247,35 +97,6 @@ static struct skcipher_alg serpent_algs[] = { .setkey = serpent_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(serpent)", - .base.cra_driver_name = "__ctr-serpent-avx", - .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct serpent_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .chunksize = SERPENT_BLOCK_SIZE, - .setkey = serpent_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(serpent)", - .base.cra_driver_name = "__xts-serpent-avx", - .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = SERPENT_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct serpent_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * SERPENT_MIN_KEY_SIZE, - .max_keysize = 2 * SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = xts_serpent_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, }, }; diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 4fed8d26b91a..d78f37e9b2cf 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -10,8 +10,6 @@ * * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 
2007 - Joy Latten <latten@us.ibm.com> */ #include <linux/module.h> @@ -22,8 +20,9 @@ #include <crypto/b128ops.h> #include <crypto/internal/simd.h> #include <crypto/serpent.h> -#include <asm/crypto/serpent-sse2.h> -#include <asm/crypto/glue_helper.h> + +#include "serpent-sse2.h" +#include "ecb_cbc_helpers.h" static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -31,130 +30,46 @@ static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); } -static void serpent_decrypt_cbc_xway(const void *ctx, u8 *d, const u8 *s) -{ - u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - unsigned int j; - - for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) - ivs[j] = src[j]; - - serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); - - for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) - u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); -} - -static void serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblk; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - le128_to_be128(&ctrblk, iv); - le128_inc(iv); - - __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); - u128_xor(dst, src, (u128 *)&ctrblk); -} - -static void serpent_crypt_ctr_xway(const void *ctx, u8 *d, const u8 *s, - le128 *iv) +static void serpent_decrypt_cbc_xway(const void *ctx, u8 *dst, const u8 *src) { - be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - unsigned int i; - - for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { - if (dst != src) - dst[i] = src[i]; - - le128_to_be128(&ctrblks[i], iv); - le128_inc(iv); - } + u8 buf[SERPENT_PARALLEL_BLOCKS - 1][SERPENT_BLOCK_SIZE]; + const u8 *s = src; - serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); + if (dst == src) + s = memcpy(buf, src, sizeof(buf)); + serpent_dec_blk_xway(ctx, dst, src); + crypto_xor(dst + SERPENT_BLOCK_SIZE, s, sizeof(buf)); } -static const struct common_glue_ctx serpent_enc = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = serpent_enc_blk_xway } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_encrypt } - } } -}; - -static const struct common_glue_ctx serpent_ctr = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ctr = serpent_crypt_ctr_xway } - }, { - .num_blocks = 1, - .fn_u = { .ctr = serpent_crypt_ctr } - } } -}; - -static const struct common_glue_ctx serpent_dec = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = serpent_dec_blk_xway } - }, { - .num_blocks = 1, - .fn_u = { .ecb = __serpent_decrypt } - } } -}; - -static const struct common_glue_ctx serpent_dec_cbc = { - .num_funcs = 2, - .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .cbc = serpent_decrypt_cbc_xway } - }, { - .num_blocks = 1, - .fn_u = { .cbc = __serpent_decrypt } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_enc, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_enc_blk_xway); + ECB_BLOCK(1, __serpent_encrypt); + ECB_WALK_END(); } static int ecb_decrypt(struct 
skcipher_request *req) { - return glue_ecb_req_128bit(&serpent_dec, req); + ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_dec_blk_xway); + ECB_BLOCK(1, __serpent_decrypt); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(__serpent_encrypt, - req); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(__serpent_encrypt); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&serpent_ctr, req); + CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_decrypt_cbc_xway); + CBC_DEC_BLOCK(1, __serpent_decrypt); + CBC_WALK_END(); } static struct skcipher_alg serpent_algs[] = { @@ -185,21 +100,6 @@ static struct skcipher_alg serpent_algs[] = { .setkey = serpent_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(serpent)", - .base.cra_driver_name = "__ctr-serpent-sse2", - .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct serpent_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .chunksize = SERPENT_BLOCK_SIZE, - .setkey = serpent_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, }, }; diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S index 1e594d60afa5..a96b2fd26dab 100644 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -645,9 +645,9 @@ _loop3: RESERVE_STACK = (W_SIZE*4 + 8+24) /* Align stack */ - mov %rsp, %rbx + push %rbp + mov %rsp, %rbp and $~(0x20-1), %rsp - push %rbx sub $RESERVE_STACK, %rsp avx2_zeroupper @@ -665,8 +665,8 @@ _loop3: avx2_zeroupper - add $RESERVE_STACK, %rsp - pop %rsp + mov %rbp, %rsp + pop %rbp pop %r15 pop %r14 @@ -674,7 +674,7 @@ _loop3: pop %r12 pop %rbx - ret + RET SYM_FUNC_END(\name) .endm diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S index 11efe3a45a1f..2f94ec0e763b 100644 --- a/arch/x86/crypto/sha1_ni_asm.S +++ b/arch/x86/crypto/sha1_ni_asm.S @@ -59,8 +59,6 @@ #define DATA_PTR %rsi /* 2nd arg */ #define NUM_BLKS %rdx /* 3rd arg */ -#define RSPSAVE %rax - /* gcc conversion */ #define FRAME_SIZE 32 /* space for 2x16 bytes */ @@ -96,7 +94,8 @@ .text .align 32 SYM_FUNC_START(sha1_ni_transform) - mov %rsp, RSPSAVE + push %rbp + mov %rsp, %rbp sub $FRAME_SIZE, %rsp and $~0xF, %rsp @@ -288,9 +287,10 @@ SYM_FUNC_START(sha1_ni_transform) pextrd $3, E0, 1*16(DIGEST_PTR) .Ldone_hash: - mov RSPSAVE, %rsp + mov %rbp, %rsp + pop %rbp - ret + RET SYM_FUNC_END(sha1_ni_transform) .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S index 12e2d19d7402..263f916362e0 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -99,7 +99,7 @@ pop %rbp pop %r12 pop %rbx - ret + RET SYM_FUNC_END(\name) .endm @@ -467,8 +467,6 @@ W_PRECALC_SSSE3 */ SHA1_VECTOR_ASM sha1_transform_ssse3 -#ifdef CONFIG_AS_AVX - .macro W_PRECALC_AVX .purgem W_PRECALC_00_15 @@ -553,5 +551,3 @@ W_PRECALC_AVX * const u8 *data, int blocks); */ SHA1_VECTOR_ASM sha1_transform_avx - 
-#endif diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index d70b40ad594c..44340a1139e0 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -21,9 +21,8 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> -#include <linux/cryptohash.h> #include <linux/types.h> -#include <crypto/sha.h> +#include <crypto/sha1.h> #include <crypto/sha1_base.h> #include <asm/simd.h> @@ -114,7 +113,6 @@ static void unregister_sha1_ssse3(void) crypto_unregister_shash(&sha1_ssse3_alg); } -#ifdef CONFIG_AS_AVX asmlinkage void sha1_transform_avx(struct sha1_state *state, const u8 *data, int blocks); @@ -175,13 +173,6 @@ static void unregister_sha1_avx(void) crypto_unregister_shash(&sha1_avx_alg); } -#else /* CONFIG_AS_AVX */ -static inline int register_sha1_avx(void) { return 0; } -static inline void unregister_sha1_avx(void) { } -#endif /* CONFIG_AS_AVX */ - - -#if defined(CONFIG_AS_AVX2) && (CONFIG_AS_AVX) #define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ asmlinkage void sha1_transform_avx2(struct sha1_state *state, @@ -253,11 +244,6 @@ static void unregister_sha1_avx2(void) crypto_unregister_shash(&sha1_avx2_alg); } -#else -static inline int register_sha1_avx2(void) { return 0; } -static inline void unregister_sha1_avx2(void) { } -#endif - #ifdef CONFIG_AS_SHA1_NI asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data, int rounds); diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index fcbc30f58c38..3baa1ec39097 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -47,7 +47,6 @@ # This code schedules 1 block at a time, with 4 lanes per block ######################################################################## -#ifdef CONFIG_AS_AVX #include <linux/linkage.h> ## assume buffers not aligned @@ -459,7 +458,7 @@ done_hash: popq %r13 popq %r12 popq %rbx - ret + RET SYM_FUNC_END(sha256_transform_avx) .section .rodata.cst256.K256, "aM", @progbits, 256 @@ -498,5 +497,3 @@ _SHUF_00BA: # shuffle xDxC -> DC00 _SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF - -#endif diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 499d9ec129de..9bcdbc47b8b4 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -48,7 +48,6 @@ # This code schedules 2 blocks at a time, with 4 lanes per block ######################################################################## -#ifdef CONFIG_AS_AVX2 #include <linux/linkage.h> ## assume buffers not aligned @@ -118,15 +117,13 @@ _XMM_SAVE_SIZE = 0 _INP_END_SIZE = 8 _INP_SIZE = 8 _CTX_SIZE = 8 -_RSP_SIZE = 8 _XFER = 0 _XMM_SAVE = _XFER + _XFER_SIZE _INP_END = _XMM_SAVE + _XMM_SAVE_SIZE _INP = _INP_END + _INP_END_SIZE _CTX = _INP + _INP_SIZE -_RSP = _CTX + _CTX_SIZE -STACK_SIZE = _RSP + _RSP_SIZE +STACK_SIZE = _CTX + _CTX_SIZE # rotate_Xs # Rotate values of symbols X0...X3 @@ -534,11 +531,11 @@ SYM_FUNC_START(sha256_transform_rorx) pushq %r14 pushq %r15 - mov %rsp, %rax + push %rbp + mov %rsp, %rbp + subq $STACK_SIZE, %rsp and $-32, %rsp # align rsp to 32 byte boundary - mov %rax, _RSP(%rsp) - shl $6, NUM_BLKS # convert to bytes jz done_hash @@ -705,14 +702,15 @@ only_one_block: done_hash: - mov _RSP(%rsp), %rsp + mov %rbp, %rsp + pop %rbp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx - ret + RET SYM_FUNC_END(sha256_transform_rorx) .section .rodata.cst512.K256, "aM", @progbits, 512 @@ -767,5 +765,3 @@ _SHUF_00BA: .align 
32 _SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF - -#endif diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index ddfa863b4ee3..c4a5db612c32 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -472,7 +472,7 @@ done_hash: popq %r12 popq %rbx - ret + RET SYM_FUNC_END(sha256_transform_ssse3) .section .rodata.cst256.K256, "aM", @progbits, 256 diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S index 7abade04a3a3..94d50dd27cb5 100644 --- a/arch/x86/crypto/sha256_ni_asm.S +++ b/arch/x86/crypto/sha256_ni_asm.S @@ -326,7 +326,7 @@ SYM_FUNC_START(sha256_ni_transform) .Ldone_hash: - ret + RET SYM_FUNC_END(sha256_ni_transform) .section .rodata.cst256.K256, "aM", @progbits, 256 diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 03ad657c04bd..3a5f6be7dbba 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -34,9 +34,8 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> -#include <linux/cryptohash.h> #include <linux/types.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include <crypto/sha256_base.h> #include <linux/string.h> #include <asm/simd.h> @@ -144,7 +143,6 @@ static void unregister_sha256_ssse3(void) ARRAY_SIZE(sha256_ssse3_algs)); } -#ifdef CONFIG_AS_AVX asmlinkage void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks); @@ -221,12 +219,6 @@ static void unregister_sha256_avx(void) ARRAY_SIZE(sha256_avx_algs)); } -#else -static inline int register_sha256_avx(void) { return 0; } -static inline void unregister_sha256_avx(void) { } -#endif - -#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX) asmlinkage void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks); @@ -301,11 +293,6 @@ static void unregister_sha256_avx2(void) ARRAY_SIZE(sha256_avx2_algs)); } -#else -static inline int register_sha256_avx2(void) { return 0; } -static inline void unregister_sha256_avx2(void) { } -#endif - #ifdef CONFIG_AS_SHA256_NI asmlinkage void sha256_ni_transform(struct sha256_state *digest, const u8 *data, int rounds); diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S index 90ea945ba5e6..1fefe6dd3a9e 100644 --- a/arch/x86/crypto/sha512-avx-asm.S +++ b/arch/x86/crypto/sha512-avx-asm.S @@ -47,7 +47,6 @@ # ######################################################################## -#ifdef CONFIG_AS_AVX #include <linux/linkage.h> .text @@ -77,14 +76,10 @@ tmp0 = %rax W_SIZE = 80*8 # W[t] + K[t] | W[t+1] + K[t+1] WK_SIZE = 2*8 -RSPSAVE_SIZE = 1*8 -GPRSAVE_SIZE = 5*8 frame_W = 0 frame_WK = frame_W + W_SIZE -frame_RSPSAVE = frame_WK + WK_SIZE -frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE -frame_size = frame_GPRSAVE + GPRSAVE_SIZE +frame_size = frame_WK + WK_SIZE # Useful QWORD "arrays" for simpler memory references # MSG, DIGEST, K_t, W_t are arrays @@ -279,21 +274,21 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE # "blocks" is the message length in SHA512 blocks ######################################################################## SYM_FUNC_START(sha512_transform_avx) - cmp $0, msglen + test msglen, msglen je nowork + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + # Allocate Stack Space - mov %rsp, %rax + push %rbp + mov %rsp, %rbp sub $frame_size, %rsp and $~(0x20 - 1), %rsp - mov %rax, frame_RSPSAVE(%rsp) - - # Save GPRs - mov %rbx, frame_GPRSAVE(%rsp) - mov 
%r12, frame_GPRSAVE +8*1(%rsp) - mov %r13, frame_GPRSAVE +8*2(%rsp) - mov %r14, frame_GPRSAVE +8*3(%rsp) - mov %r15, frame_GPRSAVE +8*4(%rsp) updateblock: @@ -354,18 +349,19 @@ updateblock: dec msglen jnz updateblock - # Restore GPRs - mov frame_GPRSAVE(%rsp), %rbx - mov frame_GPRSAVE +8*1(%rsp), %r12 - mov frame_GPRSAVE +8*2(%rsp), %r13 - mov frame_GPRSAVE +8*3(%rsp), %r14 - mov frame_GPRSAVE +8*4(%rsp), %r15 - # Restore Stack Pointer - mov frame_RSPSAVE(%rsp), %rsp + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx nowork: - ret + RET SYM_FUNC_END(sha512_transform_avx) ######################################################################## @@ -424,4 +420,3 @@ K512: .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 -#endif diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index 3dd886b14e7d..5cdaab7d6901 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -49,7 +49,6 @@ # This code schedules 1 blocks at a time, with 4 lanes per block ######################################################################## -#ifdef CONFIG_AS_AVX2 #include <linux/linkage.h> .text @@ -103,17 +102,13 @@ SRND_SIZE = 1*8 INP_SIZE = 1*8 INPEND_SIZE = 1*8 CTX_SIZE = 1*8 -RSPSAVE_SIZE = 1*8 -GPRSAVE_SIZE = 5*8 frame_XFER = 0 frame_SRND = frame_XFER + XFER_SIZE frame_INP = frame_SRND + SRND_SIZE frame_INPEND = frame_INP + INP_SIZE frame_CTX = frame_INPEND + INPEND_SIZE -frame_RSPSAVE = frame_CTX + CTX_SIZE -frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE -frame_size = frame_GPRSAVE + GPRSAVE_SIZE +frame_size = frame_CTX + CTX_SIZE ## assume buffers not aligned #define VMOVDQ vmovdqu @@ -571,18 +566,18 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE # "blocks" is the message length in SHA512 blocks ######################################################################## SYM_FUNC_START(sha512_transform_rorx) + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + # Allocate Stack Space - mov %rsp, %rax + push %rbp + mov %rsp, %rbp sub $frame_size, %rsp and $~(0x20 - 1), %rsp - mov %rax, frame_RSPSAVE(%rsp) - - # Save GPRs - mov %rbx, 8*0+frame_GPRSAVE(%rsp) - mov %r12, 8*1+frame_GPRSAVE(%rsp) - mov %r13, 8*2+frame_GPRSAVE(%rsp) - mov %r14, 8*3+frame_GPRSAVE(%rsp) - mov %r15, 8*4+frame_GPRSAVE(%rsp) shl $7, NUM_BLKS # convert to bytes jz done_hash @@ -673,16 +668,18 @@ loop2: done_hash: -# Restore GPRs - mov 8*0+frame_GPRSAVE(%rsp), %rbx - mov 8*1+frame_GPRSAVE(%rsp), %r12 - mov 8*2+frame_GPRSAVE(%rsp), %r13 - mov 8*3+frame_GPRSAVE(%rsp), %r14 - mov 8*4+frame_GPRSAVE(%rsp), %r15 - # Restore Stack Pointer - mov frame_RSPSAVE(%rsp), %rsp - ret + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + + RET SYM_FUNC_END(sha512_transform_rorx) ######################################################################## @@ -749,5 +746,3 @@ PSHUFFLE_BYTE_FLIP_MASK: MASK_YMM_LO: .octa 0x00000000000000000000000000000000 .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF - -#endif diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S index 7946a1bee85b..b84c22e06c5f 100644 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ b/arch/x86/crypto/sha512-ssse3-asm.S @@ -74,14 +74,10 @@ tmp0 = %rax W_SIZE = 80*8 WK_SIZE = 2*8 -RSPSAVE_SIZE = 1*8 -GPRSAVE_SIZE = 5*8 frame_W = 0 frame_WK = frame_W + W_SIZE -frame_RSPSAVE = frame_WK + WK_SIZE -frame_GPRSAVE = 
frame_RSPSAVE + RSPSAVE_SIZE -frame_size = frame_GPRSAVE + GPRSAVE_SIZE +frame_size = frame_WK + WK_SIZE # Useful QWORD "arrays" for simpler memory references # MSG, DIGEST, K_t, W_t are arrays @@ -280,21 +276,21 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE ######################################################################## SYM_FUNC_START(sha512_transform_ssse3) - cmp $0, msglen + test msglen, msglen je nowork + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + # Allocate Stack Space - mov %rsp, %rax + push %rbp + mov %rsp, %rbp sub $frame_size, %rsp and $~(0x20 - 1), %rsp - mov %rax, frame_RSPSAVE(%rsp) - - # Save GPRs - mov %rbx, frame_GPRSAVE(%rsp) - mov %r12, frame_GPRSAVE +8*1(%rsp) - mov %r13, frame_GPRSAVE +8*2(%rsp) - mov %r14, frame_GPRSAVE +8*3(%rsp) - mov %r15, frame_GPRSAVE +8*4(%rsp) updateblock: @@ -355,18 +351,19 @@ updateblock: dec msglen jnz updateblock - # Restore GPRs - mov frame_GPRSAVE(%rsp), %rbx - mov frame_GPRSAVE +8*1(%rsp), %r12 - mov frame_GPRSAVE +8*2(%rsp), %r13 - mov frame_GPRSAVE +8*3(%rsp), %r14 - mov frame_GPRSAVE +8*4(%rsp), %r15 - # Restore Stack Pointer - mov frame_RSPSAVE(%rsp), %rsp + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx nowork: - ret + RET SYM_FUNC_END(sha512_transform_ssse3) ######################################################################## diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 1c444f41037c..6d3b85e53d0e 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -32,11 +32,11 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> -#include <linux/cryptohash.h> #include <linux/string.h> #include <linux/types.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include <crypto/sha512_base.h> +#include <asm/cpu_device_id.h> #include <asm/simd.h> asmlinkage void sha512_transform_ssse3(struct sha512_state *state, @@ -142,7 +142,6 @@ static void unregister_sha512_ssse3(void) ARRAY_SIZE(sha512_ssse3_algs)); } -#ifdef CONFIG_AS_AVX asmlinkage void sha512_transform_avx(struct sha512_state *state, const u8 *data, int blocks); static bool avx_usable(void) @@ -218,12 +217,7 @@ static void unregister_sha512_avx(void) crypto_unregister_shashes(sha512_avx_algs, ARRAY_SIZE(sha512_avx_algs)); } -#else -static inline int register_sha512_avx(void) { return 0; } -static inline void unregister_sha512_avx(void) { } -#endif -#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX) asmlinkage void sha512_transform_rorx(struct sha512_state *state, const u8 *data, int blocks); @@ -291,6 +285,13 @@ static int register_sha512_avx2(void) ARRAY_SIZE(sha512_avx2_algs)); return 0; } +static const struct x86_cpu_id module_cpu_ids[] = { + X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), + X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), + X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); static void unregister_sha512_avx2(void) { @@ -298,13 +299,11 @@ static void unregister_sha512_avx2(void) crypto_unregister_shashes(sha512_avx2_algs, ARRAY_SIZE(sha512_avx2_algs)); } -#else -static inline int register_sha512_avx2(void) { return 0; } -static inline void unregister_sha512_avx2(void) { } -#endif static int __init sha512_ssse3_mod_init(void) { + if (!x86_match_cpu(module_cpu_ids)) + return -ENODEV; if (register_sha512_ssse3()) goto fail; diff --git a/arch/x86/crypto/sm3-avx-asm_64.S b/arch/x86/crypto/sm3-avx-asm_64.S new file mode 100644 index 
000000000000..b12b9efb5ec5 --- /dev/null +++ b/arch/x86/crypto/sm3-avx-asm_64.S @@ -0,0 +1,517 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM3 AVX accelerated transform. + * specified in: https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02 + * + * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * Copyright (C) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + */ + +/* Based on SM3 AES/BMI2 accelerated work by libgcrypt at: + * https://gnupg.org/software/libgcrypt/index.html + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +/* Context structure */ + +#define state_h0 0 +#define state_h1 4 +#define state_h2 8 +#define state_h3 12 +#define state_h4 16 +#define state_h5 20 +#define state_h6 24 +#define state_h7 28 + +/* Constants */ + +/* Round constant macros */ + +#define K0 2043430169 /* 0x79cc4519 */ +#define K1 -208106958 /* 0xf3988a32 */ +#define K2 -416213915 /* 0xe7311465 */ +#define K3 -832427829 /* 0xce6228cb */ +#define K4 -1664855657 /* 0x9cc45197 */ +#define K5 965255983 /* 0x3988a32f */ +#define K6 1930511966 /* 0x7311465e */ +#define K7 -433943364 /* 0xe6228cbc */ +#define K8 -867886727 /* 0xcc451979 */ +#define K9 -1735773453 /* 0x988a32f3 */ +#define K10 823420391 /* 0x311465e7 */ +#define K11 1646840782 /* 0x6228cbce */ +#define K12 -1001285732 /* 0xc451979c */ +#define K13 -2002571463 /* 0x88a32f39 */ +#define K14 289824371 /* 0x11465e73 */ +#define K15 579648742 /* 0x228cbce6 */ +#define K16 -1651869049 /* 0x9d8a7a87 */ +#define K17 991229199 /* 0x3b14f50f */ +#define K18 1982458398 /* 0x7629ea1e */ +#define K19 -330050500 /* 0xec53d43c */ +#define K20 -660100999 /* 0xd8a7a879 */ +#define K21 -1320201997 /* 0xb14f50f3 */ +#define K22 1654563303 /* 0x629ea1e7 */ +#define K23 -985840690 /* 0xc53d43ce */ +#define K24 -1971681379 /* 0x8a7a879d */ +#define K25 351604539 /* 0x14f50f3b */ +#define K26 703209078 /* 0x29ea1e76 */ +#define K27 1406418156 /* 0x53d43cec */ +#define K28 -1482130984 /* 0xa7a879d8 */ +#define K29 1330705329 /* 0x4f50f3b1 */ +#define K30 -1633556638 /* 0x9ea1e762 */ +#define K31 1027854021 /* 0x3d43cec5 */ +#define K32 2055708042 /* 0x7a879d8a */ +#define K33 -183551212 /* 0xf50f3b14 */ +#define K34 -367102423 /* 0xea1e7629 */ +#define K35 -734204845 /* 0xd43cec53 */ +#define K36 -1468409689 /* 0xa879d8a7 */ +#define K37 1358147919 /* 0x50f3b14f */ +#define K38 -1578671458 /* 0xa1e7629e */ +#define K39 1137624381 /* 0x43cec53d */ +#define K40 -2019718534 /* 0x879d8a7a */ +#define K41 255530229 /* 0x0f3b14f5 */ +#define K42 511060458 /* 0x1e7629ea */ +#define K43 1022120916 /* 0x3cec53d4 */ +#define K44 2044241832 /* 0x79d8a7a8 */ +#define K45 -206483632 /* 0xf3b14f50 */ +#define K46 -412967263 /* 0xe7629ea1 */ +#define K47 -825934525 /* 0xcec53d43 */ +#define K48 -1651869049 /* 0x9d8a7a87 */ +#define K49 991229199 /* 0x3b14f50f */ +#define K50 1982458398 /* 0x7629ea1e */ +#define K51 -330050500 /* 0xec53d43c */ +#define K52 -660100999 /* 0xd8a7a879 */ +#define K53 -1320201997 /* 0xb14f50f3 */ +#define K54 1654563303 /* 0x629ea1e7 */ +#define K55 -985840690 /* 0xc53d43ce */ +#define K56 -1971681379 /* 0x8a7a879d */ +#define K57 351604539 /* 0x14f50f3b */ +#define K58 703209078 /* 0x29ea1e76 */ +#define K59 1406418156 /* 0x53d43cec */ +#define K60 -1482130984 /* 0xa7a879d8 */ +#define K61 1330705329 /* 0x4f50f3b1 */ +#define K62 -1633556638 /* 0x9ea1e762 */ +#define K63 1027854021 /* 0x3d43cec5 */ + +/* Register macros */ + +#define RSTATE %rdi +#define RDATA %rsi +#define RNBLKS %rdx + +#define t0 %eax 
+#define t1 %ebx +#define t2 %ecx + +#define a %r8d +#define b %r9d +#define c %r10d +#define d %r11d +#define e %r12d +#define f %r13d +#define g %r14d +#define h %r15d + +#define W0 %xmm0 +#define W1 %xmm1 +#define W2 %xmm2 +#define W3 %xmm3 +#define W4 %xmm4 +#define W5 %xmm5 + +#define XTMP0 %xmm6 +#define XTMP1 %xmm7 +#define XTMP2 %xmm8 +#define XTMP3 %xmm9 +#define XTMP4 %xmm10 +#define XTMP5 %xmm11 +#define XTMP6 %xmm12 + +#define BSWAP_REG %xmm15 + +/* Stack structure */ + +#define STACK_W_SIZE (32 * 2 * 3) +#define STACK_REG_SAVE_SIZE (64) + +#define STACK_W (0) +#define STACK_REG_SAVE (STACK_W + STACK_W_SIZE) +#define STACK_SIZE (STACK_REG_SAVE + STACK_REG_SAVE_SIZE) + +/* Instruction helpers. */ + +#define roll2(v, reg) \ + roll $(v), reg; + +#define roll3mov(v, src, dst) \ + movl src, dst; \ + roll $(v), dst; + +#define roll3(v, src, dst) \ + rorxl $(32-(v)), src, dst; + +#define addl2(a, out) \ + leal (a, out), out; + +/* Round function macros. */ + +#define GG1(x, y, z, o, t) \ + movl x, o; \ + xorl y, o; \ + xorl z, o; + +#define FF1(x, y, z, o, t) GG1(x, y, z, o, t) + +#define GG2(x, y, z, o, t) \ + andnl z, x, o; \ + movl y, t; \ + andl x, t; \ + addl2(t, o); + +#define FF2(x, y, z, o, t) \ + movl y, o; \ + xorl x, o; \ + movl y, t; \ + andl x, t; \ + andl z, o; \ + xorl t, o; + +#define R(i, a, b, c, d, e, f, g, h, round, widx, wtype) \ + /* rol(a, 12) => t0 */ \ + roll3mov(12, a, t0); /* rorxl here would reduce perf by 6% on zen3 */ \ + /* rol (t0 + e + t), 7) => t1 */ \ + leal K##round(t0, e, 1), t1; \ + roll2(7, t1); \ + /* h + w1 => h */ \ + addl wtype##_W1_ADDR(round, widx), h; \ + /* h + t1 => h */ \ + addl2(t1, h); \ + /* t1 ^ t0 => t0 */ \ + xorl t1, t0; \ + /* w1w2 + d => d */ \ + addl wtype##_W1W2_ADDR(round, widx), d; \ + /* FF##i(a,b,c) => t1 */ \ + FF##i(a, b, c, t1, t2); \ + /* d + t1 => d */ \ + addl2(t1, d); \ + /* GG#i(e,f,g) => t2 */ \ + GG##i(e, f, g, t2, t1); \ + /* h + t2 => h */ \ + addl2(t2, h); \ + /* rol (f, 19) => f */ \ + roll2(19, f); \ + /* d + t0 => d */ \ + addl2(t0, d); \ + /* rol (b, 9) => b */ \ + roll2(9, b); \ + /* P0(h) => h */ \ + roll3(9, h, t2); \ + roll3(17, h, t1); \ + xorl t2, h; \ + xorl t1, h; + +#define R1(a, b, c, d, e, f, g, h, round, widx, wtype) \ + R(1, a, b, c, d, e, f, g, h, round, widx, wtype) + +#define R2(a, b, c, d, e, f, g, h, round, widx, wtype) \ + R(2, a, b, c, d, e, f, g, h, round, widx, wtype) + +/* Input expansion macros. */ + +/* Byte-swapped input address. */ +#define IW_W_ADDR(round, widx, offs) \ + (STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))(%rsp) + +/* Expanded input address. */ +#define XW_W_ADDR(round, widx, offs) \ + (STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))(%rsp) + +/* Rounds 1-12, byte-swapped input block addresses. */ +#define IW_W1_ADDR(round, widx) IW_W_ADDR(round, widx, 0) +#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 32) + +/* Rounds 1-12, expanded input block addresses. */ +#define XW_W1_ADDR(round, widx) XW_W_ADDR(round, widx, 0) +#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 32) + +/* Input block loading. 
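+ *
+ * LOAD_W_XMM_1/2 byte-swap one 64-byte block via BSWAP_REG and spill
+ * W_j and W_j ^ W_{j+4} for rounds 0-11 to the stack; LOAD_W_XMM_3 then
+ * sets up the W0-W5 sliding window consumed by the message schedule
+ * below.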
*/ +#define LOAD_W_XMM_1() \ + vmovdqu 0*16(RDATA), XTMP0; /* XTMP0: w3, w2, w1, w0 */ \ + vmovdqu 1*16(RDATA), XTMP1; /* XTMP1: w7, w6, w5, w4 */ \ + vmovdqu 2*16(RDATA), XTMP2; /* XTMP2: w11, w10, w9, w8 */ \ + vmovdqu 3*16(RDATA), XTMP3; /* XTMP3: w15, w14, w13, w12 */ \ + vpshufb BSWAP_REG, XTMP0, XTMP0; \ + vpshufb BSWAP_REG, XTMP1, XTMP1; \ + vpshufb BSWAP_REG, XTMP2, XTMP2; \ + vpshufb BSWAP_REG, XTMP3, XTMP3; \ + vpxor XTMP0, XTMP1, XTMP4; \ + vpxor XTMP1, XTMP2, XTMP5; \ + vpxor XTMP2, XTMP3, XTMP6; \ + leaq 64(RDATA), RDATA; \ + vmovdqa XTMP0, IW_W1_ADDR(0, 0); \ + vmovdqa XTMP4, IW_W1W2_ADDR(0, 0); \ + vmovdqa XTMP1, IW_W1_ADDR(4, 0); \ + vmovdqa XTMP5, IW_W1W2_ADDR(4, 0); + +#define LOAD_W_XMM_2() \ + vmovdqa XTMP2, IW_W1_ADDR(8, 0); \ + vmovdqa XTMP6, IW_W1W2_ADDR(8, 0); + +#define LOAD_W_XMM_3() \ + vpshufd $0b00000000, XTMP0, W0; /* W0: xx, w0, xx, xx */ \ + vpshufd $0b11111001, XTMP0, W1; /* W1: xx, w3, w2, w1 */ \ + vmovdqa XTMP1, W2; /* W2: xx, w6, w5, w4 */ \ + vpalignr $12, XTMP1, XTMP2, W3; /* W3: xx, w9, w8, w7 */ \ + vpalignr $8, XTMP2, XTMP3, W4; /* W4: xx, w12, w11, w10 */ \ + vpshufd $0b11111001, XTMP3, W5; /* W5: xx, w15, w14, w13 */ + +/* Message scheduling. Note: 3 words per XMM register. */ +#define SCHED_W_0(round, w0, w1, w2, w3, w4, w5) \ + /* Load (w[i - 16]) => XTMP0 */ \ + vpshufd $0b10111111, w0, XTMP0; \ + vpalignr $12, XTMP0, w1, XTMP0; /* XTMP0: xx, w2, w1, w0 */ \ + /* Load (w[i - 13]) => XTMP1 */ \ + vpshufd $0b10111111, w1, XTMP1; \ + vpalignr $12, XTMP1, w2, XTMP1; \ + /* w[i - 9] == w3 */ \ + /* XMM3 ^ XTMP0 => XTMP0 */ \ + vpxor w3, XTMP0, XTMP0; + +#define SCHED_W_1(round, w0, w1, w2, w3, w4, w5) \ + /* w[i - 3] == w5 */ \ + /* rol(XMM5, 15) ^ XTMP0 => XTMP0 */ \ + vpslld $15, w5, XTMP2; \ + vpsrld $(32-15), w5, XTMP3; \ + vpxor XTMP2, XTMP3, XTMP3; \ + vpxor XTMP3, XTMP0, XTMP0; \ + /* rol(XTMP1, 7) => XTMP1 */ \ + vpslld $7, XTMP1, XTMP5; \ + vpsrld $(32-7), XTMP1, XTMP1; \ + vpxor XTMP5, XTMP1, XTMP1; \ + /* XMM4 ^ XTMP1 => XTMP1 */ \ + vpxor w4, XTMP1, XTMP1; \ + /* w[i - 6] == XMM4 */ \ + /* P1(XTMP0) ^ XTMP1 => XMM0 */ \ + vpslld $15, XTMP0, XTMP5; \ + vpsrld $(32-15), XTMP0, XTMP6; \ + vpslld $23, XTMP0, XTMP2; \ + vpsrld $(32-23), XTMP0, XTMP3; \ + vpxor XTMP0, XTMP1, XTMP1; \ + vpxor XTMP6, XTMP5, XTMP5; \ + vpxor XTMP3, XTMP2, XTMP2; \ + vpxor XTMP2, XTMP5, XTMP5; \ + vpxor XTMP5, XTMP1, w0; + +#define SCHED_W_2(round, w0, w1, w2, w3, w4, w5) \ + /* W1 in XMM12 */ \ + vpshufd $0b10111111, w4, XTMP4; \ + vpalignr $12, XTMP4, w5, XTMP4; \ + vmovdqa XTMP4, XW_W1_ADDR((round), 0); \ + /* W1 ^ W2 => XTMP1 */ \ + vpxor w0, XTMP4, XTMP1; \ + vmovdqa XTMP1, XW_W1W2_ADDR((round), 0); + + +.section .rodata.cst16, "aM", @progbits, 16 +.align 16 + +.Lbe32mask: + .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f + +.text + +/* + * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA. 
+ * + * void sm3_transform_avx(struct sm3_state *state, + * const u8 *data, int nblocks); + */ +.align 16 +SYM_FUNC_START(sm3_transform_avx) + /* input: + * %rdi: ctx, CTX + * %rsi: data (64*nblks bytes) + * %rdx: nblocks + */ + vzeroupper; + + pushq %rbp; + movq %rsp, %rbp; + + movq %rdx, RNBLKS; + + subq $STACK_SIZE, %rsp; + andq $(~63), %rsp; + + movq %rbx, (STACK_REG_SAVE + 0 * 8)(%rsp); + movq %r15, (STACK_REG_SAVE + 1 * 8)(%rsp); + movq %r14, (STACK_REG_SAVE + 2 * 8)(%rsp); + movq %r13, (STACK_REG_SAVE + 3 * 8)(%rsp); + movq %r12, (STACK_REG_SAVE + 4 * 8)(%rsp); + + vmovdqa .Lbe32mask (%rip), BSWAP_REG; + + /* Get the values of the chaining variables. */ + movl state_h0(RSTATE), a; + movl state_h1(RSTATE), b; + movl state_h2(RSTATE), c; + movl state_h3(RSTATE), d; + movl state_h4(RSTATE), e; + movl state_h5(RSTATE), f; + movl state_h6(RSTATE), g; + movl state_h7(RSTATE), h; + +.align 16 +.Loop: + /* Load data part1. */ + LOAD_W_XMM_1(); + + leaq -1(RNBLKS), RNBLKS; + + /* Transform 0-3 + Load data part2. */ + R1(a, b, c, d, e, f, g, h, 0, 0, IW); LOAD_W_XMM_2(); + R1(d, a, b, c, h, e, f, g, 1, 1, IW); + R1(c, d, a, b, g, h, e, f, 2, 2, IW); + R1(b, c, d, a, f, g, h, e, 3, 3, IW); LOAD_W_XMM_3(); + + /* Transform 4-7 + Precalc 12-14. */ + R1(a, b, c, d, e, f, g, h, 4, 0, IW); + R1(d, a, b, c, h, e, f, g, 5, 1, IW); + R1(c, d, a, b, g, h, e, f, 6, 2, IW); SCHED_W_0(12, W0, W1, W2, W3, W4, W5); + R1(b, c, d, a, f, g, h, e, 7, 3, IW); SCHED_W_1(12, W0, W1, W2, W3, W4, W5); + + /* Transform 8-11 + Precalc 12-17. */ + R1(a, b, c, d, e, f, g, h, 8, 0, IW); SCHED_W_2(12, W0, W1, W2, W3, W4, W5); + R1(d, a, b, c, h, e, f, g, 9, 1, IW); SCHED_W_0(15, W1, W2, W3, W4, W5, W0); + R1(c, d, a, b, g, h, e, f, 10, 2, IW); SCHED_W_1(15, W1, W2, W3, W4, W5, W0); + R1(b, c, d, a, f, g, h, e, 11, 3, IW); SCHED_W_2(15, W1, W2, W3, W4, W5, W0); + + /* Transform 12-14 + Precalc 18-20 */ + R1(a, b, c, d, e, f, g, h, 12, 0, XW); SCHED_W_0(18, W2, W3, W4, W5, W0, W1); + R1(d, a, b, c, h, e, f, g, 13, 1, XW); SCHED_W_1(18, W2, W3, W4, W5, W0, W1); + R1(c, d, a, b, g, h, e, f, 14, 2, XW); SCHED_W_2(18, W2, W3, W4, W5, W0, W1); + + /* Transform 15-17 + Precalc 21-23 */ + R1(b, c, d, a, f, g, h, e, 15, 0, XW); SCHED_W_0(21, W3, W4, W5, W0, W1, W2); + R2(a, b, c, d, e, f, g, h, 16, 1, XW); SCHED_W_1(21, W3, W4, W5, W0, W1, W2); + R2(d, a, b, c, h, e, f, g, 17, 2, XW); SCHED_W_2(21, W3, W4, W5, W0, W1, W2); + + /* Transform 18-20 + Precalc 24-26 */ + R2(c, d, a, b, g, h, e, f, 18, 0, XW); SCHED_W_0(24, W4, W5, W0, W1, W2, W3); + R2(b, c, d, a, f, g, h, e, 19, 1, XW); SCHED_W_1(24, W4, W5, W0, W1, W2, W3); + R2(a, b, c, d, e, f, g, h, 20, 2, XW); SCHED_W_2(24, W4, W5, W0, W1, W2, W3); + + /* Transform 21-23 + Precalc 27-29 */ + R2(d, a, b, c, h, e, f, g, 21, 0, XW); SCHED_W_0(27, W5, W0, W1, W2, W3, W4); + R2(c, d, a, b, g, h, e, f, 22, 1, XW); SCHED_W_1(27, W5, W0, W1, W2, W3, W4); + R2(b, c, d, a, f, g, h, e, 23, 2, XW); SCHED_W_2(27, W5, W0, W1, W2, W3, W4); + + /* Transform 24-26 + Precalc 30-32 */ + R2(a, b, c, d, e, f, g, h, 24, 0, XW); SCHED_W_0(30, W0, W1, W2, W3, W4, W5); + R2(d, a, b, c, h, e, f, g, 25, 1, XW); SCHED_W_1(30, W0, W1, W2, W3, W4, W5); + R2(c, d, a, b, g, h, e, f, 26, 2, XW); SCHED_W_2(30, W0, W1, W2, W3, W4, W5); + + /* Transform 27-29 + Precalc 33-35 */ + R2(b, c, d, a, f, g, h, e, 27, 0, XW); SCHED_W_0(33, W1, W2, W3, W4, W5, W0); + R2(a, b, c, d, e, f, g, h, 28, 1, XW); SCHED_W_1(33, W1, W2, W3, W4, W5, W0); + R2(d, a, b, c, h, e, f, g, 29, 2, XW); SCHED_W_2(33, W1, W2, W3, W4, W5, W0); + 
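As a reading aid for the interleaved rounds above and below: each R1/R2 invocation has the net effect of one textbook SM3 compression round, with the A..H renaming done through the macro arguments rather than register moves. The following is only a rough C sketch of the R2 rounds (16-63), not code from this patch; sm3_round_sketch() is an illustrative name, rol32() is the kernel helper from <linux/bitops.h>, and k is the pre-rotated round constant (the K0..K63 defines near the top of this file).

	#include <linux/bitops.h>	/* rol32() */
	#include <linux/types.h>

	/* s[0..7] = A..H, w = W[j], wp = W[j] ^ W[j+4], k = pre-rotated K[j].
	 * Rounds 0-15 (R1) differ only in that FF and GG are both x ^ y ^ z. */
	static void sm3_round_sketch(u32 s[8], u32 w, u32 wp, u32 k)
	{
		u32 a = s[0], b = s[1], c = s[2], d = s[3];
		u32 e = s[4], f = s[5], g = s[6], h = s[7];
		u32 ss1 = rol32(rol32(a, 12) + e + k, 7);
		u32 ss2 = ss1 ^ rol32(a, 12);
		u32 tt1 = ((a & b) | (a & c) | (b & c)) + d + ss2 + wp;	/* FF, j >= 16 */
		u32 tt2 = ((e & f) | (~e & g)) + h + ss1 + w;		/* GG, j >= 16 */

		s[3] = c;
		s[2] = rol32(b, 9);
		s[1] = a;
		s[0] = tt1;
		s[7] = g;
		s[6] = rol32(f, 19);
		s[5] = e;
		s[4] = tt2 ^ rol32(tt2, 9) ^ rol32(tt2, 17);	/* P0(tt2) */
	}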
+ /* Transform 30-32 + Precalc 36-38 */ + R2(c, d, a, b, g, h, e, f, 30, 0, XW); SCHED_W_0(36, W2, W3, W4, W5, W0, W1); + R2(b, c, d, a, f, g, h, e, 31, 1, XW); SCHED_W_1(36, W2, W3, W4, W5, W0, W1); + R2(a, b, c, d, e, f, g, h, 32, 2, XW); SCHED_W_2(36, W2, W3, W4, W5, W0, W1); + + /* Transform 33-35 + Precalc 39-41 */ + R2(d, a, b, c, h, e, f, g, 33, 0, XW); SCHED_W_0(39, W3, W4, W5, W0, W1, W2); + R2(c, d, a, b, g, h, e, f, 34, 1, XW); SCHED_W_1(39, W3, W4, W5, W0, W1, W2); + R2(b, c, d, a, f, g, h, e, 35, 2, XW); SCHED_W_2(39, W3, W4, W5, W0, W1, W2); + + /* Transform 36-38 + Precalc 42-44 */ + R2(a, b, c, d, e, f, g, h, 36, 0, XW); SCHED_W_0(42, W4, W5, W0, W1, W2, W3); + R2(d, a, b, c, h, e, f, g, 37, 1, XW); SCHED_W_1(42, W4, W5, W0, W1, W2, W3); + R2(c, d, a, b, g, h, e, f, 38, 2, XW); SCHED_W_2(42, W4, W5, W0, W1, W2, W3); + + /* Transform 39-41 + Precalc 45-47 */ + R2(b, c, d, a, f, g, h, e, 39, 0, XW); SCHED_W_0(45, W5, W0, W1, W2, W3, W4); + R2(a, b, c, d, e, f, g, h, 40, 1, XW); SCHED_W_1(45, W5, W0, W1, W2, W3, W4); + R2(d, a, b, c, h, e, f, g, 41, 2, XW); SCHED_W_2(45, W5, W0, W1, W2, W3, W4); + + /* Transform 42-44 + Precalc 48-50 */ + R2(c, d, a, b, g, h, e, f, 42, 0, XW); SCHED_W_0(48, W0, W1, W2, W3, W4, W5); + R2(b, c, d, a, f, g, h, e, 43, 1, XW); SCHED_W_1(48, W0, W1, W2, W3, W4, W5); + R2(a, b, c, d, e, f, g, h, 44, 2, XW); SCHED_W_2(48, W0, W1, W2, W3, W4, W5); + + /* Transform 45-47 + Precalc 51-53 */ + R2(d, a, b, c, h, e, f, g, 45, 0, XW); SCHED_W_0(51, W1, W2, W3, W4, W5, W0); + R2(c, d, a, b, g, h, e, f, 46, 1, XW); SCHED_W_1(51, W1, W2, W3, W4, W5, W0); + R2(b, c, d, a, f, g, h, e, 47, 2, XW); SCHED_W_2(51, W1, W2, W3, W4, W5, W0); + + /* Transform 48-50 + Precalc 54-56 */ + R2(a, b, c, d, e, f, g, h, 48, 0, XW); SCHED_W_0(54, W2, W3, W4, W5, W0, W1); + R2(d, a, b, c, h, e, f, g, 49, 1, XW); SCHED_W_1(54, W2, W3, W4, W5, W0, W1); + R2(c, d, a, b, g, h, e, f, 50, 2, XW); SCHED_W_2(54, W2, W3, W4, W5, W0, W1); + + /* Transform 51-53 + Precalc 57-59 */ + R2(b, c, d, a, f, g, h, e, 51, 0, XW); SCHED_W_0(57, W3, W4, W5, W0, W1, W2); + R2(a, b, c, d, e, f, g, h, 52, 1, XW); SCHED_W_1(57, W3, W4, W5, W0, W1, W2); + R2(d, a, b, c, h, e, f, g, 53, 2, XW); SCHED_W_2(57, W3, W4, W5, W0, W1, W2); + + /* Transform 54-56 + Precalc 60-62 */ + R2(c, d, a, b, g, h, e, f, 54, 0, XW); SCHED_W_0(60, W4, W5, W0, W1, W2, W3); + R2(b, c, d, a, f, g, h, e, 55, 1, XW); SCHED_W_1(60, W4, W5, W0, W1, W2, W3); + R2(a, b, c, d, e, f, g, h, 56, 2, XW); SCHED_W_2(60, W4, W5, W0, W1, W2, W3); + + /* Transform 57-59 + Precalc 63 */ + R2(d, a, b, c, h, e, f, g, 57, 0, XW); SCHED_W_0(63, W5, W0, W1, W2, W3, W4); + R2(c, d, a, b, g, h, e, f, 58, 1, XW); + R2(b, c, d, a, f, g, h, e, 59, 2, XW); SCHED_W_1(63, W5, W0, W1, W2, W3, W4); + + /* Transform 60-62 + Precalc 63 */ + R2(a, b, c, d, e, f, g, h, 60, 0, XW); + R2(d, a, b, c, h, e, f, g, 61, 1, XW); SCHED_W_2(63, W5, W0, W1, W2, W3, W4); + R2(c, d, a, b, g, h, e, f, 62, 2, XW); + + /* Transform 63 */ + R2(b, c, d, a, f, g, h, e, 63, 0, XW); + + /* Update the chaining variables. 
*/ + xorl state_h0(RSTATE), a; + xorl state_h1(RSTATE), b; + xorl state_h2(RSTATE), c; + xorl state_h3(RSTATE), d; + movl a, state_h0(RSTATE); + movl b, state_h1(RSTATE); + movl c, state_h2(RSTATE); + movl d, state_h3(RSTATE); + xorl state_h4(RSTATE), e; + xorl state_h5(RSTATE), f; + xorl state_h6(RSTATE), g; + xorl state_h7(RSTATE), h; + movl e, state_h4(RSTATE); + movl f, state_h5(RSTATE); + movl g, state_h6(RSTATE); + movl h, state_h7(RSTATE); + + cmpq $0, RNBLKS; + jne .Loop; + + vzeroall; + + movq (STACK_REG_SAVE + 0 * 8)(%rsp), %rbx; + movq (STACK_REG_SAVE + 1 * 8)(%rsp), %r15; + movq (STACK_REG_SAVE + 2 * 8)(%rsp), %r14; + movq (STACK_REG_SAVE + 3 * 8)(%rsp), %r13; + movq (STACK_REG_SAVE + 4 * 8)(%rsp), %r12; + + vmovdqa %xmm0, IW_W1_ADDR(0, 0); + vmovdqa %xmm0, IW_W1W2_ADDR(0, 0); + vmovdqa %xmm0, IW_W1_ADDR(4, 0); + vmovdqa %xmm0, IW_W1W2_ADDR(4, 0); + vmovdqa %xmm0, IW_W1_ADDR(8, 0); + vmovdqa %xmm0, IW_W1W2_ADDR(8, 0); + + movq %rbp, %rsp; + popq %rbp; + RET; +SYM_FUNC_END(sm3_transform_avx) diff --git a/arch/x86/crypto/sm3_avx_glue.c b/arch/x86/crypto/sm3_avx_glue.c new file mode 100644 index 000000000000..661b6f22ffcd --- /dev/null +++ b/arch/x86/crypto/sm3_avx_glue.c @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM3 Secure Hash Algorithm, AVX assembler accelerated. + * specified in: https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02 + * + * Copyright (C) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/types.h> +#include <crypto/sm3.h> +#include <crypto/sm3_base.h> +#include <asm/simd.h> + +asmlinkage void sm3_transform_avx(struct sm3_state *state, + const u8 *data, int nblocks); + +static int sm3_avx_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sm3_state *sctx = shash_desc_ctx(desc); + + if (!crypto_simd_usable() || + (sctx->count % SM3_BLOCK_SIZE) + len < SM3_BLOCK_SIZE) { + sm3_update(sctx, data, len); + return 0; + } + + /* + * Make sure struct sm3_state begins directly with the SM3 + * 256-bit internal state, as this is what the asm functions expect. 
+ */ + BUILD_BUG_ON(offsetof(struct sm3_state, state) != 0); + + kernel_fpu_begin(); + sm3_base_do_update(desc, data, len, sm3_transform_avx); + kernel_fpu_end(); + + return 0; +} + +static int sm3_avx_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (!crypto_simd_usable()) { + struct sm3_state *sctx = shash_desc_ctx(desc); + + if (len) + sm3_update(sctx, data, len); + + sm3_final(sctx, out); + return 0; + } + + kernel_fpu_begin(); + if (len) + sm3_base_do_update(desc, data, len, sm3_transform_avx); + sm3_base_do_finalize(desc, sm3_transform_avx); + kernel_fpu_end(); + + return sm3_base_finish(desc, out); +} + +static int sm3_avx_final(struct shash_desc *desc, u8 *out) +{ + if (!crypto_simd_usable()) { + sm3_final(shash_desc_ctx(desc), out); + return 0; + } + + kernel_fpu_begin(); + sm3_base_do_finalize(desc, sm3_transform_avx); + kernel_fpu_end(); + + return sm3_base_finish(desc, out); +} + +static struct shash_alg sm3_avx_alg = { + .digestsize = SM3_DIGEST_SIZE, + .init = sm3_base_init, + .update = sm3_avx_update, + .final = sm3_avx_final, + .finup = sm3_avx_finup, + .descsize = sizeof(struct sm3_state), + .base = { + .cra_name = "sm3", + .cra_driver_name = "sm3-avx", + .cra_priority = 300, + .cra_blocksize = SM3_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int __init sm3_avx_mod_init(void) +{ + const char *feature_name; + + if (!boot_cpu_has(X86_FEATURE_AVX)) { + pr_info("AVX instruction are not detected.\n"); + return -ENODEV; + } + + if (!boot_cpu_has(X86_FEATURE_BMI2)) { + pr_info("BMI2 instruction are not detected.\n"); + return -ENODEV; + } + + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); + return -ENODEV; + } + + return crypto_register_shash(&sm3_avx_alg); +} + +static void __exit sm3_avx_mod_exit(void) +{ + crypto_unregister_shash(&sm3_avx_alg); +} + +module_init(sm3_avx_mod_init); +module_exit(sm3_avx_mod_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>"); +MODULE_DESCRIPTION("SM3 Secure Hash Algorithm, AVX assembler accelerated"); +MODULE_ALIAS_CRYPTO("sm3"); +MODULE_ALIAS_CRYPTO("sm3-avx"); diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S new file mode 100644 index 000000000000..4767ab61ff48 --- /dev/null +++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S @@ -0,0 +1,594 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 Cipher Algorithm, AES-NI/AVX optimized. + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi> + * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + */ + +/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at: + * https://github.com/mjosaarinen/sm4ni + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define rRIP (%rip) + +#define RX0 %xmm0 +#define RX1 %xmm1 +#define MASK_4BIT %xmm2 +#define RTMP0 %xmm3 +#define RTMP1 %xmm4 +#define RTMP2 %xmm5 +#define RTMP3 %xmm6 +#define RTMP4 %xmm7 + +#define RA0 %xmm8 +#define RA1 %xmm9 +#define RA2 %xmm10 +#define RA3 %xmm11 + +#define RB0 %xmm12 +#define RB1 %xmm13 +#define RB2 %xmm14 +#define RB3 %xmm15 + +#define RNOT %xmm0 +#define RBSWAP %xmm1 + + +/* Transpose four 32-bit words between 128-bit vectors. 
*/ +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +/* pre-SubByte transform. */ +#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \ + vpand x, mask4bit, tmp0; \ + vpandn x, mask4bit, x; \ + vpsrld $4, x, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + +/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by + * 'vaeslastenc' instruction. + */ +#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \ + vpandn mask4bit, x, tmp0; \ + vpsrld $4, x, x; \ + vpand x, mask4bit, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + + +.section .rodata.cst16, "aM", @progbits, 16 +.align 16 + +/* + * Following four affine transform look-up tables are from work by + * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni + * + * These allow exposing SM4 S-Box from AES SubByte. + */ + +/* pre-SubByte affine transform, from SM4 field to AES field. */ +.Lpre_tf_lo_s: + .quad 0x9197E2E474720701, 0xC7C1B4B222245157 +.Lpre_tf_hi_s: + .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012 + +/* post-SubByte affine transform, from AES field to SM4 field. */ +.Lpost_tf_lo_s: + .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82 +.Lpost_tf_hi_s: + .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF + +/* For isolating SubBytes from AESENCLAST, inverse shift row */ +.Linv_shift_row: + .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b + .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 + +/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_8: + .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e + .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06 + +/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_16: + .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01 + .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09 + +/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_24: + .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04 + .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c + +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +/* For input word byte-swap */ +.Lbswap32_mask: + .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 + +.align 4 +/* 4-bit mask */ +.L0f0f0f0f: + .long 0x0f0f0f0f + +/* 12 bytes, only for padding */ +.Lpadding_deadbeef: + .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef + + +.text +.align 16 + +/* + * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst, + * const u8 *src, int nblocks) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx_crypt4) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (1..4 blocks) + * %rdx: src (1..4 blocks) + * %rcx: num blocks (1..4) + */ + FRAME_BEGIN + + vmovdqu 0*16(%rdx), RA0; + vmovdqa RA0, RA1; + vmovdqa RA0, RA2; + vmovdqa RA0, RA3; + cmpq $2, %rcx; + jb .Lblk4_load_input_done; + vmovdqu 1*16(%rdx), RA1; + je .Lblk4_load_input_done; + vmovdqu 2*16(%rdx), RA2; + cmpq $3, %rcx; + je .Lblk4_load_input_done; + vmovdqu 3*16(%rdx), RA3; + +.Lblk4_load_input_done: + + vmovdqa .Lbswap32_mask rRIP, RTMP2; + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + + vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT; 
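The ROUND macro defined a few lines below applies one SM4 round to four blocks in parallel, one block per vector lane. Its net per-block effect is sketched here in plain C for orientation only: sm4_sbox32() is a placeholder for the byte-wise SM4 S-box, which the assembly realizes with AESENCLAST plus the pre/post affine-transform tables above, and rol32() is the kernel helper from <linux/bitops.h>; neither name comes from this patch.

	#include <linux/bitops.h>	/* rol32() */
	#include <linux/types.h>

	/* Placeholder: apply the SM4 S-box to each byte of x (see tables above). */
	u32 sm4_sbox32(u32 x);

	/* Net per-block effect of ROUND(round, s0, s1, s2, s3) on state word s0. */
	static void sm4_round_sketch(u32 *s0, u32 s1, u32 s2, u32 s3, u32 rk)
	{
		u32 x = s1 ^ s2 ^ s3 ^ rk;

		x = sm4_sbox32(x);
		*s0 ^= x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24);
	}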
+ vmovdqa .Lpre_tf_lo_s rRIP, RTMP4; + vmovdqa .Lpre_tf_hi_s rRIP, RB0; + vmovdqa .Lpost_tf_lo_s rRIP, RB1; + vmovdqa .Lpost_tf_hi_s rRIP, RB2; + vmovdqa .Linv_shift_row rRIP, RB3; + vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2; + vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3; + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + +#define ROUND(round, s0, s1, s2, s3) \ + vbroadcastss (4*(round))(%rdi), RX0; \ + vpxor s1, RX0, RX0; \ + vpxor s2, RX0, RX0; \ + vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \ + \ + /* sbox, non-linear part */ \ + transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0); \ + vaesenclast MASK_4BIT, RX0, RX0; \ + transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0); \ + \ + /* linear part */ \ + vpshufb RB3, RX0, RTMP0; \ + vpxor RTMP0, s0, s0; /* s0 ^ x */ \ + vpshufb RTMP2, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \ + vpshufb RTMP3, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \ + vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1; \ + vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \ + vpslld $2, RTMP0, RTMP1; \ + vpsrld $30, RTMP0, RTMP0; \ + vpxor RTMP0, s0, s0; \ + /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpxor RTMP1, s0, s0; + + leaq (32*4)(%rdi), %rax; +.align 16 +.Lroundloop_blk4: + ROUND(0, RA0, RA1, RA2, RA3); + ROUND(1, RA1, RA2, RA3, RA0); + ROUND(2, RA2, RA3, RA0, RA1); + ROUND(3, RA3, RA0, RA1, RA2); + leaq (4*4)(%rdi), %rdi; + cmpq %rax, %rdi; + jne .Lroundloop_blk4; + +#undef ROUND + + vmovdqa .Lbswap128_mask rRIP, RTMP2; + + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + + vmovdqu RA0, 0*16(%rsi); + cmpq $2, %rcx; + jb .Lblk4_store_output_done; + vmovdqu RA1, 1*16(%rsi); + je .Lblk4_store_output_done; + vmovdqu RA2, 2*16(%rsi); + cmpq $3, %rcx; + je .Lblk4_store_output_done; + vmovdqu RA3, 3*16(%rsi); + +.Lblk4_store_output_done: + vzeroall; + FRAME_END + RET; +SYM_FUNC_END(sm4_aesni_avx_crypt4) + +.align 8 +SYM_FUNC_START_LOCAL(__sm4_crypt_blk8) + /* input: + * %rdi: round key array, CTX + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel + * plaintext blocks + * output: + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel + * ciphertext blocks + */ + FRAME_BEGIN + + vmovdqa .Lbswap32_mask rRIP, RTMP2; + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + vpshufb RTMP2, RB0, RB0; + vpshufb RTMP2, RB1, RB1; + vpshufb RTMP2, RB2, RB2; + vpshufb RTMP2, RB3, RB3; + + vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT; + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); + +#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \ + vbroadcastss (4*(round))(%rdi), RX0; \ + vmovdqa .Lpre_tf_lo_s rRIP, RTMP4; \ + vmovdqa .Lpre_tf_hi_s rRIP, RTMP1; \ + vmovdqa RX0, RX1; \ + vpxor s1, RX0, RX0; \ + vpxor s2, RX0, RX0; \ + vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \ + vmovdqa .Lpost_tf_lo_s rRIP, RTMP2; \ + vmovdqa .Lpost_tf_hi_s rRIP, RTMP3; \ + vpxor r1, RX1, RX1; \ + vpxor r2, RX1, RX1; \ + vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \ + \ + /* sbox, non-linear part */ \ + transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \ + transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \ + vmovdqa .Linv_shift_row rRIP, RTMP4; \ + vaesenclast MASK_4BIT, RX0, RX0; \ + vaesenclast MASK_4BIT, RX1, RX1; \ + transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \ + transform_post(RX1, RTMP2, RTMP3, 
MASK_4BIT, RTMP0); \ + \ + /* linear part */ \ + vpshufb RTMP4, RX0, RTMP0; \ + vpxor RTMP0, s0, s0; /* s0 ^ x */ \ + vpshufb RTMP4, RX1, RTMP2; \ + vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4; \ + vpxor RTMP2, r0, r0; /* r0 ^ x */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \ + vpshufb RTMP4, RX1, RTMP3; \ + vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4; \ + vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \ + vpshufb RTMP4, RX1, RTMP3; \ + vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4; \ + vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \ + /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpslld $2, RTMP0, RTMP1; \ + vpsrld $30, RTMP0, RTMP0; \ + vpxor RTMP0, s0, s0; \ + vpxor RTMP1, s0, s0; \ + vpshufb RTMP4, RX1, RTMP3; \ + vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \ + /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpslld $2, RTMP2, RTMP3; \ + vpsrld $30, RTMP2, RTMP2; \ + vpxor RTMP2, r0, r0; \ + vpxor RTMP3, r0, r0; + + leaq (32*4)(%rdi), %rax; +.align 16 +.Lroundloop_blk8: + ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3); + ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0); + ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1); + ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2); + leaq (4*4)(%rdi), %rdi; + cmpq %rax, %rdi; + jne .Lroundloop_blk8; + +#undef ROUND + + vmovdqa .Lbswap128_mask rRIP, RTMP2; + + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + vpshufb RTMP2, RB0, RB0; + vpshufb RTMP2, RB1, RB1; + vpshufb RTMP2, RB2, RB2; + vpshufb RTMP2, RB3, RB3; + + FRAME_END + RET; +SYM_FUNC_END(__sm4_crypt_blk8) + +/* + * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst, + * const u8 *src, int nblocks) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx_crypt8) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (1..8 blocks) + * %rdx: src (1..8 blocks) + * %rcx: num blocks (1..8) + */ + cmpq $5, %rcx; + jb sm4_aesni_avx_crypt4; + + FRAME_BEGIN + + vmovdqu (0 * 16)(%rdx), RA0; + vmovdqu (1 * 16)(%rdx), RA1; + vmovdqu (2 * 16)(%rdx), RA2; + vmovdqu (3 * 16)(%rdx), RA3; + vmovdqu (4 * 16)(%rdx), RB0; + vmovdqa RB0, RB1; + vmovdqa RB0, RB2; + vmovdqa RB0, RB3; + je .Lblk8_load_input_done; + vmovdqu (5 * 16)(%rdx), RB1; + cmpq $7, %rcx; + jb .Lblk8_load_input_done; + vmovdqu (6 * 16)(%rdx), RB2; + je .Lblk8_load_input_done; + vmovdqu (7 * 16)(%rdx), RB3; + +.Lblk8_load_input_done: + call __sm4_crypt_blk8; + + cmpq $6, %rcx; + vmovdqu RA0, (0 * 16)(%rsi); + vmovdqu RA1, (1 * 16)(%rsi); + vmovdqu RA2, (2 * 16)(%rsi); + vmovdqu RA3, (3 * 16)(%rsi); + vmovdqu RB0, (4 * 16)(%rsi); + jb .Lblk8_store_output_done; + vmovdqu RB1, (5 * 16)(%rsi); + je .Lblk8_store_output_done; + vmovdqu RB2, (6 * 16)(%rsi); + cmpq $7, %rcx; + je .Lblk8_store_output_done; + vmovdqu RB3, (7 * 16)(%rsi); + +.Lblk8_store_output_done: + vzeroall; + FRAME_END + RET; +SYM_FUNC_END(sm4_aesni_avx_crypt8) + +/* + * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv (big endian, 128bit) + */ + FRAME_BEGIN + + /* load IV and byteswap */ + 
vmovdqu (%rcx), RA0; + + vmovdqa .Lbswap128_mask rRIP, RBSWAP; + vpshufb RBSWAP, RA0, RTMP0; /* be => le */ + + vpcmpeqd RNOT, RNOT, RNOT; + vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */ + +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + + /* construct IVs */ + inc_le128(RTMP0, RNOT, RTMP2); /* +1 */ + vpshufb RBSWAP, RTMP0, RA1; + inc_le128(RTMP0, RNOT, RTMP2); /* +2 */ + vpshufb RBSWAP, RTMP0, RA2; + inc_le128(RTMP0, RNOT, RTMP2); /* +3 */ + vpshufb RBSWAP, RTMP0, RA3; + inc_le128(RTMP0, RNOT, RTMP2); /* +4 */ + vpshufb RBSWAP, RTMP0, RB0; + inc_le128(RTMP0, RNOT, RTMP2); /* +5 */ + vpshufb RBSWAP, RTMP0, RB1; + inc_le128(RTMP0, RNOT, RTMP2); /* +6 */ + vpshufb RBSWAP, RTMP0, RB2; + inc_le128(RTMP0, RNOT, RTMP2); /* +7 */ + vpshufb RBSWAP, RTMP0, RB3; + inc_le128(RTMP0, RNOT, RTMP2); /* +8 */ + vpshufb RBSWAP, RTMP0, RTMP1; + + /* store new IV */ + vmovdqu RTMP1, (%rcx); + + call __sm4_crypt_blk8; + + vpxor (0 * 16)(%rdx), RA0, RA0; + vpxor (1 * 16)(%rdx), RA1, RA1; + vpxor (2 * 16)(%rdx), RA2, RA2; + vpxor (3 * 16)(%rdx), RA3, RA3; + vpxor (4 * 16)(%rdx), RB0, RB0; + vpxor (5 * 16)(%rdx), RB1, RB1; + vpxor (6 * 16)(%rdx), RB2, RB2; + vpxor (7 * 16)(%rdx), RB3, RB3; + + vmovdqu RA0, (0 * 16)(%rsi); + vmovdqu RA1, (1 * 16)(%rsi); + vmovdqu RA2, (2 * 16)(%rsi); + vmovdqu RA3, (3 * 16)(%rsi); + vmovdqu RB0, (4 * 16)(%rsi); + vmovdqu RB1, (5 * 16)(%rsi); + vmovdqu RB2, (6 * 16)(%rsi); + vmovdqu RB3, (7 * 16)(%rsi); + + vzeroall; + FRAME_END + RET; +SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8) + +/* + * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv + */ + FRAME_BEGIN + + vmovdqu (0 * 16)(%rdx), RA0; + vmovdqu (1 * 16)(%rdx), RA1; + vmovdqu (2 * 16)(%rdx), RA2; + vmovdqu (3 * 16)(%rdx), RA3; + vmovdqu (4 * 16)(%rdx), RB0; + vmovdqu (5 * 16)(%rdx), RB1; + vmovdqu (6 * 16)(%rdx), RB2; + vmovdqu (7 * 16)(%rdx), RB3; + + call __sm4_crypt_blk8; + + vmovdqu (7 * 16)(%rdx), RNOT; + vpxor (%rcx), RA0, RA0; + vpxor (0 * 16)(%rdx), RA1, RA1; + vpxor (1 * 16)(%rdx), RA2, RA2; + vpxor (2 * 16)(%rdx), RA3, RA3; + vpxor (3 * 16)(%rdx), RB0, RB0; + vpxor (4 * 16)(%rdx), RB1, RB1; + vpxor (5 * 16)(%rdx), RB2, RB2; + vpxor (6 * 16)(%rdx), RB3, RB3; + vmovdqu RNOT, (%rcx); /* store new IV */ + + vmovdqu RA0, (0 * 16)(%rsi); + vmovdqu RA1, (1 * 16)(%rsi); + vmovdqu RA2, (2 * 16)(%rsi); + vmovdqu RA3, (3 * 16)(%rsi); + vmovdqu RB0, (4 * 16)(%rsi); + vmovdqu RB1, (5 * 16)(%rsi); + vmovdqu RB2, (6 * 16)(%rsi); + vmovdqu RB3, (7 * 16)(%rsi); + + vzeroall; + FRAME_END + RET; +SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8) + +/* + * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv + */ + FRAME_BEGIN + + /* Load input */ + vmovdqu (%rcx), RA0; + vmovdqu 0 * 16(%rdx), RA1; + vmovdqu 1 * 16(%rdx), RA2; + vmovdqu 2 * 16(%rdx), RA3; + vmovdqu 3 * 16(%rdx), RB0; + vmovdqu 4 * 16(%rdx), RB1; + vmovdqu 5 * 16(%rdx), RB2; + vmovdqu 6 * 16(%rdx), RB3; + + /* Update IV */ + vmovdqu 7 * 16(%rdx), RNOT; + vmovdqu RNOT, (%rcx); + + call __sm4_crypt_blk8; + + vpxor (0 * 16)(%rdx), RA0, RA0; + vpxor (1 * 16)(%rdx), RA1, RA1; + vpxor (2 * 16)(%rdx), 
RA2, RA2; + vpxor (3 * 16)(%rdx), RA3, RA3; + vpxor (4 * 16)(%rdx), RB0, RB0; + vpxor (5 * 16)(%rdx), RB1, RB1; + vpxor (6 * 16)(%rdx), RB2, RB2; + vpxor (7 * 16)(%rdx), RB3, RB3; + + vmovdqu RA0, (0 * 16)(%rsi); + vmovdqu RA1, (1 * 16)(%rsi); + vmovdqu RA2, (2 * 16)(%rsi); + vmovdqu RA3, (3 * 16)(%rsi); + vmovdqu RB0, (4 * 16)(%rsi); + vmovdqu RB1, (5 * 16)(%rsi); + vmovdqu RB2, (6 * 16)(%rsi); + vmovdqu RB3, (7 * 16)(%rsi); + + vzeroall; + FRAME_END + RET; +SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8) diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S new file mode 100644 index 000000000000..4732fe8bb65b --- /dev/null +++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S @@ -0,0 +1,501 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SM4 Cipher Algorithm, AES-NI/AVX2 optimized. + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi> + * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + */ + +/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at: + * https://github.com/mjosaarinen/sm4ni + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define rRIP (%rip) + +/* vector registers */ +#define RX0 %ymm0 +#define RX1 %ymm1 +#define MASK_4BIT %ymm2 +#define RTMP0 %ymm3 +#define RTMP1 %ymm4 +#define RTMP2 %ymm5 +#define RTMP3 %ymm6 +#define RTMP4 %ymm7 + +#define RA0 %ymm8 +#define RA1 %ymm9 +#define RA2 %ymm10 +#define RA3 %ymm11 + +#define RB0 %ymm12 +#define RB1 %ymm13 +#define RB2 %ymm14 +#define RB3 %ymm15 + +#define RNOT %ymm0 +#define RBSWAP %ymm1 + +#define RX0x %xmm0 +#define RX1x %xmm1 +#define MASK_4BITx %xmm2 + +#define RNOTx %xmm0 +#define RBSWAPx %xmm1 + +#define RTMP0x %xmm3 +#define RTMP1x %xmm4 +#define RTMP2x %xmm5 +#define RTMP3x %xmm6 +#define RTMP4x %xmm7 + + +/* helper macros */ + +/* Transpose four 32-bit words between 128-bit vector lanes. */ +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +/* post-SubByte transform. */ +#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \ + vpand x, mask4bit, tmp0; \ + vpandn x, mask4bit, x; \ + vpsrld $4, x, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + +/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by + * 'vaeslastenc' instruction. */ +#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \ + vpandn mask4bit, x, tmp0; \ + vpsrld $4, x, x; \ + vpand x, mask4bit, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + + +.section .rodata.cst16, "aM", @progbits, 16 +.align 16 + +/* + * Following four affine transform look-up tables are from work by + * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni + * + * These allow exposing SM4 S-Box from AES SubByte. + */ + +/* pre-SubByte affine transform, from SM4 field to AES field. */ +.Lpre_tf_lo_s: + .quad 0x9197E2E474720701, 0xC7C1B4B222245157 +.Lpre_tf_hi_s: + .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012 + +/* post-SubByte affine transform, from AES field to SM4 field. 
*/ +.Lpost_tf_lo_s: + .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82 +.Lpost_tf_hi_s: + .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF + +/* For isolating SubBytes from AESENCLAST, inverse shift row */ +.Linv_shift_row: + .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b + .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 + +/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_8: + .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e + .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06 + +/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_16: + .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01 + .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09 + +/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */ +.Linv_shift_row_rol_24: + .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04 + .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c + +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +/* For input word byte-swap */ +.Lbswap32_mask: + .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 + +.align 4 +/* 4-bit mask */ +.L0f0f0f0f: + .long 0x0f0f0f0f + +/* 12 bytes, only for padding */ +.Lpadding_deadbeef: + .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef + +.text +.align 16 + +.align 8 +SYM_FUNC_START_LOCAL(__sm4_crypt_blk16) + /* input: + * %rdi: round key array, CTX + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel + * plaintext blocks + * output: + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel + * ciphertext blocks + */ + FRAME_BEGIN + + vbroadcasti128 .Lbswap32_mask rRIP, RTMP2; + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + vpshufb RTMP2, RB0, RB0; + vpshufb RTMP2, RB1, RB1; + vpshufb RTMP2, RB2, RB2; + vpshufb RTMP2, RB3, RB3; + + vpbroadcastd .L0f0f0f0f rRIP, MASK_4BIT; + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); + +#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \ + vpbroadcastd (4*(round))(%rdi), RX0; \ + vbroadcasti128 .Lpre_tf_lo_s rRIP, RTMP4; \ + vbroadcasti128 .Lpre_tf_hi_s rRIP, RTMP1; \ + vmovdqa RX0, RX1; \ + vpxor s1, RX0, RX0; \ + vpxor s2, RX0, RX0; \ + vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \ + vbroadcasti128 .Lpost_tf_lo_s rRIP, RTMP2; \ + vbroadcasti128 .Lpost_tf_hi_s rRIP, RTMP3; \ + vpxor r1, RX1, RX1; \ + vpxor r2, RX1, RX1; \ + vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \ + \ + /* sbox, non-linear part */ \ + transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \ + transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \ + vextracti128 $1, RX0, RTMP4x; \ + vextracti128 $1, RX1, RTMP0x; \ + vaesenclast MASK_4BITx, RX0x, RX0x; \ + vaesenclast MASK_4BITx, RTMP4x, RTMP4x; \ + vaesenclast MASK_4BITx, RX1x, RX1x; \ + vaesenclast MASK_4BITx, RTMP0x, RTMP0x; \ + vinserti128 $1, RTMP4x, RX0, RX0; \ + vbroadcasti128 .Linv_shift_row rRIP, RTMP4; \ + vinserti128 $1, RTMP0x, RX1, RX1; \ + transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \ + transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \ + \ + /* linear part */ \ + vpshufb RTMP4, RX0, RTMP0; \ + vpxor RTMP0, s0, s0; /* s0 ^ x */ \ + vpshufb RTMP4, RX1, RTMP2; \ + vbroadcasti128 .Linv_shift_row_rol_8 rRIP, RTMP4; \ + vpxor RTMP2, r0, r0; /* r0 ^ x */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \ + vpshufb RTMP4, RX1, RTMP3; \ + vbroadcasti128 
.Linv_shift_row_rol_16 rRIP, RTMP4; \ + vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \ + vpshufb RTMP4, RX1, RTMP3; \ + vbroadcasti128 .Linv_shift_row_rol_24 rRIP, RTMP4; \ + vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \ + vpshufb RTMP4, RX0, RTMP1; \ + vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \ + vpslld $2, RTMP0, RTMP1; \ + vpsrld $30, RTMP0, RTMP0; \ + vpxor RTMP0, s0, s0; \ + /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpxor RTMP1, s0, s0; \ + vpshufb RTMP4, RX1, RTMP3; \ + vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \ + vpslld $2, RTMP2, RTMP3; \ + vpsrld $30, RTMP2, RTMP2; \ + vpxor RTMP2, r0, r0; \ + /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpxor RTMP3, r0, r0; + + leaq (32*4)(%rdi), %rax; +.align 16 +.Lroundloop_blk8: + ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3); + ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0); + ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1); + ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2); + leaq (4*4)(%rdi), %rdi; + cmpq %rax, %rdi; + jne .Lroundloop_blk8; + +#undef ROUND + + vbroadcasti128 .Lbswap128_mask rRIP, RTMP2; + + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + vpshufb RTMP2, RB0, RB0; + vpshufb RTMP2, RB1, RB1; + vpshufb RTMP2, RB2, RB2; + vpshufb RTMP2, RB3, RB3; + + FRAME_END + RET; +SYM_FUNC_END(__sm4_crypt_blk16) + +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + +/* + * void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv (big endian, 128bit) + */ + FRAME_BEGIN + + movq 8(%rcx), %rax; + bswapq %rax; + + vzeroupper; + + vbroadcasti128 .Lbswap128_mask rRIP, RTMP3; + vpcmpeqd RNOT, RNOT, RNOT; + vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */ + vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */ + + /* load IV and byteswap */ + vmovdqu (%rcx), RTMP4x; + vpshufb RTMP3x, RTMP4x, RTMP4x; + vmovdqa RTMP4x, RTMP0x; + inc_le128(RTMP4x, RNOTx, RTMP1x); + vinserti128 $1, RTMP4x, RTMP0, RTMP0; + vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */ + + /* check need for handling 64-bit overflow and carry */ + cmpq $(0xffffffffffffffff - 16), %rax; + ja .Lhandle_ctr_carry; + + /* construct IVs */ + vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */ + vpshufb RTMP3, RTMP0, RA1; + vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */ + vpshufb RTMP3, RTMP0, RA2; + vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */ + vpshufb RTMP3, RTMP0, RA3; + vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */ + vpshufb RTMP3, RTMP0, RB0; + vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */ + vpshufb RTMP3, RTMP0, RB1; + vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */ + vpshufb RTMP3, RTMP0, RB2; + vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */ + vpshufb RTMP3, RTMP0, RB3; + vpsubq RTMP2, RTMP0, RTMP0; /* +16 */ + vpshufb RTMP3x, RTMP0x, RTMP0x; + + jmp .Lctr_carry_done; + +.Lhandle_ctr_carry: + /* construct IVs */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RA2; /* 
+5 ; +4 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */ + inc_le128(RTMP0, RNOT, RTMP1); + inc_le128(RTMP0, RNOT, RTMP1); + vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */ + inc_le128(RTMP0, RNOT, RTMP1); + vextracti128 $1, RTMP0, RTMP0x; + vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */ + +.align 4 +.Lctr_carry_done: + /* store new IV */ + vmovdqu RTMP0x, (%rcx); + + call __sm4_crypt_blk16; + + vpxor (0 * 32)(%rdx), RA0, RA0; + vpxor (1 * 32)(%rdx), RA1, RA1; + vpxor (2 * 32)(%rdx), RA2, RA2; + vpxor (3 * 32)(%rdx), RA3, RA3; + vpxor (4 * 32)(%rdx), RB0, RB0; + vpxor (5 * 32)(%rdx), RB1, RB1; + vpxor (6 * 32)(%rdx), RB2, RB2; + vpxor (7 * 32)(%rdx), RB3, RB3; + + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + vmovdqu RB0, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB3, (7 * 32)(%rsi); + + vzeroall; + FRAME_END + RET; +SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16) + +/* + * void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv + */ + FRAME_BEGIN + + vzeroupper; + + vmovdqu (0 * 32)(%rdx), RA0; + vmovdqu (1 * 32)(%rdx), RA1; + vmovdqu (2 * 32)(%rdx), RA2; + vmovdqu (3 * 32)(%rdx), RA3; + vmovdqu (4 * 32)(%rdx), RB0; + vmovdqu (5 * 32)(%rdx), RB1; + vmovdqu (6 * 32)(%rdx), RB2; + vmovdqu (7 * 32)(%rdx), RB3; + + call __sm4_crypt_blk16; + + vmovdqu (%rcx), RNOTx; + vinserti128 $1, (%rdx), RNOT, RNOT; + vpxor RNOT, RA0, RA0; + vpxor (0 * 32 + 16)(%rdx), RA1, RA1; + vpxor (1 * 32 + 16)(%rdx), RA2, RA2; + vpxor (2 * 32 + 16)(%rdx), RA3, RA3; + vpxor (3 * 32 + 16)(%rdx), RB0, RB0; + vpxor (4 * 32 + 16)(%rdx), RB1, RB1; + vpxor (5 * 32 + 16)(%rdx), RB2, RB2; + vpxor (6 * 32 + 16)(%rdx), RB3, RB3; + vmovdqu (7 * 32 + 16)(%rdx), RNOTx; + vmovdqu RNOTx, (%rcx); /* store new IV */ + + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + vmovdqu RB0, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB3, (7 * 32)(%rsi); + + vzeroall; + FRAME_END + RET; +SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16) + +/* + * void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst, + * const u8 *src, u8 *iv) + */ +.align 8 +SYM_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16) + /* input: + * %rdi: round key array, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv + */ + FRAME_BEGIN + + vzeroupper; + + /* Load input */ + vmovdqu (%rcx), RNOTx; + vinserti128 $1, (%rdx), RNOT, RA0; + vmovdqu (0 * 32 + 16)(%rdx), RA1; + vmovdqu (1 * 32 + 16)(%rdx), RA2; + vmovdqu (2 * 32 + 16)(%rdx), RA3; + vmovdqu (3 * 32 + 16)(%rdx), RB0; + vmovdqu (4 * 32 + 16)(%rdx), RB1; + vmovdqu (5 * 32 + 16)(%rdx), RB2; + vmovdqu (6 * 32 + 16)(%rdx), RB3; + + /* Update IV */ + vmovdqu (7 * 32 + 16)(%rdx), RNOTx; + vmovdqu RNOTx, (%rcx); + + call __sm4_crypt_blk16; + + vpxor (0 * 32)(%rdx), RA0, RA0; + vpxor (1 * 32)(%rdx), RA1, RA1; + vpxor (2 
* 32)(%rdx), RA2, RA2; + vpxor (3 * 32)(%rdx), RA3, RA3; + vpxor (4 * 32)(%rdx), RB0, RB0; + vpxor (5 * 32)(%rdx), RB1, RB1; + vpxor (6 * 32)(%rdx), RB2, RB2; + vpxor (7 * 32)(%rdx), RB3, RB3; + + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + vmovdqu RB0, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB3, (7 * 32)(%rsi); + + vzeroall; + FRAME_END + RET; +SYM_FUNC_END(sm4_aesni_avx2_cfb_dec_blk16) diff --git a/arch/x86/crypto/sm4-avx.h b/arch/x86/crypto/sm4-avx.h new file mode 100644 index 000000000000..1bceab7516aa --- /dev/null +++ b/arch/x86/crypto/sm4-avx.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef ASM_X86_SM4_AVX_H +#define ASM_X86_SM4_AVX_H + +#include <linux/types.h> +#include <crypto/sm4.h> + +typedef void (*sm4_crypt_func)(const u32 *rk, u8 *dst, const u8 *src, u8 *iv); + +int sm4_avx_ecb_encrypt(struct skcipher_request *req); +int sm4_avx_ecb_decrypt(struct skcipher_request *req); + +int sm4_cbc_encrypt(struct skcipher_request *req); +int sm4_avx_cbc_decrypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func); + +int sm4_cfb_encrypt(struct skcipher_request *req); +int sm4_avx_cfb_decrypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func); + +int sm4_avx_ctr_crypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func); + +#endif diff --git a/arch/x86/crypto/sm4_aesni_avx2_glue.c b/arch/x86/crypto/sm4_aesni_avx2_glue.c new file mode 100644 index 000000000000..84bc718f49a3 --- /dev/null +++ b/arch/x86/crypto/sm4_aesni_avx2_glue.c @@ -0,0 +1,169 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 Cipher Algorithm, AES-NI/AVX2 optimized. + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (c) 2021, Alibaba Group. 
+ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + */ + +#include <linux/module.h> +#include <linux/crypto.h> +#include <linux/kernel.h> +#include <asm/simd.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/skcipher.h> +#include <crypto/sm4.h> +#include "sm4-avx.h" + +#define SM4_CRYPT16_BLOCK_SIZE (SM4_BLOCK_SIZE * 16) + +asmlinkage void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); +asmlinkage void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); +asmlinkage void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); + +static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return sm4_expandkey(ctx, key, key_len); +} + +static int cbc_decrypt(struct skcipher_request *req) +{ + return sm4_avx_cbc_decrypt(req, SM4_CRYPT16_BLOCK_SIZE, + sm4_aesni_avx2_cbc_dec_blk16); +} + + +static int cfb_decrypt(struct skcipher_request *req) +{ + return sm4_avx_cfb_decrypt(req, SM4_CRYPT16_BLOCK_SIZE, + sm4_aesni_avx2_cfb_dec_blk16); +} + +static int ctr_crypt(struct skcipher_request *req) +{ + return sm4_avx_ctr_crypt(req, SM4_CRYPT16_BLOCK_SIZE, + sm4_aesni_avx2_ctr_enc_blk16); +} + +static struct skcipher_alg sm4_aesni_avx2_skciphers[] = { + { + .base = { + .cra_name = "__ecb(sm4)", + .cra_driver_name = "__ecb-sm4-aesni-avx2", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .walksize = 16 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = sm4_avx_ecb_encrypt, + .decrypt = sm4_avx_ecb_decrypt, + }, { + .base = { + .cra_name = "__cbc(sm4)", + .cra_driver_name = "__cbc-sm4-aesni-avx2", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .walksize = 16 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = sm4_cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base = { + .cra_name = "__cfb(sm4)", + .cra_driver_name = "__cfb-sm4-aesni-avx2", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .walksize = 16 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = sm4_cfb_encrypt, + .decrypt = cfb_decrypt, + }, { + .base = { + .cra_name = "__ctr(sm4)", + .cra_driver_name = "__ctr-sm4-aesni-avx2", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .walksize = 16 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + } +}; + +static struct simd_skcipher_alg * +simd_sm4_aesni_avx2_skciphers[ARRAY_SIZE(sm4_aesni_avx2_skciphers)]; + +static int __init sm4_init(void) +{ + const char *feature_name; + + if (!boot_cpu_has(X86_FEATURE_AVX) || + !boot_cpu_has(X86_FEATURE_AVX2) || + 
!boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE)) { + pr_info("AVX2 or AES-NI instructions are not detected.\n"); + return -ENODEV; + } + + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); + return -ENODEV; + } + + return simd_register_skciphers_compat(sm4_aesni_avx2_skciphers, + ARRAY_SIZE(sm4_aesni_avx2_skciphers), + simd_sm4_aesni_avx2_skciphers); +} + +static void __exit sm4_exit(void) +{ + simd_unregister_skciphers(sm4_aesni_avx2_skciphers, + ARRAY_SIZE(sm4_aesni_avx2_skciphers), + simd_sm4_aesni_avx2_skciphers); +} + +module_init(sm4_init); +module_exit(sm4_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>"); +MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX2 optimized"); +MODULE_ALIAS_CRYPTO("sm4"); +MODULE_ALIAS_CRYPTO("sm4-aesni-avx2"); diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c new file mode 100644 index 000000000000..7800f77d68ad --- /dev/null +++ b/arch/x86/crypto/sm4_aesni_avx_glue.c @@ -0,0 +1,487 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SM4 Cipher Algorithm, AES-NI/AVX optimized. + * as specified in + * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html + * + * Copyright (c) 2021, Alibaba Group. + * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + */ + +#include <linux/module.h> +#include <linux/crypto.h> +#include <linux/kernel.h> +#include <asm/simd.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/skcipher.h> +#include <crypto/sm4.h> +#include "sm4-avx.h" + +#define SM4_CRYPT8_BLOCK_SIZE (SM4_BLOCK_SIZE * 8) + +asmlinkage void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst, + const u8 *src, int nblocks); +asmlinkage void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst, + const u8 *src, int nblocks); +asmlinkage void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); +asmlinkage void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); +asmlinkage void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst, + const u8 *src, u8 *iv); + +static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return sm4_expandkey(ctx, key, key_len); +} + +static int ecb_do_crypt(struct skcipher_request *req, const u32 *rkey) +{ + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + kernel_fpu_begin(); + while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) { + sm4_aesni_avx_crypt8(rkey, dst, src, 8); + dst += SM4_CRYPT8_BLOCK_SIZE; + src += SM4_CRYPT8_BLOCK_SIZE; + nbytes -= SM4_CRYPT8_BLOCK_SIZE; + } + while (nbytes >= SM4_BLOCK_SIZE) { + unsigned int nblocks = min(nbytes >> 4, 4u); + sm4_aesni_avx_crypt4(rkey, dst, src, nblocks); + dst += nblocks * SM4_BLOCK_SIZE; + src += nblocks * SM4_BLOCK_SIZE; + nbytes -= nblocks * SM4_BLOCK_SIZE; + } + kernel_fpu_end(); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} + +int sm4_avx_ecb_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return ecb_do_crypt(req, ctx->rkey_enc); +} +EXPORT_SYMBOL_GPL(sm4_avx_ecb_encrypt); + +int sm4_avx_ecb_decrypt(struct 
skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + + return ecb_do_crypt(req, ctx->rkey_dec); +} +EXPORT_SYMBOL_GPL(sm4_avx_ecb_decrypt); + +int sm4_cbc_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *iv = walk.iv; + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + while (nbytes >= SM4_BLOCK_SIZE) { + crypto_xor_cpy(dst, src, iv, SM4_BLOCK_SIZE); + sm4_crypt_block(ctx->rkey_enc, dst, dst); + iv = dst; + src += SM4_BLOCK_SIZE; + dst += SM4_BLOCK_SIZE; + nbytes -= SM4_BLOCK_SIZE; + } + if (iv != walk.iv) + memcpy(walk.iv, iv, SM4_BLOCK_SIZE); + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} +EXPORT_SYMBOL_GPL(sm4_cbc_encrypt); + +int sm4_avx_cbc_decrypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + kernel_fpu_begin(); + + while (nbytes >= bsize) { + func(ctx->rkey_dec, dst, src, walk.iv); + dst += bsize; + src += bsize; + nbytes -= bsize; + } + + while (nbytes >= SM4_BLOCK_SIZE) { + u8 keystream[SM4_BLOCK_SIZE * 8]; + u8 iv[SM4_BLOCK_SIZE]; + unsigned int nblocks = min(nbytes >> 4, 8u); + int i; + + sm4_aesni_avx_crypt8(ctx->rkey_dec, keystream, + src, nblocks); + + src += ((int)nblocks - 2) * SM4_BLOCK_SIZE; + dst += (nblocks - 1) * SM4_BLOCK_SIZE; + memcpy(iv, src + SM4_BLOCK_SIZE, SM4_BLOCK_SIZE); + + for (i = nblocks - 1; i > 0; i--) { + crypto_xor_cpy(dst, src, + &keystream[i * SM4_BLOCK_SIZE], + SM4_BLOCK_SIZE); + src -= SM4_BLOCK_SIZE; + dst -= SM4_BLOCK_SIZE; + } + crypto_xor_cpy(dst, walk.iv, keystream, SM4_BLOCK_SIZE); + memcpy(walk.iv, iv, SM4_BLOCK_SIZE); + dst += nblocks * SM4_BLOCK_SIZE; + src += (nblocks + 1) * SM4_BLOCK_SIZE; + nbytes -= nblocks * SM4_BLOCK_SIZE; + } + + kernel_fpu_end(); + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} +EXPORT_SYMBOL_GPL(sm4_avx_cbc_decrypt); + +static int cbc_decrypt(struct skcipher_request *req) +{ + return sm4_avx_cbc_decrypt(req, SM4_CRYPT8_BLOCK_SIZE, + sm4_aesni_avx_cbc_dec_blk8); +} + +int sm4_cfb_encrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + const u8 *iv = walk.iv; + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + while (nbytes >= SM4_BLOCK_SIZE) { + sm4_crypt_block(ctx->rkey_enc, keystream, iv); + crypto_xor_cpy(dst, src, keystream, SM4_BLOCK_SIZE); + iv = dst; + src += SM4_BLOCK_SIZE; + dst += SM4_BLOCK_SIZE; + nbytes -= SM4_BLOCK_SIZE; + } + if (iv != walk.iv) + memcpy(walk.iv, iv, SM4_BLOCK_SIZE); + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_xor_cpy(dst, src, keystream, 
nbytes); + nbytes = 0; + } + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} +EXPORT_SYMBOL_GPL(sm4_cfb_encrypt); + +int sm4_avx_cfb_decrypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + kernel_fpu_begin(); + + while (nbytes >= bsize) { + func(ctx->rkey_enc, dst, src, walk.iv); + dst += bsize; + src += bsize; + nbytes -= bsize; + } + + while (nbytes >= SM4_BLOCK_SIZE) { + u8 keystream[SM4_BLOCK_SIZE * 8]; + unsigned int nblocks = min(nbytes >> 4, 8u); + + memcpy(keystream, walk.iv, SM4_BLOCK_SIZE); + if (nblocks > 1) + memcpy(&keystream[SM4_BLOCK_SIZE], src, + (nblocks - 1) * SM4_BLOCK_SIZE); + memcpy(walk.iv, src + (nblocks - 1) * SM4_BLOCK_SIZE, + SM4_BLOCK_SIZE); + + sm4_aesni_avx_crypt8(ctx->rkey_enc, keystream, + keystream, nblocks); + + crypto_xor_cpy(dst, src, keystream, + nblocks * SM4_BLOCK_SIZE); + dst += nblocks * SM4_BLOCK_SIZE; + src += nblocks * SM4_BLOCK_SIZE; + nbytes -= nblocks * SM4_BLOCK_SIZE; + } + + kernel_fpu_end(); + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + + sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv); + crypto_xor_cpy(dst, src, keystream, nbytes); + nbytes = 0; + } + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} +EXPORT_SYMBOL_GPL(sm4_avx_cfb_decrypt); + +static int cfb_decrypt(struct skcipher_request *req) +{ + return sm4_avx_cfb_decrypt(req, SM4_CRYPT8_BLOCK_SIZE, + sm4_aesni_avx_cfb_dec_blk8); +} + +int sm4_avx_ctr_crypt(struct skcipher_request *req, + unsigned int bsize, sm4_crypt_func func) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + + kernel_fpu_begin(); + + while (nbytes >= bsize) { + func(ctx->rkey_enc, dst, src, walk.iv); + dst += bsize; + src += bsize; + nbytes -= bsize; + } + + while (nbytes >= SM4_BLOCK_SIZE) { + u8 keystream[SM4_BLOCK_SIZE * 8]; + unsigned int nblocks = min(nbytes >> 4, 8u); + int i; + + for (i = 0; i < nblocks; i++) { + memcpy(&keystream[i * SM4_BLOCK_SIZE], + walk.iv, SM4_BLOCK_SIZE); + crypto_inc(walk.iv, SM4_BLOCK_SIZE); + } + sm4_aesni_avx_crypt8(ctx->rkey_enc, keystream, + keystream, nblocks); + + crypto_xor_cpy(dst, src, keystream, + nblocks * SM4_BLOCK_SIZE); + dst += nblocks * SM4_BLOCK_SIZE; + src += nblocks * SM4_BLOCK_SIZE; + nbytes -= nblocks * SM4_BLOCK_SIZE; + } + + kernel_fpu_end(); + + /* tail */ + if (walk.nbytes == walk.total && nbytes > 0) { + u8 keystream[SM4_BLOCK_SIZE]; + + memcpy(keystream, walk.iv, SM4_BLOCK_SIZE); + crypto_inc(walk.iv, SM4_BLOCK_SIZE); + + sm4_crypt_block(ctx->rkey_enc, keystream, keystream); + + crypto_xor_cpy(dst, src, keystream, nbytes); + dst += nbytes; + src += nbytes; + nbytes = 0; + } + + err = skcipher_walk_done(&walk, nbytes); + } + + return err; +} +EXPORT_SYMBOL_GPL(sm4_avx_ctr_crypt); + +static int ctr_crypt(struct skcipher_request *req) +{ + return sm4_avx_ctr_crypt(req, SM4_CRYPT8_BLOCK_SIZE, + sm4_aesni_avx_ctr_enc_blk8); +} + 
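Callers never use the __-prefixed internal algorithms registered below directly; they reach them through the simd wrappers created by simd_register_skciphers_compat(), which also cover contexts where the FPU is not usable. A minimal sketch of how a caller might drive the resulting "ctr(sm4)" transform through the generic skcipher API; sm4_ctr_demo() is a hypothetical name and error handling is trimmed to the essentials.

	#include <crypto/skcipher.h>
	#include <crypto/sm4.h>
	#include <linux/scatterlist.h>
	#include <linux/err.h>

	static int sm4_ctr_demo(const u8 key[SM4_KEY_SIZE], u8 iv[SM4_BLOCK_SIZE],
				u8 *buf, unsigned int len)
	{
		struct crypto_skcipher *tfm;
		struct skcipher_request *req;
		DECLARE_CRYPTO_WAIT(wait);
		struct scatterlist sg;
		int err;

		/* Picks the highest-priority "ctr(sm4)" provider, e.g. the
		 * AVX/AVX2 drivers in these files when the CPU supports them. */
		tfm = crypto_alloc_skcipher("ctr(sm4)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		err = crypto_skcipher_setkey(tfm, key, SM4_KEY_SIZE);
		if (err)
			goto out_free_tfm;

		req = skcipher_request_alloc(tfm, GFP_KERNEL);
		if (!req) {
			err = -ENOMEM;
			goto out_free_tfm;
		}

		sg_init_one(&sg, buf, len);
		skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
					      CRYPTO_TFM_REQ_MAY_SLEEP,
					      crypto_req_done, &wait);
		skcipher_request_set_crypt(req, &sg, &sg, len, iv);

		/* CTR mode: encryption and decryption are the same operation. */
		err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);

		skcipher_request_free(req);
	out_free_tfm:
		crypto_free_skcipher(tfm);
		return err;
	}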
+static struct skcipher_alg sm4_aesni_avx_skciphers[] = { + { + .base = { + .cra_name = "__ecb(sm4)", + .cra_driver_name = "__ecb-sm4-aesni-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .walksize = 8 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = sm4_avx_ecb_encrypt, + .decrypt = sm4_avx_ecb_decrypt, + }, { + .base = { + .cra_name = "__cbc(sm4)", + .cra_driver_name = "__cbc-sm4-aesni-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .walksize = 8 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = sm4_cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base = { + .cra_name = "__cfb(sm4)", + .cra_driver_name = "__cfb-sm4-aesni-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .walksize = 8 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = sm4_cfb_encrypt, + .decrypt = cfb_decrypt, + }, { + .base = { + .cra_name = "__ctr(sm4)", + .cra_driver_name = "__ctr-sm4-aesni-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_module = THIS_MODULE, + }, + .min_keysize = SM4_KEY_SIZE, + .max_keysize = SM4_KEY_SIZE, + .ivsize = SM4_BLOCK_SIZE, + .chunksize = SM4_BLOCK_SIZE, + .walksize = 8 * SM4_BLOCK_SIZE, + .setkey = sm4_skcipher_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + } +}; + +static struct simd_skcipher_alg * +simd_sm4_aesni_avx_skciphers[ARRAY_SIZE(sm4_aesni_avx_skciphers)]; + +static int __init sm4_init(void) +{ + const char *feature_name; + + if (!boot_cpu_has(X86_FEATURE_AVX) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE)) { + pr_info("AVX or AES-NI instructions are not detected.\n"); + return -ENODEV; + } + + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); + return -ENODEV; + } + + return simd_register_skciphers_compat(sm4_aesni_avx_skciphers, + ARRAY_SIZE(sm4_aesni_avx_skciphers), + simd_sm4_aesni_avx_skciphers); +} + +static void __exit sm4_exit(void) +{ + simd_unregister_skciphers(sm4_aesni_avx_skciphers, + ARRAY_SIZE(sm4_aesni_avx_skciphers), + simd_sm4_aesni_avx_skciphers); +} + +module_init(sm4_init); +module_exit(sm4_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>"); +MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX optimized"); +MODULE_ALIAS_CRYPTO("sm4"); +MODULE_ALIAS_CRYPTO("sm4-aesni-avx"); diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index a5151393bb2f..31f9b2ec3857 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -19,11 +19,6 @@ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16 -.align 16 
-.Lxts_gf128mul_and_shl1_mask: - .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 - .text /* structure of crypto context */ @@ -272,7 +267,7 @@ SYM_FUNC_START_LOCAL(__twofish_enc_blk8) outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); - ret; + RET; SYM_FUNC_END(__twofish_enc_blk8) .align 8 @@ -312,7 +307,7 @@ SYM_FUNC_START_LOCAL(__twofish_dec_blk8) outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); - ret; + RET; SYM_FUNC_END(__twofish_dec_blk8) SYM_FUNC_START(twofish_ecb_enc_8way) @@ -332,7 +327,7 @@ SYM_FUNC_START(twofish_ecb_enc_8way) store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); FRAME_END - ret; + RET; SYM_FUNC_END(twofish_ecb_enc_8way) SYM_FUNC_START(twofish_ecb_dec_8way) @@ -352,7 +347,7 @@ SYM_FUNC_START(twofish_ecb_dec_8way) store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); FRAME_END - ret; + RET; SYM_FUNC_END(twofish_ecb_dec_8way) SYM_FUNC_START(twofish_cbc_dec_8way) @@ -377,80 +372,5 @@ SYM_FUNC_START(twofish_cbc_dec_8way) popq %r12; FRAME_END - ret; + RET; SYM_FUNC_END(twofish_cbc_dec_8way) - -SYM_FUNC_START(twofish_ctr_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (little endian, 128bit) - */ - FRAME_BEGIN - - pushq %r12; - - movq %rsi, %r11; - movq %rdx, %r12; - - load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, - RD2, RX0, RX1, RY0); - - call __twofish_enc_blk8; - - store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); - - popq %r12; - - FRAME_END - ret; -SYM_FUNC_END(twofish_ctr_8way) - -SYM_FUNC_START(twofish_xts_enc_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - movq %rsi, %r11; - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, - RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask); - - call __twofish_enc_blk8; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); - - FRAME_END - ret; -SYM_FUNC_END(twofish_xts_enc_8way) - -SYM_FUNC_START(twofish_xts_dec_8way) - /* input: - * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) - */ - FRAME_BEGIN - - movq %rsi, %r11; - - /* regs <= src, dst <= IVs, regs <= regs xor IVs */ - load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2, - RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask); - - call __twofish_dec_blk8; - - /* dst <= regs xor IVs(in dst) */ - store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); - - FRAME_END - ret; -SYM_FUNC_END(twofish_xts_dec_8way) diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S index a6f09e4f2e46..3abcad661884 100644 --- a/arch/x86/crypto/twofish-i586-asm_32.S +++ b/arch/x86/crypto/twofish-i586-asm_32.S @@ -260,7 +260,7 @@ SYM_FUNC_START(twofish_enc_blk) pop %ebx pop %ebp mov $1, %eax - ret + RET SYM_FUNC_END(twofish_enc_blk) SYM_FUNC_START(twofish_dec_blk) @@ -317,5 +317,5 @@ SYM_FUNC_START(twofish_dec_blk) pop %ebx pop %ebp mov $1, %eax - ret + RET SYM_FUNC_END(twofish_dec_blk) diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S index fc23552afe37..d2288bf38a8a 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -88,7 +88,7 @@ /* * Combined G1 & G2 function. 
Reordered with help of rotates to have moves - * at begining. + * at beginning. */ #define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \ /* G1,1 && G2,1 */ \ @@ -258,7 +258,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way) popq %rbx; popq %r12; popq %r13; - ret; + RET; .L__enc_xor3: outunpack_enc3(xor); @@ -266,7 +266,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way) popq %rbx; popq %r12; popq %r13; - ret; + RET; SYM_FUNC_END(__twofish_enc_blk_3way) SYM_FUNC_START(twofish_dec_blk_3way) @@ -301,5 +301,5 @@ SYM_FUNC_START(twofish_dec_blk_3way) popq %rbx; popq %r12; popq %r13; - ret; + RET; SYM_FUNC_END(twofish_dec_blk_3way) diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index d2e56232494a..775af290cd19 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -252,7 +252,7 @@ SYM_FUNC_START(twofish_enc_blk) popq R1 movl $1,%eax - ret + RET SYM_FUNC_END(twofish_enc_blk) SYM_FUNC_START(twofish_dec_blk) @@ -304,5 +304,5 @@ SYM_FUNC_START(twofish_dec_blk) popq R1 movl $1,%eax - ret + RET SYM_FUNC_END(twofish_dec_blk) diff --git a/arch/x86/crypto/twofish.h b/arch/x86/crypto/twofish.h new file mode 100644 index 000000000000..12df400e6d53 --- /dev/null +++ b/arch/x86/crypto/twofish.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_X86_TWOFISH_H +#define ASM_X86_TWOFISH_H + +#include <linux/crypto.h> +#include <crypto/twofish.h> +#include <crypto/b128ops.h> + +/* regular block cipher functions from twofish_x86_64 module */ +asmlinkage void twofish_enc_blk(const void *ctx, u8 *dst, const u8 *src); +asmlinkage void twofish_dec_blk(const void *ctx, u8 *dst, const u8 *src); + +/* 3-way parallel cipher functions */ +asmlinkage void __twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src, + bool xor); +asmlinkage void twofish_dec_blk_3way(const void *ctx, u8 *dst, const u8 *src); + +/* helpers from twofish_x86_64-3way module */ +extern void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src); + +#endif /* ASM_X86_TWOFISH_H */ diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 2dbc8ce3730e..3eb3440b477a 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -15,9 +15,9 @@ #include <crypto/algapi.h> #include <crypto/internal/simd.h> #include <crypto/twofish.h> -#include <crypto/xts.h> -#include <asm/crypto/glue_helper.h> -#include <asm/crypto/twofish.h> + +#include "twofish.h" +#include "ecb_cbc_helpers.h" #define TWOFISH_PARALLEL_BLOCKS 8 @@ -26,13 +26,6 @@ asmlinkage void twofish_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void twofish_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void twofish_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src); -asmlinkage void twofish_ctr_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); - -asmlinkage void twofish_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); -asmlinkage void twofish_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src, - le128 *iv); static int twofish_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) @@ -45,171 +38,38 @@ static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src) __twofish_enc_blk_3way(ctx, dst, src, false); } -static void twofish_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_enc_blk); -} - -static void twofish_xts_dec(const 
void *ctx, u8 *dst, const u8 *src, le128 *iv) -{ - glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_dec_blk); -} - -struct twofish_xts_ctx { - struct twofish_ctx tweak_ctx; - struct twofish_ctx crypt_ctx; -}; - -static int xts_twofish_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keylen) -{ - struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = xts_verify_key(tfm, key, keylen); - if (err) - return err; - - /* first half of xts-key is for crypt */ - err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); -} - -static const struct common_glue_ctx twofish_enc = { - .num_funcs = 3, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ecb = twofish_ecb_enc_8way } - }, { - .num_blocks = 3, - .fn_u = { .ecb = twofish_enc_blk_3way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = twofish_enc_blk } - } } -}; - -static const struct common_glue_ctx twofish_ctr = { - .num_funcs = 3, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ctr = twofish_ctr_8way } - }, { - .num_blocks = 3, - .fn_u = { .ctr = twofish_enc_blk_ctr_3way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = twofish_enc_blk_ctr } - } } -}; - -static const struct common_glue_ctx twofish_enc_xts = { - .num_funcs = 2, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .xts = twofish_xts_enc_8way } - }, { - .num_blocks = 1, - .fn_u = { .xts = twofish_xts_enc } - } } -}; - -static const struct common_glue_ctx twofish_dec = { - .num_funcs = 3, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ecb = twofish_ecb_dec_8way } - }, { - .num_blocks = 3, - .fn_u = { .ecb = twofish_dec_blk_3way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = twofish_dec_blk } - } } -}; - -static const struct common_glue_ctx twofish_dec_cbc = { - .num_funcs = 3, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .cbc = twofish_cbc_dec_8way } - }, { - .num_blocks = 3, - .fn_u = { .cbc = twofish_dec_blk_cbc_3way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = twofish_dec_blk } - } } -}; - -static const struct common_glue_ctx twofish_dec_xts = { - .num_funcs = 2, - .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, - - .funcs = { { - .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .xts = twofish_xts_dec_8way } - }, { - .num_blocks = 1, - .fn_u = { .xts = twofish_xts_dec } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&twofish_enc, req); + ECB_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS); + ECB_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_ecb_enc_8way); + ECB_BLOCK(3, twofish_enc_blk_3way); + ECB_BLOCK(1, twofish_enc_blk); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&twofish_dec, req); + ECB_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS); + ECB_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_ecb_dec_8way); + ECB_BLOCK(3, twofish_dec_blk_3way); + ECB_BLOCK(1, twofish_dec_blk); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req); + CBC_WALK_START(req, TF_BLOCK_SIZE, -1); + 
CBC_ENC_BLOCK(twofish_enc_blk); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&twofish_ctr, req); -} - -static int xts_encrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&twofish_enc_xts, req, twofish_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, false); -} - -static int xts_decrypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - return glue_xts_req_128bit(&twofish_dec_xts, req, twofish_enc_blk, - &ctx->tweak_ctx, &ctx->crypt_ctx, true); + CBC_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS); + CBC_DEC_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_cbc_dec_8way); + CBC_DEC_BLOCK(3, twofish_dec_blk_cbc_3way); + CBC_DEC_BLOCK(1, twofish_dec_blk); + CBC_WALK_END(); } static struct skcipher_alg twofish_algs[] = { @@ -240,35 +100,6 @@ static struct skcipher_alg twofish_algs[] = { .setkey = twofish_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "__ctr(twofish)", - .base.cra_driver_name = "__ctr-twofish-avx", - .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct twofish_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .chunksize = TF_BLOCK_SIZE, - .setkey = twofish_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, { - .base.cra_name = "__xts(twofish)", - .base.cra_driver_name = "__xts-twofish-avx", - .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, - .base.cra_blocksize = TF_BLOCK_SIZE, - .base.cra_ctxsize = sizeof(struct twofish_xts_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = 2 * TF_MIN_KEY_SIZE, - .max_keysize = 2 * TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = xts_twofish_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, }, }; diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c index 77e06c2da83d..f9c4adc27404 100644 --- a/arch/x86/crypto/twofish_glue.c +++ b/arch/x86/crypto/twofish_glue.c @@ -81,18 +81,18 @@ static struct crypto_alg alg = { } }; -static int __init init(void) +static int __init twofish_glue_init(void) { return crypto_register_alg(&alg); } -static void __exit fini(void) +static void __exit twofish_glue_fini(void) { crypto_unregister_alg(&alg); } -module_init(init); -module_exit(fini); +module_init(twofish_glue_init); +module_exit(twofish_glue_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized"); diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 768af6075479..90454cf18e0d 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -5,17 +5,16 @@ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> */ -#include <asm/crypto/glue_helper.h> -#include <asm/crypto/twofish.h> #include <crypto/algapi.h> -#include <crypto/b128ops.h> -#include <crypto/internal/skcipher.h> #include <crypto/twofish.h> #include <linux/crypto.h> #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> +#include "twofish.h" +#include 
"ecb_cbc_helpers.h" + EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way); EXPORT_SYMBOL_GPL(twofish_dec_blk_3way); @@ -30,143 +29,48 @@ static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src) __twofish_enc_blk_3way(ctx, dst, src, false); } -static inline void twofish_enc_blk_xor_3way(const void *ctx, u8 *dst, - const u8 *src) -{ - __twofish_enc_blk_3way(ctx, dst, src, true); -} - -void twofish_dec_blk_cbc_3way(const void *ctx, u8 *d, const u8 *s) +void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src) { - u128 ivs[2]; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - ivs[0] = src[0]; - ivs[1] = src[1]; + u8 buf[2][TF_BLOCK_SIZE]; + const u8 *s = src; - twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); + if (dst == src) + s = memcpy(buf, src, sizeof(buf)); + twofish_dec_blk_3way(ctx, dst, src); + crypto_xor(dst + TF_BLOCK_SIZE, s, sizeof(buf)); - u128_xor(&dst[1], &dst[1], &ivs[0]); - u128_xor(&dst[2], &dst[2], &ivs[1]); } EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way); -void twofish_enc_blk_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblk; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - if (dst != src) - *dst = *src; - - le128_to_be128(&ctrblk, iv); - le128_inc(iv); - - twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); - u128_xor(dst, dst, (u128 *)&ctrblk); -} -EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr); - -void twofish_enc_blk_ctr_3way(const void *ctx, u8 *d, const u8 *s, le128 *iv) -{ - be128 ctrblks[3]; - u128 *dst = (u128 *)d; - const u128 *src = (const u128 *)s; - - if (dst != src) { - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - } - - le128_to_be128(&ctrblks[0], iv); - le128_inc(iv); - le128_to_be128(&ctrblks[1], iv); - le128_inc(iv); - le128_to_be128(&ctrblks[2], iv); - le128_inc(iv); - - twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks); -} -EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr_3way); - -static const struct common_glue_ctx twofish_enc = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 3, - .fn_u = { .ecb = twofish_enc_blk_3way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = twofish_enc_blk } - } } -}; - -static const struct common_glue_ctx twofish_ctr = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 3, - .fn_u = { .ctr = twofish_enc_blk_ctr_3way } - }, { - .num_blocks = 1, - .fn_u = { .ctr = twofish_enc_blk_ctr } - } } -}; - -static const struct common_glue_ctx twofish_dec = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 3, - .fn_u = { .ecb = twofish_dec_blk_3way } - }, { - .num_blocks = 1, - .fn_u = { .ecb = twofish_dec_blk } - } } -}; - -static const struct common_glue_ctx twofish_dec_cbc = { - .num_funcs = 2, - .fpu_blocks_limit = -1, - - .funcs = { { - .num_blocks = 3, - .fn_u = { .cbc = twofish_dec_blk_cbc_3way } - }, { - .num_blocks = 1, - .fn_u = { .cbc = twofish_dec_blk } - } } -}; - static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&twofish_enc, req); + ECB_WALK_START(req, TF_BLOCK_SIZE, -1); + ECB_BLOCK(3, twofish_enc_blk_3way); + ECB_BLOCK(1, twofish_enc_blk); + ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_req_128bit(&twofish_dec, req); + ECB_WALK_START(req, TF_BLOCK_SIZE, -1); + ECB_BLOCK(3, twofish_dec_blk_3way); + ECB_BLOCK(1, twofish_dec_blk); + ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req); + CBC_WALK_START(req, 
TF_BLOCK_SIZE, -1); + CBC_ENC_BLOCK(twofish_enc_blk); + CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req); -} - -static int ctr_crypt(struct skcipher_request *req) -{ - return glue_ctr_req_128bit(&twofish_ctr, req); + CBC_WALK_START(req, TF_BLOCK_SIZE, -1); + CBC_DEC_BLOCK(3, twofish_dec_blk_cbc_3way); + CBC_DEC_BLOCK(1, twofish_dec_blk); + CBC_WALK_END(); } static struct skcipher_alg tf_skciphers[] = { @@ -195,20 +99,6 @@ static struct skcipher_alg tf_skciphers[] = { .setkey = twofish_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - }, { - .base.cra_name = "ctr(twofish)", - .base.cra_driver_name = "ctr-twofish-3way", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct twofish_ctx), - .base.cra_module = THIS_MODULE, - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .chunksize = TF_BLOCK_SIZE, - .setkey = twofish_setkey_skcipher, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, }, }; @@ -227,7 +117,7 @@ static bool is_blacklisted_cpu(void) * storing blocks in 64bit registers to allow three blocks to * be processed parallel. Parallel operation then allows gaining * more performance than was trade off, on out-of-order CPUs. - * However Atom does not benefit from this parallellism and + * However Atom does not benefit from this parallelism and * should be blacklisted. */ return true; @@ -250,7 +140,7 @@ static int force; module_param(force, int, 0); MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); -static int __init init(void) +static int __init twofish_3way_init(void) { if (!force && is_blacklisted_cpu()) { printk(KERN_INFO @@ -264,13 +154,13 @@ static int __init init(void) ARRAY_SIZE(tf_skciphers)); } -static void __exit fini(void) +static void __exit twofish_3way_fini(void) { crypto_unregister_skciphers(tf_skciphers, ARRAY_SIZE(tf_skciphers)); } -module_init(init); -module_exit(fini); +module_init(twofish_3way_init); +module_exit(twofish_3way_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Twofish Cipher Algorithm, 3-way parallel asm optimized"); |
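The conversions above replace the old common_glue_ctx dispatch tables with the ECB_WALK_START/ECB_BLOCK/ECB_WALK_END and CBC_WALK_START/CBC_*_BLOCK/CBC_WALK_END macros from ecb_cbc_helpers.h. For readers unfamiliar with those helpers, the following is a rough, hand-expanded equivalent of the 3-way module's ecb_decrypt(); the function name is invented for illustration, and the real macros additionally manage kernel_fpu_begin()/kernel_fpu_end() sections when a SIMD block count (rather than -1) is passed, which the plain 3-way assembler does not need:

        /*
         * Illustrative open-coding of ECB_WALK_START(req, TF_BLOCK_SIZE, -1);
         * ECB_BLOCK(3, ...); ECB_BLOCK(1, ...); ECB_WALK_END(); — the same
         * walk-and-dispatch shape used open-coded by the SM4 glue earlier
         * in this series. Not the literal macro expansion.
         */
        #include <crypto/internal/skcipher.h>
        #include <crypto/twofish.h>
        #include "twofish.h"

        static int ecb_decrypt_open_coded(struct skcipher_request *req)
        {
                struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
                struct twofish_ctx *ctx = crypto_skcipher_ctx(tfm);
                struct skcipher_walk walk;
                unsigned int nbytes;
                int err;

                err = skcipher_walk_virt(&walk, req, false);

                while ((nbytes = walk.nbytes) > 0) {
                        const u8 *src = walk.src.virt.addr;
                        u8 *dst = walk.dst.virt.addr;

                        /* widest primitive first: three blocks per call */
                        while (nbytes >= 3 * TF_BLOCK_SIZE) {
                                twofish_dec_blk_3way(ctx, dst, src);
                                src += 3 * TF_BLOCK_SIZE;
                                dst += 3 * TF_BLOCK_SIZE;
                                nbytes -= 3 * TF_BLOCK_SIZE;
                        }
                        /* then fall back to one block at a time */
                        while (nbytes >= TF_BLOCK_SIZE) {
                                twofish_dec_blk(ctx, dst, src);
                                src += TF_BLOCK_SIZE;
                                dst += TF_BLOCK_SIZE;
                                nbytes -= TF_BLOCK_SIZE;
                        }
                        err = skcipher_walk_done(&walk, nbytes);
                }

                return err;
        }

The AVX glue follows the same shape but passes TWOFISH_PARALLEL_BLOCKS instead of -1, letting the helpers wrap the eight-block assembler calls in an FPU section before falling back to the 3-way and single-block routines.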