Diffstat (limited to 'arch/x86/crypto')
65 files changed, 5569 insertions, 19428 deletions
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index c9e59589a1ce..56cfdc79e2c6 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -3,10 +3,12 @@ menu "Accelerated Cryptographic Algorithms for CPU (x86)"
 
 config CRYPTO_CURVE25519_X86
-	tristate "Public key crypto: Curve25519 (ADX)"
-	depends on X86 && 64BIT
+	tristate
+	depends on 64BIT
+	select CRYPTO_KPP
 	select CRYPTO_LIB_CURVE25519_GENERIC
 	select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+	default CRYPTO_LIB_CURVE25519_INTERNAL
 	help
 	  Curve25519 algorithm
@@ -14,24 +16,27 @@ config CRYPTO_CURVE25519_X86
 	  - ADX (large integer arithmetic)
 
 config CRYPTO_AES_NI_INTEL
-	tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTR, XTS, GCM (AES-NI)"
-	depends on X86
+	tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XCTR, XTS, GCM (AES-NI/VAES)"
 	select CRYPTO_AEAD
 	select CRYPTO_LIB_AES
+	select CRYPTO_LIB_GF128MUL
 	select CRYPTO_ALGAPI
 	select CRYPTO_SKCIPHER
-	select CRYPTO_SIMD
 	help
 	  Block cipher: AES cipher algorithms
 
 	  AEAD cipher: AES with GCM
 
-	  Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XTR, XTS
+	  Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XCTR, XTS
 
 	  Architecture: x86 (32-bit and 64-bit) using:
 	  - AES-NI (AES new instructions)
+	  - VAES (Vector AES)
+
+	  Some algorithm implementations are supported only in 64-bit builds,
+	  and some have additional prerequisites such as AVX2 or AVX512.
 
 config CRYPTO_BLOWFISH_X86_64
 	tristate "Ciphers: Blowfish, modes: ECB, CBC"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_BLOWFISH_COMMON
 	imply CRYPTO_CTR
@@ -43,7 +48,7 @@ config CRYPTO_BLOWFISH_X86_64
 
 config CRYPTO_CAMELLIA_X86_64
 	tristate "Ciphers: Camellia with modes: ECB, CBC"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	imply CRYPTO_CTR
 	help
@@ -54,10 +59,9 @@ config CRYPTO_CAMELLIA_X86_64
 
 config CRYPTO_CAMELLIA_AESNI_AVX_X86_64
 	tristate "Ciphers: Camellia with modes: ECB, CBC (AES-NI/AVX)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_CAMELLIA_X86_64
-	select CRYPTO_SIMD
 	imply CRYPTO_XTS
 	help
 	  Length-preserving ciphers: Camellia with ECB and CBC modes
@@ -68,7 +72,7 @@ config CRYPTO_CAMELLIA_AESNI_AVX_X86_64
 
 config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64
 	tristate "Ciphers: Camellia with modes: ECB, CBC (AES-NI/AVX2)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_CAMELLIA_AESNI_AVX_X86_64
 	help
 	  Length-preserving ciphers: Camellia with ECB and CBC modes
@@ -79,11 +83,10 @@ config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64
 
 config CRYPTO_CAST5_AVX_X86_64
 	tristate "Ciphers: CAST5 with modes: ECB, CBC (AVX)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_CAST5
 	select CRYPTO_CAST_COMMON
-	select CRYPTO_SIMD
 	imply CRYPTO_CTR
 	help
 	  Length-preserving ciphers: CAST5 (CAST-128) cipher algorithm
@@ -96,11 +99,10 @@ config CRYPTO_CAST5_AVX_X86_64
 
 config CRYPTO_CAST6_AVX_X86_64
 	tristate "Ciphers: CAST6 with modes: ECB, CBC (AVX)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_CAST6
 	select CRYPTO_CAST_COMMON
-	select CRYPTO_SIMD
 	imply CRYPTO_XTS
 	imply CRYPTO_CTR
 	help
@@ -114,7 +116,7 @@ config CRYPTO_CAST6_AVX_X86_64
 
 config CRYPTO_DES3_EDE_X86_64
 	tristate "Ciphers: Triple DES EDE with modes: ECB, CBC"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_LIB_DES
 	imply CRYPTO_CTR
@@ -128,10 +130,9 @@ config CRYPTO_DES3_EDE_X86_64
 
 config CRYPTO_SERPENT_SSE2_X86_64
 	tristate "Ciphers: Serpent with modes: ECB, CBC (SSE2)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_SERPENT
-	select CRYPTO_SIMD
 	imply CRYPTO_CTR
 	help
 	  Length-preserving ciphers: Serpent cipher algorithm
@@ -144,10 +145,9 @@ config CRYPTO_SERPENT_SSE2_X86_64
 
 config CRYPTO_SERPENT_SSE2_586
 	tristate "Ciphers: Serpent with modes: ECB, CBC (32-bit with SSE2)"
-	depends on X86 && !64BIT
+	depends on !64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_SERPENT
-	select CRYPTO_SIMD
 	imply CRYPTO_CTR
 	help
 	  Length-preserving ciphers: Serpent cipher algorithm
@@ -160,10 +160,9 @@ config CRYPTO_SERPENT_SSE2_586
 
 config CRYPTO_SERPENT_AVX_X86_64
 	tristate "Ciphers: Serpent with modes: ECB, CBC (AVX)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_SERPENT
-	select CRYPTO_SIMD
 	imply CRYPTO_XTS
 	imply CRYPTO_CTR
 	help
@@ -177,7 +176,7 @@ config CRYPTO_SERPENT_AVX_X86_64
 
 config CRYPTO_SERPENT_AVX2_X86_64
 	tristate "Ciphers: Serpent with modes: ECB, CBC (AVX2)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SERPENT_AVX_X86_64
 	help
 	  Length-preserving ciphers: Serpent cipher algorithm
@@ -190,9 +189,8 @@ config CRYPTO_SERPENT_AVX2_X86_64
 
 config CRYPTO_SM4_AESNI_AVX_X86_64
 	tristate "Ciphers: SM4 with modes: ECB, CBC, CTR (AES-NI/AVX)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
-	select CRYPTO_SIMD
 	select CRYPTO_ALGAPI
 	select CRYPTO_SM4
 	help
@@ -211,9 +209,8 @@ config CRYPTO_SM4_AESNI_AVX_X86_64
 
 config CRYPTO_SM4_AESNI_AVX2_X86_64
 	tristate "Ciphers: SM4 with modes: ECB, CBC, CTR (AES-NI/AVX2)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
-	select CRYPTO_SIMD
 	select CRYPTO_ALGAPI
 	select CRYPTO_SM4
 	select CRYPTO_SM4_AESNI_AVX_X86_64
@@ -233,7 +230,7 @@ config CRYPTO_SM4_AESNI_AVX2_X86_64
 
 config CRYPTO_TWOFISH_586
 	tristate "Ciphers: Twofish (32-bit)"
-	depends on (X86 || UML_X86) && !64BIT
+	depends on !64BIT
 	select CRYPTO_ALGAPI
 	select CRYPTO_TWOFISH_COMMON
 	imply CRYPTO_CTR
@@ -244,7 +241,7 @@ config CRYPTO_TWOFISH_586
 
 config CRYPTO_TWOFISH_X86_64
 	tristate "Ciphers: Twofish"
-	depends on (X86 || UML_X86) && 64BIT
+	depends on 64BIT
 	select CRYPTO_ALGAPI
 	select CRYPTO_TWOFISH_COMMON
 	imply CRYPTO_CTR
@@ -255,7 +252,7 @@ config CRYPTO_TWOFISH_X86_64
 
 config CRYPTO_TWOFISH_X86_64_3WAY
 	tristate "Ciphers: Twofish with modes: ECB, CBC (3-way parallel)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_TWOFISH_COMMON
 	select CRYPTO_TWOFISH_X86_64
@@ -270,9 +267,8 @@ config CRYPTO_TWOFISH_X86_64_3WAY
 
 config CRYPTO_TWOFISH_AVX_X86_64
 	tristate "Ciphers: Twofish with modes: ECB, CBC (AVX)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
-	select CRYPTO_SIMD
 	select CRYPTO_TWOFISH_COMMON
 	select CRYPTO_TWOFISH_X86_64
 	select CRYPTO_TWOFISH_X86_64_3WAY
@@ -288,9 +284,8 @@ config CRYPTO_TWOFISH_AVX_X86_64
 
 config CRYPTO_ARIA_AESNI_AVX_X86_64
 	tristate "Ciphers: ARIA with modes: ECB, CTR (AES-NI/AVX/GFNI)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
-	select CRYPTO_SIMD
 	select CRYPTO_ALGAPI
 	select CRYPTO_ARIA
 	help
@@ -306,9 +301,8 @@ config CRYPTO_ARIA_AESNI_AVX_X86_64
 
 config CRYPTO_ARIA_AESNI_AVX2_X86_64
 	tristate "Ciphers: ARIA with modes: ECB, CTR (AES-NI/AVX2/GFNI)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
-	select CRYPTO_SIMD
 	select CRYPTO_ALGAPI
 	select CRYPTO_ARIA
 	select CRYPTO_ARIA_AESNI_AVX_X86_64
@@ -325,9 +319,8 @@ config CRYPTO_ARIA_AESNI_AVX2_X86_64
 
 config CRYPTO_ARIA_GFNI_AVX512_X86_64
 	tristate "Ciphers: ARIA with modes: ECB, CTR (AVX512/GFNI)"
-	depends on X86 && 64BIT && AS_AVX512 && AS_GFNI
+	depends on 64BIT && AS_GFNI
 	select CRYPTO_SKCIPHER
-	select CRYPTO_SIMD
 	select CRYPTO_ALGAPI
 	select CRYPTO_ARIA
 	select CRYPTO_ARIA_AESNI_AVX_X86_64
@@ -342,36 +335,20 @@ config CRYPTO_ARIA_GFNI_AVX512_X86_64
 	  Processes 64 blocks in parallel.
 
-config CRYPTO_CHACHA20_X86_64
-	tristate "Ciphers: ChaCha20, XChaCha20, XChaCha12 (SSSE3/AVX2/AVX-512VL)"
-	depends on X86 && 64BIT
-	select CRYPTO_SKCIPHER
-	select CRYPTO_LIB_CHACHA_GENERIC
-	select CRYPTO_ARCH_HAVE_LIB_CHACHA
-	help
-	  Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
-	  stream cipher algorithms
-
-	  Architecture: x86_64 using:
-	  - SSSE3 (Supplemental SSE3)
-	  - AVX2 (Advanced Vector Extensions 2)
-	  - AVX-512VL (Advanced Vector Extensions-512VL)
-
 config CRYPTO_AEGIS128_AESNI_SSE2
-	tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE2)"
-	depends on X86 && 64BIT
+	tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE4.1)"
+	depends on 64BIT
 	select CRYPTO_AEAD
-	select CRYPTO_SIMD
 	help
 	  AEGIS-128 AEAD algorithm
 
 	  Architecture: x86_64 using:
 	  - AES-NI (AES New Instructions)
-	  - SSE2 (Streaming SIMD Extensions 2)
+	  - SSE4.1 (Streaming SIMD Extensions 4.1)
 
 config CRYPTO_NHPOLY1305_SSE2
 	tristate "Hash functions: NHPoly1305 (SSE2)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_NHPOLY1305
 	help
 	  NHPoly1305 hash function for Adiantum
@@ -381,7 +358,7 @@
 config CRYPTO_NHPOLY1305_AVX2
 	tristate "Hash functions: NHPoly1305 (AVX2)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_NHPOLY1305
 	help
 	  NHPoly1305 hash function for Adiantum
@@ -389,21 +366,9 @@
 	  Architecture: x86_64 using:
 	  - AVX2 (Advanced Vector Extensions 2)
 
-config CRYPTO_BLAKE2S_X86
-	bool "Hash functions: BLAKE2s (SSSE3/AVX-512)"
-	depends on X86 && 64BIT
-	select CRYPTO_LIB_BLAKE2S_GENERIC
-	select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	help
-	  BLAKE2s cryptographic hash function (RFC 7693)
-
-	  Architecture: x86_64 using:
-	  - SSSE3 (Supplemental SSE3)
-	  - AVX-512 (Advanced Vector Extensions-512)
-
 config CRYPTO_POLYVAL_CLMUL_NI
 	tristate "Hash functions: POLYVAL (CLMUL-NI)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_POLYVAL
 	help
 	  POLYVAL hash function for HCTR2
@@ -411,21 +376,9 @@
 	  Architecture: x86_64 using:
 	  - CLMUL-NI (carry-less multiplication new instructions)
 
-config CRYPTO_POLY1305_X86_64
-	tristate "Hash functions: Poly1305 (SSE2/AVX2)"
-	depends on X86 && 64BIT
-	select CRYPTO_LIB_POLY1305_GENERIC
-	select CRYPTO_ARCH_HAVE_LIB_POLY1305
-	help
-	  Poly1305 authenticator algorithm (RFC7539)
-
-	  Architecture: x86_64 using:
-	  - SSE2 (Streaming SIMD Extensions 2)
-	  - AVX2 (Advanced Vector Extensions 2)
-
 config CRYPTO_SHA1_SSSE3
 	tristate "Hash functions: SHA-1 (SSSE3/AVX/AVX2/SHA-NI)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SHA1
 	select CRYPTO_HASH
 	help
@@ -437,23 +390,9 @@
 	  - AVX2 (Advanced Vector Extensions 2)
 	  - SHA-NI (SHA Extensions New Instructions)
 
-config CRYPTO_SHA256_SSSE3
-	tristate "Hash functions: SHA-224 and SHA-256 (SSSE3/AVX/AVX2/SHA-NI)"
-	depends on X86 && 64BIT
-	select CRYPTO_SHA256
-	select CRYPTO_HASH
-	help
-	  SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
-	  Architecture: x86_64 using:
-	  - SSSE3 (Supplemental SSE3)
-	  - AVX (Advanced Vector Extensions)
-	  - AVX2 (Advanced Vector Extensions 2)
-	  - SHA-NI (SHA Extensions New Instructions)
-
 config CRYPTO_SHA512_SSSE3
 	tristate "Hash functions: SHA-384 and SHA-512 (SSSE3/AVX/AVX2)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SHA512
 	select CRYPTO_HASH
 	help
@@ -466,9 +405,9 @@
 
 config CRYPTO_SM3_AVX_X86_64
 	tristate "Hash functions: SM3 (AVX)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_HASH
-	select CRYPTO_SM3
+	select CRYPTO_LIB_SM3
 	help
 	  SM3 secure hash function as defined by OSCCA GM/T 0004-2012 SM3
@@ -479,7 +418,7 @@ config CRYPTO_SM3_AVX_X86_64
 
 config CRYPTO_GHASH_CLMUL_NI_INTEL
 	tristate "Hash functions: GHASH (CLMUL-NI)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_CRYPTD
 	help
 	  GCM GHASH hash function (NIST SP800-38D)
@@ -487,36 +426,4 @@
 	  Architecture: x86_64 using:
 	  - CLMUL-NI (carry-less multiplication new instructions)
 
-config CRYPTO_CRC32C_INTEL
-	tristate "CRC32c (SSE4.2/PCLMULQDQ)"
-	depends on X86
-	select CRYPTO_HASH
-	help
-	  CRC32c CRC algorithm with the iSCSI polynomial (RFC 3385 and RFC 3720)
-
-	  Architecture: x86 (32-bit and 64-bit) using:
-	  - SSE4.2 (Streaming SIMD Extensions 4.2) CRC32 instruction
-	  - PCLMULQDQ (carry-less multiplication)
-
-config CRYPTO_CRC32_PCLMUL
-	tristate "CRC32 (PCLMULQDQ)"
-	depends on X86
-	select CRYPTO_HASH
-	select CRC32
-	help
-	  CRC32 CRC algorithm (IEEE 802.3)
-
-	  Architecture: x86 (32-bit and 64-bit) using:
-	  - PCLMULQDQ (carry-less multiplication)
-
-config CRYPTO_CRCT10DIF_PCLMUL
-	tristate "CRCT10DIF (PCLMULQDQ)"
-	depends on X86 && 64BIT && CRC_T10DIF
-	select CRYPTO_HASH
-	help
-	  CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF)
-
-	  Architecture: x86_64 using:
-	  - PCLMULQDQ (carry-less multiplication)
-
 endmenu
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 9aa46093c91b..aa289a9e0153 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -42,48 +42,27 @@ cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
 
 obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o
 aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
 
-obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
-chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha_glue.o
-chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o
-
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
+aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \
+			       aes-gcm-aesni-x86_64.o \
+			       aes-xts-avx-x86_64.o
+ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy)
+aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o
+endif
 
 obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
-sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o
-sha1-ssse3-$(CONFIG_AS_SHA1_NI) += sha1_ni_asm.o
-
-obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
-sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
-sha256-ssse3-$(CONFIG_AS_SHA256_NI) += sha256_ni_asm.o
+sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o
 
 obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
 sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
 
-obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o
-libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
-
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 
 obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o
 polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o
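Editor's note: the Kconfig "depends on" lines and the Makefile ifeq guard above are build-time gates only; they decide what the toolchain is asked to assemble. Each glue module still probes the CPU at load time, as aegis128-aesni-glue.c does later in this diff. The following is a minimal sketch of that pattern (a hypothetical module, but using the real kernel helpers boot_cpu_has() and cpu_has_xfeatures()):

#include <linux/module.h>
#include <asm/cpufeature.h>	/* boot_cpu_has(), X86_FEATURE_* */
#include <asm/fpu/api.h>	/* cpu_has_xfeatures() */

static int __init example_glue_init(void)
{
	/*
	 * Built-in support is necessary but not sufficient: refuse to
	 * register the algorithms if this particular CPU lacks the
	 * instructions or cannot save the SIMD register state.
	 */
	if (!boot_cpu_has(X86_FEATURE_AES) ||
	    !boot_cpu_has(X86_FEATURE_XMM4_1) ||
	    !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
		return -ENODEV;
	/* ... crypto_register_aead()/crypto_register_skcipher() here ... */
	return 0;
}
module_init(example_glue_init);
MODULE_LICENSE("GPL");

This split is why the commit can safely drop "depends on X86" (the directory is only built on x86) while adding finer-grained assembler checks such as AS_GFNI: Kconfig decides buildability, module init decides usability.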
-obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o -crc32c-intel-y := crc32c-intel_glue.o -crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o - -obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o -crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o - -obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o -crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o - -obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o -poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o -targets += poly1305-x86_64-cryptogams.S - obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o @@ -109,10 +88,5 @@ aria-aesni-avx2-x86_64-y := aria-aesni-avx2-asm_64.o aria_aesni_avx2_glue.o obj-$(CONFIG_CRYPTO_ARIA_GFNI_AVX512_X86_64) += aria-gfni-avx512-x86_64.o aria-gfni-avx512-x86_64-y := aria-gfni-avx512-asm_64.o aria_gfni_avx512_glue.o -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $< > $@ -$(obj)/%.S: $(src)/%.pl FORCE - $(call if_changed,perlasm) - # Disable GCOV in odd or sensitive code GCOV_PROFILE_curve25519-x86_64.o := n diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index ad7f4c891625..7294dc0ee7ba 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -1,14 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * AES-NI + SSE2 implementation of AEGIS-128 + * AES-NI + SSE4.1 implementation of AEGIS-128 * * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * Copyright 2024 Google LLC */ #include <linux/linkage.h> -#include <linux/cfi_types.h> -#include <asm/frame.h> #define STATE0 %xmm0 #define STATE1 %xmm1 @@ -20,11 +19,6 @@ #define T0 %xmm6 #define T1 %xmm7 -#define STATEP %rdi -#define LEN %rsi -#define SRC %rdx -#define DST %rcx - .section .rodata.cst16.aegis128_const, "aM", @progbits, 32 .align 16 .Laegis128_const_0: @@ -34,11 +28,11 @@ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd -.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16 -.align 16 -.Laegis128_counter: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +.section .rodata.cst32.zeropad_mask, "aM", @progbits, 32 +.align 32 +.Lzeropad_mask: + .octa 0xffffffffffffffffffffffffffffffff + .octa 0 .text @@ -61,140 +55,102 @@ .endm /* - * __load_partial: internal ABI - * input: - * LEN - bytes - * SRC - src - * output: - * MSG - message block - * changed: - * T0 - * %r8 - * %r9 + * Load 1 <= LEN (%ecx) <= 15 bytes from the pointer SRC into the xmm register + * MSG and zeroize any remaining bytes. Clobbers %rax, %rcx, and %r8. 
*/ -SYM_FUNC_START_LOCAL(__load_partial) - xor %r9d, %r9d - pxor MSG, MSG - - mov LEN, %r8 - and $0x1, %r8 - jz .Lld_partial_1 - - mov LEN, %r8 - and $0x1E, %r8 - add SRC, %r8 - mov (%r8), %r9b - -.Lld_partial_1: - mov LEN, %r8 - and $0x2, %r8 - jz .Lld_partial_2 - - mov LEN, %r8 - and $0x1C, %r8 - add SRC, %r8 - shl $0x10, %r9 - mov (%r8), %r9w - -.Lld_partial_2: - mov LEN, %r8 - and $0x4, %r8 - jz .Lld_partial_4 - - mov LEN, %r8 - and $0x18, %r8 - add SRC, %r8 - shl $32, %r9 - mov (%r8), %r8d - xor %r8, %r9 - -.Lld_partial_4: - movq %r9, MSG - - mov LEN, %r8 - and $0x8, %r8 - jz .Lld_partial_8 - - mov LEN, %r8 - and $0x10, %r8 - add SRC, %r8 - pslldq $8, MSG - movq (%r8), T0 - pxor T0, MSG - -.Lld_partial_8: - RET -SYM_FUNC_END(__load_partial) +.macro load_partial + sub $8, %ecx /* LEN - 8 */ + jle .Lle8\@ + + /* Load 9 <= LEN <= 15 bytes: */ + movq (SRC), MSG /* Load first 8 bytes */ + mov (SRC, %rcx), %rax /* Load last 8 bytes */ + neg %ecx + shl $3, %ecx + shr %cl, %rax /* Discard overlapping bytes */ + pinsrq $1, %rax, MSG + jmp .Ldone\@ + +.Lle8\@: + add $4, %ecx /* LEN - 4 */ + jl .Llt4\@ + + /* Load 4 <= LEN <= 8 bytes: */ + mov (SRC), %eax /* Load first 4 bytes */ + mov (SRC, %rcx), %r8d /* Load last 4 bytes */ + jmp .Lcombine\@ + +.Llt4\@: + /* Load 1 <= LEN <= 3 bytes: */ + add $2, %ecx /* LEN - 2 */ + movzbl (SRC), %eax /* Load first byte */ + jl .Lmovq\@ + movzwl (SRC, %rcx), %r8d /* Load last 2 bytes */ +.Lcombine\@: + shl $3, %ecx + shl %cl, %r8 + or %r8, %rax /* Combine the two parts */ +.Lmovq\@: + movq %rax, MSG +.Ldone\@: +.endm /* - * __store_partial: internal ABI - * input: - * LEN - bytes - * DST - dst - * output: - * T0 - message block - * changed: - * %r8 - * %r9 - * %r10 + * Store 1 <= LEN (%ecx) <= 15 bytes from the xmm register \msg to the pointer + * DST. Clobbers %rax, %rcx, and %r8. */ -SYM_FUNC_START_LOCAL(__store_partial) - mov LEN, %r8 - mov DST, %r9 - - movq T0, %r10 - - cmp $8, %r8 - jl .Lst_partial_8 - - mov %r10, (%r9) - psrldq $8, T0 - movq T0, %r10 - - sub $8, %r8 - add $8, %r9 - -.Lst_partial_8: - cmp $4, %r8 - jl .Lst_partial_4 - - mov %r10d, (%r9) - shr $32, %r10 - - sub $4, %r8 - add $4, %r9 - -.Lst_partial_4: - cmp $2, %r8 - jl .Lst_partial_2 - - mov %r10w, (%r9) - shr $0x10, %r10 - - sub $2, %r8 - add $2, %r9 - -.Lst_partial_2: - cmp $1, %r8 - jl .Lst_partial_1 - - mov %r10b, (%r9) - -.Lst_partial_1: - RET -SYM_FUNC_END(__store_partial) +.macro store_partial msg + sub $8, %ecx /* LEN - 8 */ + jl .Llt8\@ + + /* Store 8 <= LEN <= 15 bytes: */ + pextrq $1, \msg, %rax + mov %ecx, %r8d + shl $3, %ecx + ror %cl, %rax + mov %rax, (DST, %r8) /* Store last LEN - 8 bytes */ + movq \msg, (DST) /* Store first 8 bytes */ + jmp .Ldone\@ + +.Llt8\@: + add $4, %ecx /* LEN - 4 */ + jl .Llt4\@ + + /* Store 4 <= LEN <= 7 bytes: */ + pextrd $1, \msg, %eax + mov %ecx, %r8d + shl $3, %ecx + ror %cl, %eax + mov %eax, (DST, %r8) /* Store last LEN - 4 bytes */ + movd \msg, (DST) /* Store first 4 bytes */ + jmp .Ldone\@ + +.Llt4\@: + /* Store 1 <= LEN <= 3 bytes: */ + pextrb $0, \msg, 0(DST) + cmp $-2, %ecx /* LEN - 4 == -2, i.e. LEN == 2? 
*/ + jl .Ldone\@ + pextrb $1, \msg, 1(DST) + je .Ldone\@ + pextrb $2, \msg, 2(DST) +.Ldone\@: +.endm /* - * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv); + * void aegis128_aesni_init(struct aegis_state *state, + * const struct aegis_block *key, + * const u8 iv[AEGIS128_NONCE_SIZE]); */ -SYM_FUNC_START(crypto_aegis128_aesni_init) - FRAME_BEGIN +SYM_FUNC_START(aegis128_aesni_init) + .set STATEP, %rdi + .set KEYP, %rsi + .set IVP, %rdx /* load IV: */ - movdqu (%rdx), T1 + movdqu (IVP), T1 /* load key: */ - movdqa (%rsi), KEY + movdqa (KEYP), KEY pxor KEY, T1 movdqa T1, STATE0 movdqa KEY, STATE3 @@ -224,20 +180,22 @@ SYM_FUNC_START(crypto_aegis128_aesni_init) movdqu STATE2, 0x20(STATEP) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) - - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_init) +SYM_FUNC_END(aegis128_aesni_init) /* - * void crypto_aegis128_aesni_ad(void *state, unsigned int length, - * const void *data); + * void aegis128_aesni_ad(struct aegis_state *state, const u8 *data, + * unsigned int len); + * + * len must be a multiple of 16. */ -SYM_FUNC_START(crypto_aegis128_aesni_ad) - FRAME_BEGIN +SYM_FUNC_START(aegis128_aesni_ad) + .set STATEP, %rdi + .set SRC, %rsi + .set LEN, %edx - cmp $0x10, LEN - jb .Lad_out + test LEN, LEN + jz .Lad_out /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -246,89 +204,40 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu 0x30(STATEP), STATE3 movdqu 0x40(STATEP), STATE4 - mov SRC, %r8 - and $0xF, %r8 - jnz .Lad_u_loop - -.align 8 -.Lad_a_loop: - movdqa 0x00(SRC), MSG - aegis128_update - pxor MSG, STATE4 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_1 - - movdqa 0x10(SRC), MSG - aegis128_update - pxor MSG, STATE3 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_2 - - movdqa 0x20(SRC), MSG - aegis128_update - pxor MSG, STATE2 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_3 - - movdqa 0x30(SRC), MSG - aegis128_update - pxor MSG, STATE1 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_4 - - movdqa 0x40(SRC), MSG - aegis128_update - pxor MSG, STATE0 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_0 - - add $0x50, SRC - jmp .Lad_a_loop - .align 8 -.Lad_u_loop: +.Lad_loop: movdqu 0x00(SRC), MSG aegis128_update pxor MSG, STATE4 sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_1 + jz .Lad_out_1 movdqu 0x10(SRC), MSG aegis128_update pxor MSG, STATE3 sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_2 + jz .Lad_out_2 movdqu 0x20(SRC), MSG aegis128_update pxor MSG, STATE2 sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_3 + jz .Lad_out_3 movdqu 0x30(SRC), MSG aegis128_update pxor MSG, STATE1 sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_4 + jz .Lad_out_4 movdqu 0x40(SRC), MSG aegis128_update pxor MSG, STATE0 sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_0 + jz .Lad_out_0 add $0x50, SRC - jmp .Lad_u_loop + jmp .Lad_loop /* store the state: */ .Lad_out_0: @@ -337,7 +246,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE2, 0x20(STATEP) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) - FRAME_END RET .Lad_out_1: @@ -346,7 +254,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE1, 0x20(STATEP) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) - FRAME_END RET .Lad_out_2: @@ -355,7 +262,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE0, 0x20(STATEP) movdqu STATE1, 0x30(STATEP) movdqu STATE2, 0x40(STATEP) - FRAME_END RET .Lad_out_3: @@ -364,7 +270,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE4, 0x20(STATEP) movdqu STATE0, 0x30(STATEP) movdqu STATE1, 0x40(STATEP) - FRAME_END RET 
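Editor's note: the aegis128_update macro invoked throughout this function is unchanged by the patch, so its definition does not appear in the diff. What it computes is the AEGIS-128 state update. A C model of one update using AES-NI intrinsics, written from the AEGIS-128 specification rather than taken from this patch:

#include <wmmintrin.h>	/* _mm_aesenc_si128(); requires AES-NI */

/* StateUpdate128: S'[0] = AESRound(S[4], S[0] ^ m), S'[i] = AESRound(S[i-1], S[i]). */
static inline void aegis128_update_model(__m128i s[5], __m128i m)
{
	__m128i s4_old = s[4];

	/* Walk downwards so each step still reads the old value of s[i-1]. */
	s[4] = _mm_aesenc_si128(s[3], s[4]);
	s[3] = _mm_aesenc_si128(s[2], s[3]);
	s[2] = _mm_aesenc_si128(s[1], s[2]);
	s[1] = _mm_aesenc_si128(s[0], s[1]);
	s[0] = _mm_aesenc_si128(s4_old, _mm_xor_si128(s[0], m));
}

The assembly needs neither the temporary nor any register-to-register moves: it renames the five STATE registers on each block (note the rotated macro arguments), and since the naming only lines up again after five blocks, the loop is unrolled five times with the five exit labels .Lad_out_0 through .Lad_out_4, each storing the state back in its own order. XORing the message in afterwards with pxor, as the kernel code does, is equivalent to folding it into the aesenc round-key operand, because that operand is simply XORed in at the end of the AES round.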
.Lad_out_4: @@ -373,41 +278,38 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE3, 0x20(STATEP) movdqu STATE4, 0x30(STATEP) movdqu STATE0, 0x40(STATEP) - FRAME_END - RET - .Lad_out: - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_ad) +SYM_FUNC_END(aegis128_aesni_ad) -.macro encrypt_block a s0 s1 s2 s3 s4 i - movdq\a (\i * 0x10)(SRC), MSG +.macro encrypt_block s0 s1 s2 s3 s4 i + movdqu (\i * 0x10)(SRC), MSG movdqa MSG, T0 pxor \s1, T0 pxor \s4, T0 movdqa \s2, T1 pand \s3, T1 pxor T1, T0 - movdq\a T0, (\i * 0x10)(DST) + movdqu T0, (\i * 0x10)(DST) aegis128_update pxor MSG, \s4 sub $0x10, LEN - cmp $0x10, LEN - jl .Lenc_out_\i + jz .Lenc_out_\i .endm /* - * void crypto_aegis128_aesni_enc(void *state, unsigned int length, - * const void *src, void *dst); + * void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, u8 *dst, + * unsigned int len); + * + * len must be nonzero and a multiple of 16. */ -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) - FRAME_BEGIN - - cmp $0x10, LEN - jb .Lenc_out +SYM_FUNC_START(aegis128_aesni_enc) + .set STATEP, %rdi + .set SRC, %rsi + .set DST, %rdx + .set LEN, %ecx /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -416,34 +318,17 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu 0x30(STATEP), STATE3 movdqu 0x40(STATEP), STATE4 - mov SRC, %r8 - or DST, %r8 - and $0xF, %r8 - jnz .Lenc_u_loop - .align 8 -.Lenc_a_loop: - encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 - encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 - encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 - encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 - encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 +.Lenc_loop: + encrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0 + encrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1 + encrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2 + encrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3 + encrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4 add $0x50, SRC add $0x50, DST - jmp .Lenc_a_loop - -.align 8 -.Lenc_u_loop: - encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 - encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 - encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 - encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 - encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 - - add $0x50, SRC - add $0x50, DST - jmp .Lenc_u_loop + jmp .Lenc_loop /* store the state: */ .Lenc_out_0: @@ -452,7 +337,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE1, 0x20(STATEP) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) - FRAME_END RET .Lenc_out_1: @@ -461,7 +345,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE0, 0x20(STATEP) movdqu STATE1, 0x30(STATEP) movdqu STATE2, 0x40(STATEP) - FRAME_END RET .Lenc_out_2: @@ -470,7 +353,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE4, 0x20(STATEP) movdqu STATE0, 0x30(STATEP) movdqu STATE1, 0x40(STATEP) - FRAME_END RET .Lenc_out_3: @@ -479,7 +361,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE3, 0x20(STATEP) movdqu STATE4, 0x30(STATEP) movdqu STATE0, 0x40(STATEP) - FRAME_END RET .Lenc_out_4: @@ -488,20 +369,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE2, 0x20(STATEP) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) - FRAME_END - RET - .Lenc_out: - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_enc) +SYM_FUNC_END(aegis128_aesni_enc) /* - * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length, - * const void *src, void *dst); + * void 
aegis128_aesni_enc_tail(struct aegis_state *state, const u8 *src, + * u8 *dst, unsigned int len); */ -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) - FRAME_BEGIN +SYM_FUNC_START(aegis128_aesni_enc_tail) + .set STATEP, %rdi + .set SRC, %rsi + .set DST, %rdx + .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */ /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -511,7 +391,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) movdqu 0x40(STATEP), STATE4 /* encrypt message: */ - call __load_partial + mov LEN, %r9d + load_partial movdqa MSG, T0 pxor STATE1, T0 @@ -520,7 +401,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) pand STATE3, T1 pxor T1, T0 - call __store_partial + mov %r9d, LEN + store_partial T0 aegis128_update pxor MSG, STATE4 @@ -531,37 +413,36 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) movdqu STATE1, 0x20(STATEP) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) - - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_enc_tail) +SYM_FUNC_END(aegis128_aesni_enc_tail) -.macro decrypt_block a s0 s1 s2 s3 s4 i - movdq\a (\i * 0x10)(SRC), MSG +.macro decrypt_block s0 s1 s2 s3 s4 i + movdqu (\i * 0x10)(SRC), MSG pxor \s1, MSG pxor \s4, MSG movdqa \s2, T1 pand \s3, T1 pxor T1, MSG - movdq\a MSG, (\i * 0x10)(DST) + movdqu MSG, (\i * 0x10)(DST) aegis128_update pxor MSG, \s4 sub $0x10, LEN - cmp $0x10, LEN - jl .Ldec_out_\i + jz .Ldec_out_\i .endm /* - * void crypto_aegis128_aesni_dec(void *state, unsigned int length, - * const void *src, void *dst); + * void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, u8 *dst, + * unsigned int len); + * + * len must be nonzero and a multiple of 16. */ -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) - FRAME_BEGIN - - cmp $0x10, LEN - jb .Ldec_out +SYM_FUNC_START(aegis128_aesni_dec) + .set STATEP, %rdi + .set SRC, %rsi + .set DST, %rdx + .set LEN, %ecx /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -570,34 +451,17 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu 0x30(STATEP), STATE3 movdqu 0x40(STATEP), STATE4 - mov SRC, %r8 - or DST, %r8 - and $0xF, %r8 - jnz .Ldec_u_loop - .align 8 -.Ldec_a_loop: - decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 - decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 - decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 - decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 - decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 +.Ldec_loop: + decrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0 + decrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1 + decrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2 + decrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3 + decrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4 add $0x50, SRC add $0x50, DST - jmp .Ldec_a_loop - -.align 8 -.Ldec_u_loop: - decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 - decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 - decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 - decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 - decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 - - add $0x50, SRC - add $0x50, DST - jmp .Ldec_u_loop + jmp .Ldec_loop /* store the state: */ .Ldec_out_0: @@ -606,7 +470,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE1, 0x20(STATEP) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) - FRAME_END RET .Ldec_out_1: @@ -615,7 +478,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE0, 0x20(STATEP) movdqu STATE1, 0x30(STATEP) movdqu STATE2, 0x40(STATEP) - FRAME_END RET .Ldec_out_2: @@ -624,7 +486,6 @@ 
SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE4, 0x20(STATEP) movdqu STATE0, 0x30(STATEP) movdqu STATE1, 0x40(STATEP) - FRAME_END RET .Ldec_out_3: @@ -633,7 +494,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE3, 0x20(STATEP) movdqu STATE4, 0x30(STATEP) movdqu STATE0, 0x40(STATEP) - FRAME_END RET .Ldec_out_4: @@ -642,20 +502,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE2, 0x20(STATEP) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) - FRAME_END - RET - .Ldec_out: - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_dec) +SYM_FUNC_END(aegis128_aesni_dec) /* - * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length, - * const void *src, void *dst); + * void aegis128_aesni_dec_tail(struct aegis_state *state, const u8 *src, + * u8 *dst, unsigned int len); */ -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) - FRAME_BEGIN +SYM_FUNC_START(aegis128_aesni_dec_tail) + .set STATEP, %rdi + .set SRC, %rsi + .set DST, %rdx + .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */ /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -665,7 +524,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) movdqu 0x40(STATEP), STATE4 /* decrypt message: */ - call __load_partial + mov LEN, %r9d + load_partial pxor STATE1, MSG pxor STATE4, MSG @@ -673,17 +533,13 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) pand STATE3, T1 pxor T1, MSG - movdqa MSG, T0 - call __store_partial + mov %r9d, LEN + store_partial MSG /* mask with byte count: */ - movq LEN, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - movdqa .Laegis128_counter(%rip), T1 - pcmpgtb T1, T0 + lea .Lzeropad_mask+16(%rip), %rax + sub %r9, %rax + movdqu (%rax), T0 pand T0, MSG aegis128_update @@ -695,17 +551,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) movdqu STATE1, 0x20(STATEP) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) - - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_dec_tail) +SYM_FUNC_END(aegis128_aesni_dec_tail) /* - * void crypto_aegis128_aesni_final(void *state, void *tag_xor, - * u64 assoclen, u64 cryptlen); + * void aegis128_aesni_final(struct aegis_state *state, + * struct aegis_block *tag_xor, + * unsigned int assoclen, unsigned int cryptlen); */ -SYM_FUNC_START(crypto_aegis128_aesni_final) - FRAME_BEGIN +SYM_FUNC_START(aegis128_aesni_final) + .set STATEP, %rdi + .set TAG_XOR, %rsi + .set ASSOCLEN, %edx + .set CRYPTLEN, %ecx /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -715,10 +573,8 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) movdqu 0x40(STATEP), STATE4 /* prepare length block: */ - movq %rdx, MSG - movq %rcx, T0 - pslldq $8, T0 - pxor T0, MSG + movd ASSOCLEN, MSG + pinsrd $2, CRYPTLEN, MSG psllq $3, MSG /* multiply by 8 (to get bit count) */ pxor STATE3, MSG @@ -733,7 +589,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) aegis128_update; pxor MSG, STATE3 /* xor tag: */ - movdqu (%rsi), MSG + movdqu (TAG_XOR), MSG pxor STATE0, MSG pxor STATE1, MSG @@ -741,8 +597,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) pxor STATE3, MSG pxor STATE4, MSG - movdqu MSG, (%rsi) - - FRAME_END + movdqu MSG, (TAG_XOR) RET -SYM_FUNC_END(crypto_aegis128_aesni_final) +SYM_FUNC_END(aegis128_aesni_final) diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 4623189000d8..f1b6d40154e3 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -1,14 +1,13 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * The 
AEGIS-128 Authenticated-Encryption Algorithm - * Glue for AES-NI + SSE2 implementation + * Glue for AES-NI + SSE4.1 implementation * * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. */ #include <crypto/internal/aead.h> -#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/module.h> @@ -23,27 +22,6 @@ #define AEGIS128_MIN_AUTH_SIZE 8 #define AEGIS128_MAX_AUTH_SIZE 16 -asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv); - -asmlinkage void crypto_aegis128_aesni_ad( - void *state, unsigned int length, const void *data); - -asmlinkage void crypto_aegis128_aesni_enc( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128_aesni_dec( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128_aesni_enc_tail( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128_aesni_dec_tail( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128_aesni_final( - void *state, void *tag_xor, unsigned int cryptlen, - unsigned int assoclen); - struct aegis_block { u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN); }; @@ -56,15 +34,31 @@ struct aegis_ctx { struct aegis_block key; }; -struct aegis_crypt_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); +asmlinkage void aegis128_aesni_init(struct aegis_state *state, + const struct aegis_block *key, + const u8 iv[AEGIS128_NONCE_SIZE]); - void (*crypt_blocks)(void *state, unsigned int length, const void *src, - void *dst); - void (*crypt_tail)(void *state, unsigned int length, const void *src, - void *dst); -}; +asmlinkage void aegis128_aesni_ad(struct aegis_state *state, const u8 *data, + unsigned int len); + +asmlinkage void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, + u8 *dst, unsigned int len); + +asmlinkage void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, + u8 *dst, unsigned int len); + +asmlinkage void aegis128_aesni_enc_tail(struct aegis_state *state, + const u8 *src, u8 *dst, + unsigned int len); + +asmlinkage void aegis128_aesni_dec_tail(struct aegis_state *state, + const u8 *src, u8 *dst, + unsigned int len); + +asmlinkage void aegis128_aesni_final(struct aegis_state *state, + struct aegis_block *tag_xor, + unsigned int assoclen, + unsigned int cryptlen); static void crypto_aegis128_aesni_process_ad( struct aegis_state *state, struct scatterlist *sg_src, @@ -76,25 +70,23 @@ static void crypto_aegis128_aesni_process_ad( scatterwalk_start(&walk, sg_src); while (assoclen != 0) { - unsigned int size = scatterwalk_clamp(&walk, assoclen); + unsigned int size = scatterwalk_next(&walk, assoclen); + const u8 *src = walk.addr; unsigned int left = size; - void *mapped = scatterwalk_map(&walk); - const u8 *src = (const u8 *)mapped; if (pos + size >= AEGIS128_BLOCK_SIZE) { if (pos > 0) { unsigned int fill = AEGIS128_BLOCK_SIZE - pos; memcpy(buf.bytes + pos, src, fill); - crypto_aegis128_aesni_ad(state, - AEGIS128_BLOCK_SIZE, - buf.bytes); + aegis128_aesni_ad(state, buf.bytes, + AEGIS128_BLOCK_SIZE); pos = 0; left -= fill; src += fill; } - crypto_aegis128_aesni_ad(state, left, src); - + aegis128_aesni_ad(state, src, + left & ~(AEGIS128_BLOCK_SIZE - 1)); src += left & ~(AEGIS128_BLOCK_SIZE - 1); left &= AEGIS128_BLOCK_SIZE - 1; } @@ -103,31 
+95,42 @@ static void crypto_aegis128_aesni_process_ad( pos += left; assoclen -= size; - scatterwalk_unmap(mapped); - scatterwalk_advance(&walk, size); - scatterwalk_done(&walk, 0, assoclen); + scatterwalk_done_src(&walk, size); } if (pos > 0) { memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos); - crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes); + aegis128_aesni_ad(state, buf.bytes, AEGIS128_BLOCK_SIZE); } } -static void crypto_aegis128_aesni_process_crypt( - struct aegis_state *state, struct skcipher_walk *walk, - const struct aegis_crypt_ops *ops) +static __always_inline void +crypto_aegis128_aesni_process_crypt(struct aegis_state *state, + struct skcipher_walk *walk, bool enc) { while (walk->nbytes >= AEGIS128_BLOCK_SIZE) { - ops->crypt_blocks(state, - round_down(walk->nbytes, AEGIS128_BLOCK_SIZE), - walk->src.virt.addr, walk->dst.virt.addr); + if (enc) + aegis128_aesni_enc(state, walk->src.virt.addr, + walk->dst.virt.addr, + round_down(walk->nbytes, + AEGIS128_BLOCK_SIZE)); + else + aegis128_aesni_dec(state, walk->src.virt.addr, + walk->dst.virt.addr, + round_down(walk->nbytes, + AEGIS128_BLOCK_SIZE)); skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE); } if (walk->nbytes) { - ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr, - walk->dst.virt.addr); + if (enc) + aegis128_aesni_enc_tail(state, walk->src.virt.addr, + walk->dst.virt.addr, + walk->nbytes); + else + aegis128_aesni_dec_tail(state, walk->src.virt.addr, + walk->dst.virt.addr, + walk->nbytes); skcipher_walk_done(walk, 0); } } @@ -162,42 +165,39 @@ static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm, return 0; } -static void crypto_aegis128_aesni_crypt(struct aead_request *req, - struct aegis_block *tag_xor, - unsigned int cryptlen, - const struct aegis_crypt_ops *ops) +static __always_inline void +crypto_aegis128_aesni_crypt(struct aead_request *req, + struct aegis_block *tag_xor, + unsigned int cryptlen, bool enc) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm); struct skcipher_walk walk; struct aegis_state state; - ops->skcipher_walk_init(&walk, req, true); + if (enc) + skcipher_walk_aead_encrypt(&walk, req, true); + else + skcipher_walk_aead_decrypt(&walk, req, true); kernel_fpu_begin(); - crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv); + aegis128_aesni_init(&state, &ctx->key, req->iv); crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen); - crypto_aegis128_aesni_process_crypt(&state, &walk, ops); - crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); + crypto_aegis128_aesni_process_crypt(&state, &walk, enc); + aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); kernel_fpu_end(); } static int crypto_aegis128_aesni_encrypt(struct aead_request *req) { - static const struct aegis_crypt_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_blocks = crypto_aegis128_aesni_enc, - .crypt_tail = crypto_aegis128_aesni_enc_tail, - }; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aegis_block tag = {}; unsigned int authsize = crypto_aead_authsize(tfm); unsigned int cryptlen = req->cryptlen; - crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); + crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true); scatterwalk_map_and_copy(tag.bytes, req->dst, req->assoclen + cryptlen, authsize, 1); @@ -208,12 +208,6 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req) { static const struct aegis_block zeros = {}; - static const 
struct aegis_crypt_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_blocks = crypto_aegis128_aesni_dec, - .crypt_tail = crypto_aegis128_aesni_dec_tail, - }; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aegis_block tag; unsigned int authsize = crypto_aead_authsize(tfm); @@ -222,63 +216,47 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req) scatterwalk_map_and_copy(tag.bytes, req->src, req->assoclen + cryptlen, authsize, 0); - crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); + crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false); return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0; } -static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead) -{ - return 0; -} - -static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead) -{ -} - static struct aead_alg crypto_aegis128_aesni_alg = { .setkey = crypto_aegis128_aesni_setkey, .setauthsize = crypto_aegis128_aesni_setauthsize, .encrypt = crypto_aegis128_aesni_encrypt, .decrypt = crypto_aegis128_aesni_decrypt, - .init = crypto_aegis128_aesni_init_tfm, - .exit = crypto_aegis128_aesni_exit_tfm, .ivsize = AEGIS128_NONCE_SIZE, .maxauthsize = AEGIS128_MAX_AUTH_SIZE, .chunksize = AEGIS128_BLOCK_SIZE, .base = { - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct aegis_ctx) + __alignof__(struct aegis_ctx), - .cra_alignmask = 0, .cra_priority = 400, - .cra_name = "__aegis128", - .cra_driver_name = "__aegis128-aesni", + .cra_name = "aegis128", + .cra_driver_name = "aegis128-aesni", .cra_module = THIS_MODULE, } }; -static struct simd_aead_alg *simd_alg; - static int __init crypto_aegis128_aesni_module_init(void) { - if (!boot_cpu_has(X86_FEATURE_XMM2) || + if (!boot_cpu_has(X86_FEATURE_XMM4_1) || !boot_cpu_has(X86_FEATURE_AES) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return simd_register_aeads_compat(&crypto_aegis128_aesni_alg, 1, - &simd_alg); + return crypto_register_aead(&crypto_aegis128_aesni_alg); } static void __exit crypto_aegis128_aesni_module_exit(void) { - simd_unregister_aeads(&crypto_aegis128_aesni_alg, 1, &simd_alg); + crypto_unregister_aead(&crypto_aegis128_aesni_alg); } module_init(crypto_aegis128_aesni_module_init); @@ -286,6 +264,6 @@ module_exit(crypto_aegis128_aesni_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); -MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation"); +MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE4.1 implementation"); MODULE_ALIAS_CRYPTO("aegis128"); MODULE_ALIAS_CRYPTO("aegis128-aesni"); diff --git a/arch/x86/crypto/aes-ctr-avx-x86_64.S b/arch/x86/crypto/aes-ctr-avx-x86_64.S new file mode 100644 index 000000000000..bbbfd80f5a50 --- /dev/null +++ b/arch/x86/crypto/aes-ctr-avx-x86_64.S @@ -0,0 +1,573 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// Copyright 2025 Google LLC +// +// Author: Eric Biggers <ebiggers@google.com> +// +// This file is dual-licensed, meaning that you can use it under your choice of +// either of the following two licenses: +// +// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +// or +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +//------------------------------------------------------------------------------ +// +// This file contains x86_64 assembly implementations of AES-CTR and AES-XCTR +// using the following sets of CPU features: +// - AES-NI && AVX +// - VAES && AVX2 +// - VAES && AVX512BW && AVX512VL && BMI2 +// +// See the function definitions at the bottom of the file for more information. + +#include <linux/linkage.h> +#include <linux/cfi_types.h> + +.section .rodata +.p2align 4 + +.Lbswap_mask: + .octa 0x000102030405060708090a0b0c0d0e0f + +.Lctr_pattern: + .quad 0, 0 +.Lone: + .quad 1, 0 +.Ltwo: + .quad 2, 0 + .quad 3, 0 + +.Lfour: + .quad 4, 0 + +.text + +// Move a vector between memory and a register. +.macro _vmovdqu src, dst +.if VL < 64 + vmovdqu \src, \dst +.else + vmovdqu8 \src, \dst +.endif +.endm + +// Move a vector between registers. +.macro _vmovdqa src, dst +.if VL < 64 + vmovdqa \src, \dst +.else + vmovdqa64 \src, \dst +.endif +.endm + +// Broadcast a 128-bit value from memory to all 128-bit lanes of a vector +// register. +.macro _vbroadcast128 src, dst +.if VL == 16 + vmovdqu \src, \dst +.elseif VL == 32 + vbroadcasti128 \src, \dst +.else + vbroadcasti32x4 \src, \dst +.endif +.endm + +// XOR two vectors together. +.macro _vpxor src1, src2, dst +.if VL < 64 + vpxor \src1, \src2, \dst +.else + vpxord \src1, \src2, \dst +.endif +.endm + +// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst +// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}. +.macro _load_partial_block src, dst, tmp64, tmp32 + sub $8, %ecx // LEN - 8 + jle .Lle8\@ + + // Load 9 <= LEN <= 15 bytes. + vmovq (\src), \dst // Load first 8 bytes + mov (\src, %rcx), %rax // Load last 8 bytes + neg %ecx + shl $3, %ecx + shr %cl, %rax // Discard overlapping bytes + vpinsrq $1, %rax, \dst, \dst + jmp .Ldone\@ + +.Lle8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Load 4 <= LEN <= 8 bytes. + mov (\src), %eax // Load first 4 bytes + mov (\src, %rcx), \tmp32 // Load last 4 bytes + jmp .Lcombine\@ + +.Llt4\@: + // Load 1 <= LEN <= 3 bytes. 
+ add $2, %ecx // LEN - 2 + movzbl (\src), %eax // Load first byte + jl .Lmovq\@ + movzwl (\src, %rcx), \tmp32 // Load last 2 bytes +.Lcombine\@: + shl $3, %ecx + shl %cl, \tmp64 + or \tmp64, %rax // Combine the two parts +.Lmovq\@: + vmovq %rax, \dst +.Ldone\@: +.endm + +// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst. +// Clobbers %rax, %rcx, and \tmp{64,32}. +.macro _store_partial_block src, dst, tmp64, tmp32 + sub $8, %ecx // LEN - 8 + jl .Llt8\@ + + // Store 8 <= LEN <= 15 bytes. + vpextrq $1, \src, %rax + mov %ecx, \tmp32 + shl $3, %ecx + ror %cl, %rax + mov %rax, (\dst, \tmp64) // Store last LEN - 8 bytes + vmovq \src, (\dst) // Store first 8 bytes + jmp .Ldone\@ + +.Llt8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Store 4 <= LEN <= 7 bytes. + vpextrd $1, \src, %eax + mov %ecx, \tmp32 + shl $3, %ecx + ror %cl, %eax + mov %eax, (\dst, \tmp64) // Store last LEN - 4 bytes + vmovd \src, (\dst) // Store first 4 bytes + jmp .Ldone\@ + +.Llt4\@: + // Store 1 <= LEN <= 3 bytes. + vpextrb $0, \src, 0(\dst) + cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2? + jl .Ldone\@ + vpextrb $1, \src, 1(\dst) + je .Ldone\@ + vpextrb $2, \src, 2(\dst) +.Ldone\@: +.endm + +// Prepare the next two vectors of AES inputs in AESDATA\i0 and AESDATA\i1, and +// XOR each with the zero-th round key. Also update LE_CTR if !\final. +.macro _prepare_2_ctr_vecs is_xctr, i0, i1, final=0 +.if \is_xctr + .if USE_AVX512 + vmovdqa64 LE_CTR, AESDATA\i0 + vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i0 + .else + vpxor XCTR_IV, LE_CTR, AESDATA\i0 + vpxor RNDKEY0, AESDATA\i0, AESDATA\i0 + .endif + vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1 + + .if USE_AVX512 + vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i1 + .else + vpxor XCTR_IV, AESDATA\i1, AESDATA\i1 + vpxor RNDKEY0, AESDATA\i1, AESDATA\i1 + .endif +.else + vpshufb BSWAP_MASK, LE_CTR, AESDATA\i0 + _vpxor RNDKEY0, AESDATA\i0, AESDATA\i0 + vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1 + vpshufb BSWAP_MASK, AESDATA\i1, AESDATA\i1 + _vpxor RNDKEY0, AESDATA\i1, AESDATA\i1 +.endif +.if !\final + vpaddq LE_CTR_INC2, LE_CTR, LE_CTR +.endif +.endm + +// Do all AES rounds on the data in the given AESDATA vectors, excluding the +// zero-th and last rounds. +.macro _aesenc_loop vecs:vararg + mov KEY, %rax +1: + _vbroadcast128 (%rax), RNDKEY +.irp i, \vecs + vaesenc RNDKEY, AESDATA\i, AESDATA\i +.endr + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b +.endm + +// Finalize the keystream blocks in the given AESDATA vectors by doing the last +// AES round, then XOR those keystream blocks with the corresponding data. +// Reduce latency by doing the XOR before the vaesenclast, utilizing the +// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). +.macro _aesenclast_and_xor vecs:vararg +.irp i, \vecs + _vpxor \i*VL(SRC), RNDKEYLAST, RNDKEY + vaesenclast RNDKEY, AESDATA\i, AESDATA\i +.endr +.irp i, \vecs + _vmovdqu AESDATA\i, \i*VL(DST) +.endr +.endm + +// XOR the keystream blocks in the specified AESDATA vectors with the +// corresponding data. +.macro _xor_data vecs:vararg +.irp i, \vecs + _vpxor \i*VL(SRC), AESDATA\i, AESDATA\i +.endr +.irp i, \vecs + _vmovdqu AESDATA\i, \i*VL(DST) +.endr +.endm + +.macro _aes_ctr_crypt is_xctr + + // Define register aliases V0-V15 that map to the xmm, ymm, or zmm + // registers according to the selected Vector Length (VL). 
+.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + .if VL == 16 + .set V\i, %xmm\i + .elseif VL == 32 + .set V\i, %ymm\i + .elseif VL == 64 + .set V\i, %zmm\i + .else + .error "Unsupported Vector Length (VL)" + .endif +.endr + + // Function arguments + .set KEY, %rdi // Initially points to the start of the + // crypto_aes_ctx, then is advanced to + // point to the index 1 round key + .set KEY32, %edi // Available as temp register after all + // keystream blocks have been generated + .set SRC, %rsi // Pointer to next source data + .set DST, %rdx // Pointer to next destination data + .set LEN, %ecx // Remaining length in bytes. + // Note: _load_partial_block relies on + // this being in %ecx. + .set LEN64, %rcx // Zero-extend LEN before using! + .set LEN8, %cl +.if \is_xctr + .set XCTR_IV_PTR, %r8 // const u8 iv[AES_BLOCK_SIZE]; + .set XCTR_CTR, %r9 // u64 ctr; +.else + .set LE_CTR_PTR, %r8 // const u64 le_ctr[2]; +.endif + + // Additional local variables + .set RNDKEYLAST_PTR, %r10 + .set AESDATA0, V0 + .set AESDATA0_XMM, %xmm0 + .set AESDATA1, V1 + .set AESDATA1_XMM, %xmm1 + .set AESDATA2, V2 + .set AESDATA3, V3 + .set AESDATA4, V4 + .set AESDATA5, V5 + .set AESDATA6, V6 + .set AESDATA7, V7 +.if \is_xctr + .set XCTR_IV, V8 +.else + .set BSWAP_MASK, V8 +.endif + .set LE_CTR, V9 + .set LE_CTR_XMM, %xmm9 + .set LE_CTR_INC1, V10 + .set LE_CTR_INC2, V11 + .set RNDKEY0, V12 + .set RNDKEYLAST, V13 + .set RNDKEY, V14 + + // Create the first vector of counters. +.if \is_xctr + .if VL == 16 + vmovq XCTR_CTR, LE_CTR + .elseif VL == 32 + vmovq XCTR_CTR, LE_CTR_XMM + inc XCTR_CTR + vmovq XCTR_CTR, AESDATA0_XMM + vinserti128 $1, AESDATA0_XMM, LE_CTR, LE_CTR + .else + vpbroadcastq XCTR_CTR, LE_CTR + vpsrldq $8, LE_CTR, LE_CTR + vpaddq .Lctr_pattern(%rip), LE_CTR, LE_CTR + .endif + _vbroadcast128 (XCTR_IV_PTR), XCTR_IV +.else + _vbroadcast128 (LE_CTR_PTR), LE_CTR + .if VL > 16 + vpaddq .Lctr_pattern(%rip), LE_CTR, LE_CTR + .endif + _vbroadcast128 .Lbswap_mask(%rip), BSWAP_MASK +.endif + +.if VL == 16 + _vbroadcast128 .Lone(%rip), LE_CTR_INC1 +.elseif VL == 32 + _vbroadcast128 .Ltwo(%rip), LE_CTR_INC1 +.else + _vbroadcast128 .Lfour(%rip), LE_CTR_INC1 +.endif + vpsllq $1, LE_CTR_INC1, LE_CTR_INC2 + + // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256). + movl 480(KEY), %eax + + // Compute the pointer to the last round key. + lea 6*16(KEY, %rax, 4), RNDKEYLAST_PTR + + // Load the zero-th and last round keys. + _vbroadcast128 (KEY), RNDKEY0 + _vbroadcast128 (RNDKEYLAST_PTR), RNDKEYLAST + + // Make KEY point to the first round key. + add $16, KEY + + // This is the main loop, which encrypts 8 vectors of data at a time. + add $-8*VL, LEN + jl .Lloop_8x_done\@ +.Lloop_8x\@: + _prepare_2_ctr_vecs \is_xctr, 0, 1 + _prepare_2_ctr_vecs \is_xctr, 2, 3 + _prepare_2_ctr_vecs \is_xctr, 4, 5 + _prepare_2_ctr_vecs \is_xctr, 6, 7 + _aesenc_loop 0,1,2,3,4,5,6,7 + _aesenclast_and_xor 0,1,2,3,4,5,6,7 + sub $-8*VL, SRC + sub $-8*VL, DST + add $-8*VL, LEN + jge .Lloop_8x\@ +.Lloop_8x_done\@: + sub $-8*VL, LEN + jz .Ldone\@ + + // 1 <= LEN < 8*VL. Generate 2, 4, or 8 more vectors of keystream + // blocks, depending on the remaining LEN. + + _prepare_2_ctr_vecs \is_xctr, 0, 1 + _prepare_2_ctr_vecs \is_xctr, 2, 3 + cmp $4*VL, LEN + jle .Lenc_tail_atmost4vecs\@ + + // 4*VL < LEN < 8*VL. Generate 8 vectors of keystream blocks. Use the + // first 4 to XOR 4 full vectors of data. Then XOR the remaining data. 
+ _prepare_2_ctr_vecs \is_xctr, 4, 5 + _prepare_2_ctr_vecs \is_xctr, 6, 7, final=1 + _aesenc_loop 0,1,2,3,4,5,6,7 + _aesenclast_and_xor 0,1,2,3 + vaesenclast RNDKEYLAST, AESDATA4, AESDATA0 + vaesenclast RNDKEYLAST, AESDATA5, AESDATA1 + vaesenclast RNDKEYLAST, AESDATA6, AESDATA2 + vaesenclast RNDKEYLAST, AESDATA7, AESDATA3 + sub $-4*VL, SRC + sub $-4*VL, DST + add $-4*VL, LEN + cmp $1*VL-1, LEN + jle .Lxor_tail_partial_vec_0\@ + _xor_data 0 + cmp $2*VL-1, LEN + jle .Lxor_tail_partial_vec_1\@ + _xor_data 1 + cmp $3*VL-1, LEN + jle .Lxor_tail_partial_vec_2\@ + _xor_data 2 + cmp $4*VL-1, LEN + jle .Lxor_tail_partial_vec_3\@ + _xor_data 3 + jmp .Ldone\@ + +.Lenc_tail_atmost4vecs\@: + cmp $2*VL, LEN + jle .Lenc_tail_atmost2vecs\@ + + // 2*VL < LEN <= 4*VL. Generate 4 vectors of keystream blocks. Use the + // first 2 to XOR 2 full vectors of data. Then XOR the remaining data. + _aesenc_loop 0,1,2,3 + _aesenclast_and_xor 0,1 + vaesenclast RNDKEYLAST, AESDATA2, AESDATA0 + vaesenclast RNDKEYLAST, AESDATA3, AESDATA1 + sub $-2*VL, SRC + sub $-2*VL, DST + add $-2*VL, LEN + jmp .Lxor_tail_upto2vecs\@ + +.Lenc_tail_atmost2vecs\@: + // 1 <= LEN <= 2*VL. Generate 2 vectors of keystream blocks. Then XOR + // the remaining data. + _aesenc_loop 0,1 + vaesenclast RNDKEYLAST, AESDATA0, AESDATA0 + vaesenclast RNDKEYLAST, AESDATA1, AESDATA1 + +.Lxor_tail_upto2vecs\@: + cmp $1*VL-1, LEN + jle .Lxor_tail_partial_vec_0\@ + _xor_data 0 + cmp $2*VL-1, LEN + jle .Lxor_tail_partial_vec_1\@ + _xor_data 1 + jmp .Ldone\@ + +.Lxor_tail_partial_vec_1\@: + add $-1*VL, LEN + jz .Ldone\@ + sub $-1*VL, SRC + sub $-1*VL, DST + _vmovdqa AESDATA1, AESDATA0 + jmp .Lxor_tail_partial_vec_0\@ + +.Lxor_tail_partial_vec_2\@: + add $-2*VL, LEN + jz .Ldone\@ + sub $-2*VL, SRC + sub $-2*VL, DST + _vmovdqa AESDATA2, AESDATA0 + jmp .Lxor_tail_partial_vec_0\@ + +.Lxor_tail_partial_vec_3\@: + add $-3*VL, LEN + jz .Ldone\@ + sub $-3*VL, SRC + sub $-3*VL, DST + _vmovdqa AESDATA3, AESDATA0 + +.Lxor_tail_partial_vec_0\@: + // XOR the remaining 1 <= LEN < VL bytes. It's easy if masked + // loads/stores are available; otherwise it's a bit harder... +.if USE_AVX512 + mov $-1, %rax + bzhi LEN64, %rax, %rax + kmovq %rax, %k1 + vmovdqu8 (SRC), AESDATA1{%k1}{z} + vpxord AESDATA1, AESDATA0, AESDATA0 + vmovdqu8 AESDATA0, (DST){%k1} +.else + .if VL == 32 + cmp $16, LEN + jl 1f + vpxor (SRC), AESDATA0_XMM, AESDATA1_XMM + vmovdqu AESDATA1_XMM, (DST) + add $16, SRC + add $16, DST + sub $16, LEN + jz .Ldone\@ + vextracti128 $1, AESDATA0, AESDATA0_XMM +1: + .endif + mov LEN, %r10d + _load_partial_block SRC, AESDATA1_XMM, KEY, KEY32 + vpxor AESDATA1_XMM, AESDATA0_XMM, AESDATA0_XMM + mov %r10d, %ecx + _store_partial_block AESDATA0_XMM, DST, KEY, KEY32 +.endif + +.Ldone\@: +.if VL > 16 + vzeroupper +.endif + RET +.endm + +// Below are the definitions of the functions generated by the above macro. +// They have the following prototypes: +// +// +// void aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key, +// const u8 *src, u8 *dst, int len, +// const u64 le_ctr[2]); +// +// void aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key, +// const u8 *src, u8 *dst, int len, +// const u8 iv[AES_BLOCK_SIZE], u64 ctr); +// +// Both functions generate |len| bytes of keystream, XOR it with the data from +// |src|, and write the result to |dst|. On non-final calls, |len| must be a +// multiple of 16. On the final call, |len| can be any value. 
+// +// aes_ctr64_crypt_* implement "regular" CTR, where the keystream is generated +// from a 128-bit big endian counter that increments by 1 for each AES block. +// HOWEVER, to keep the assembly code simple, some of the counter management is +// left to the caller. aes_ctr64_crypt_* take the counter in little endian +// form, only increment the low 64 bits internally, do the conversion to big +// endian internally, and don't write the updated counter back to memory. The +// caller is responsible for converting the starting IV to the little endian +// le_ctr, detecting the (very rare) case of a carry out of the low 64 bits +// being needed and splitting at that point with a carry done in between, and +// updating le_ctr after each part if the message is multi-part. +// +// aes_xctr_crypt_* implement XCTR as specified in "Length-preserving encryption +// with HCTR2" (https://eprint.iacr.org/2021/1441.pdf). XCTR is an +// easier-to-implement variant of CTR that uses little endian byte order and +// eliminates carries. |ctr| is the per-message block counter starting at 1. + +.set VL, 16 +.set USE_AVX512, 0 +SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx) + _aes_ctr_crypt 0 +SYM_FUNC_END(aes_ctr64_crypt_aesni_avx) +SYM_TYPED_FUNC_START(aes_xctr_crypt_aesni_avx) + _aes_ctr_crypt 1 +SYM_FUNC_END(aes_xctr_crypt_aesni_avx) + +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +.set VL, 32 +.set USE_AVX512, 0 +SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2) + _aes_ctr_crypt 0 +SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2) +SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2) + _aes_ctr_crypt 1 +SYM_FUNC_END(aes_xctr_crypt_vaes_avx2) + +.set VL, 64 +.set USE_AVX512, 1 +SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx512) + _aes_ctr_crypt 0 +SYM_FUNC_END(aes_ctr64_crypt_vaes_avx512) +SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx512) + _aes_ctr_crypt 1 +SYM_FUNC_END(aes_xctr_crypt_vaes_avx512) +#endif // CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ diff --git a/arch/x86/crypto/aes-gcm-aesni-x86_64.S b/arch/x86/crypto/aes-gcm-aesni-x86_64.S new file mode 100644 index 000000000000..45940e2883a0 --- /dev/null +++ b/arch/x86/crypto/aes-gcm-aesni-x86_64.S @@ -0,0 +1,1128 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// AES-NI optimized AES-GCM for x86_64 +// +// Copyright 2024 Google LLC +// +// Author: Eric Biggers <ebiggers@google.com> +// +//------------------------------------------------------------------------------ +// +// This file is dual-licensed, meaning that you can use it under your choice of +// either of the following two licenses: +// +// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// or +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +//------------------------------------------------------------------------------ +// +// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that +// support the original set of AES instructions, i.e. AES-NI. Two +// implementations are provided, one that uses AVX and one that doesn't. They +// are very similar, being generated by the same macros. The only difference is +// that the AVX implementation takes advantage of VEX-coded instructions in some +// places to avoid some 'movdqu' and 'movdqa' instructions. The AVX +// implementation does *not* use 256-bit vectors, as AES is not supported on +// 256-bit vectors until the VAES feature (which this file doesn't target). +// +// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1 +// for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems +// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.) +// +// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is +// more thoroughly commented. This file has the following notable changes: +// +// - The vector length is fixed at 128-bit, i.e. xmm registers. This means +// there is only one AES block (and GHASH block) per register. +// +// - Without AVX512 / AVX10, only 16 SIMD registers are available instead of +// 32. We work around this by being much more careful about using +// registers, relying heavily on loads to load values as they are needed. +// +// - Masking is not available either. We work around this by implementing +// partial block loads and stores using overlapping scalar loads and stores +// combined with shifts and SSE4.1 insertion and extraction instructions. +// +// - The main loop is organized differently due to the different design +// constraints. First, with just one AES block per SIMD register, on some +// CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore +// do an 8-register wide loop. Considering that and the fact that we have +// just 16 SIMD registers to work with, it's not feasible to cache AES +// round keys and GHASH key powers in registers across loop iterations. +// That's not ideal, but also not actually that bad, since loads can run in +// parallel with other instructions. Significantly, this also makes it +// possible to roll up the inner loops, relying on hardware loop unrolling +// instead of software loop unrolling, greatly reducing code size. 
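+//
+//   In C-like pseudocode (a sketch with hypothetical names, not part of
+//   this file), each rolled-up encryption loop has the shape:
+//
+//	for (rndkey = &round_keys[1]; rndkey != rndkeylast; rndkey++)
+//		for (i = 0; i < 8; i++)		// unrolled via .irp
+//			data[i] = aesenc(data[i], *rndkey);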
+// +// - We implement the GHASH multiplications in the main loop using Karatsuba +// multiplication instead of schoolbook multiplication. This saves one +// pclmulqdq instruction per block, at the cost of one 64-bit load, one +// pshufd, and 0.25 pxors per block. (This is without the three-argument +// XOR support that would be provided by AVX512 / AVX10, which would be +// more beneficial to schoolbook than Karatsuba.) +// +// As a rough approximation, we can assume that Karatsuba multiplication is +// faster than schoolbook multiplication in this context if one pshufd and +// 0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit +// load is "free" due to running in parallel with arithmetic instructions.) +// This is true on AMD CPUs, including all that support pclmulqdq up to at +// least Zen 3. It's also true on older Intel CPUs: Westmere through +// Haswell on the Core side, and Silvermont through Goldmont Plus on the +// low-power side. On some of these CPUs, pclmulqdq is quite slow, and the +// benefit of Karatsuba should be substantial. On newer Intel CPUs, +// schoolbook multiplication should be faster, but only marginally. +// +// Not all these CPUs were available to be tested. However, benchmarks on +// available CPUs suggest that this approximation is plausible. Switching +// to Karatsuba showed negligible change (< 1%) on Intel Broadwell, +// Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%. +// Considering that and the fact that Karatsuba should be even more +// beneficial on older Intel CPUs, it seems like the right choice here. +// +// An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be +// saved by using a multiplication-less reduction method. We don't do that +// because it would require a large number of shift and xor instructions, +// making it less worthwhile and likely harmful on newer CPUs. +// +// It does make sense to sometimes use a different reduction optimization +// that saves a pclmulqdq, though: precompute the hash key times x^64, and +// multiply the low half of the data block by the hash key with the extra +// factor of x^64. This eliminates one step of the reduction. However, +// this is incompatible with Karatsuba multiplication. Therefore, for +// multi-block processing we use Karatsuba multiplication with a regular +// reduction. For single-block processing, we use the x^64 optimization. + +#include <linux/linkage.h> + +.section .rodata +.p2align 4 +.Lbswap_mask: + .octa 0x000102030405060708090a0b0c0d0e0f +.Lgfpoly: + .quad 0xc200000000000000 +.Lone: + .quad 1 +.Lgfpoly_and_internal_carrybit: + .octa 0xc2000000000000010000000000000001 + // Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of + // 'len' 0xff bytes and the rest zeroes. +.Lzeropad_mask: + .octa 0xffffffffffffffffffffffffffffffff + .octa 0 + +// Offsets in struct aes_gcm_key_aesni +#define OFFSETOF_AESKEYLEN 480 +#define OFFSETOF_H_POWERS 496 +#define OFFSETOF_H_POWERS_XORED 624 +#define OFFSETOF_H_TIMES_X64 688 + +.text + +// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback +// assumes that all operands are distinct and that any mem operand is aligned. +.macro _vpclmulqdq imm, src1, src2, dst +.if USE_AVX + vpclmulqdq \imm, \src1, \src2, \dst +.else + movdqa \src2, \dst + pclmulqdq \imm, \src1, \dst +.endif +.endm + +// Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes +// that all operands are distinct and that any mem operand is aligned. 
+.macro _vpshufb src1, src2, dst +.if USE_AVX + vpshufb \src1, \src2, \dst +.else + movdqa \src2, \dst + pshufb \src1, \dst +.endif +.endm + +// Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that +// all operands are distinct. +.macro _vpand src1, src2, dst +.if USE_AVX + vpand \src1, \src2, \dst +.else + movdqu \src1, \dst + pand \src2, \dst +.endif +.endm + +// XOR the unaligned memory operand \mem into the xmm register \reg. \tmp must +// be a temporary xmm register. +.macro _xor_mem_to_reg mem, reg, tmp +.if USE_AVX + vpxor \mem, \reg, \reg +.else + movdqu \mem, \tmp + pxor \tmp, \reg +.endif +.endm + +// Test the unaligned memory operand \mem against the xmm register \reg. \tmp +// must be a temporary xmm register. +.macro _test_mem mem, reg, tmp +.if USE_AVX + vptest \mem, \reg +.else + movdqu \mem, \tmp + ptest \tmp, \reg +.endif +.endm + +// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst +// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}. +.macro _load_partial_block src, dst, tmp64, tmp32 + sub $8, %ecx // LEN - 8 + jle .Lle8\@ + + // Load 9 <= LEN <= 15 bytes. + movq (\src), \dst // Load first 8 bytes + mov (\src, %rcx), %rax // Load last 8 bytes + neg %ecx + shl $3, %ecx + shr %cl, %rax // Discard overlapping bytes + pinsrq $1, %rax, \dst + jmp .Ldone\@ + +.Lle8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Load 4 <= LEN <= 8 bytes. + mov (\src), %eax // Load first 4 bytes + mov (\src, %rcx), \tmp32 // Load last 4 bytes + jmp .Lcombine\@ + +.Llt4\@: + // Load 1 <= LEN <= 3 bytes. + add $2, %ecx // LEN - 2 + movzbl (\src), %eax // Load first byte + jl .Lmovq\@ + movzwl (\src, %rcx), \tmp32 // Load last 2 bytes +.Lcombine\@: + shl $3, %ecx + shl %cl, \tmp64 + or \tmp64, %rax // Combine the two parts +.Lmovq\@: + movq %rax, \dst +.Ldone\@: +.endm + +// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst. +// Clobbers %rax, %rcx, and %rsi. +.macro _store_partial_block src, dst + sub $8, %ecx // LEN - 8 + jl .Llt8\@ + + // Store 8 <= LEN <= 15 bytes. + pextrq $1, \src, %rax + mov %ecx, %esi + shl $3, %ecx + ror %cl, %rax + mov %rax, (\dst, %rsi) // Store last LEN - 8 bytes + movq \src, (\dst) // Store first 8 bytes + jmp .Ldone\@ + +.Llt8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Store 4 <= LEN <= 7 bytes. + pextrd $1, \src, %eax + mov %ecx, %esi + shl $3, %ecx + ror %cl, %eax + mov %eax, (\dst, %rsi) // Store last LEN - 4 bytes + movd \src, (\dst) // Store first 4 bytes + jmp .Ldone\@ + +.Llt4\@: + // Store 1 <= LEN <= 3 bytes. + pextrb $0, \src, 0(\dst) + cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2? + jl .Ldone\@ + pextrb $1, \src, 1(\dst) + je .Ldone\@ + pextrb $2, \src, 2(\dst) +.Ldone\@: +.endm + +// Do one step of GHASH-multiplying \a by \b and storing the reduced product in +// \b. To complete all steps, this must be invoked with \i=0 through \i=9. +// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the +// .Lgfpoly constant, and \t0-\t1 must be temporary registers. +.macro _ghash_mul_step i, a, a_times_x64, b, gfpoly, t0, t1 + + // MI = (a_L * b_H) + ((a*x^64)_L * b_L) +.if \i == 0 + _vpclmulqdq $0x01, \a, \b, \t0 +.elseif \i == 1 + _vpclmulqdq $0x00, \a_times_x64, \b, \t1 +.elseif \i == 2 + pxor \t1, \t0 + + // HI = (a_H * b_H) + ((a*x^64)_H * b_L) +.elseif \i == 3 + _vpclmulqdq $0x11, \a, \b, \t1 +.elseif \i == 4 + pclmulqdq $0x10, \a_times_x64, \b +.elseif \i == 5 + pxor \t1, \b +.elseif \i == 6 + + // Fold MI into HI. 
+ pshufd $0x4e, \t0, \t1 // Swap halves of MI +.elseif \i == 7 + pclmulqdq $0x00, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) +.elseif \i == 8 + pxor \t1, \b +.elseif \i == 9 + pxor \t0, \b +.endif +.endm + +// GHASH-multiply \a by \b and store the reduced product in \b. +// See _ghash_mul_step for details. +.macro _ghash_mul a, a_times_x64, b, gfpoly, t0, t1 +.irp i, 0,1,2,3,4,5,6,7,8,9 + _ghash_mul_step \i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1 +.endr +.endm + +// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi. +// This does Karatsuba multiplication and must be paired with _ghash_reduce. On +// the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the +// two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered. +.macro _ghash_mul_noreduce a, a_xored, b, lo, mi, hi, t0 + + // LO += a_L * b_L + _vpclmulqdq $0x00, \a, \b, \t0 + pxor \t0, \lo + + // b_L + b_H + pshufd $0x4e, \b, \t0 + pxor \b, \t0 + + // HI += a_H * b_H + pclmulqdq $0x11, \a, \b + pxor \b, \hi + + // MI += (a_L + a_H) * (b_L + b_H) + pclmulqdq $0x00, \a_xored, \t0 + pxor \t0, \mi +.endm + +// Reduce the product from \lo, \mi, and \hi, and store the result in \dst. +// This assumes that _ghash_mul_noreduce was used. +.macro _ghash_reduce lo, mi, hi, dst, t0 + + movq .Lgfpoly(%rip), \t0 + + // MI += LO + HI (needed because we used Karatsuba multiplication) + pxor \lo, \mi + pxor \hi, \mi + + // Fold LO into MI. + pshufd $0x4e, \lo, \dst + pclmulqdq $0x00, \t0, \lo + pxor \dst, \mi + pxor \lo, \mi + + // Fold MI into HI. + pshufd $0x4e, \mi, \dst + pclmulqdq $0x00, \t0, \mi + pxor \hi, \dst + pxor \mi, \dst +.endm + +// Do the first step of the GHASH update of a set of 8 ciphertext blocks. +// +// The whole GHASH update does: +// +// GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 + +// blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1 +// +// This macro just does the first step: it does the unreduced multiplication +// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm +// registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the +// inner block counter in %rax, which is a value that counts up by 8 for each +// block in the set of 8 and is used later to index by 8*blknum and 16*blknum. +// +// To reduce the number of pclmulqdq instructions required, both this macro and +// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook +// multiplication. See the file comment for more details about this choice. +// +// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if +// encrypting, or SRC if decrypting. They also expect the precomputed hash key +// powers H^i and their XOR'd-together halves to be available in the struct +// pointed to by KEY. Both macros clobber TMP[0-2]. +.macro _ghash_update_begin_8x enc + + // Initialize the inner block counter. + xor %eax, %eax + + // Load the highest hash key power, H^8. + movdqa OFFSETOF_H_POWERS(KEY), TMP0 + + // Load the first ciphertext block and byte-reflect it. +.if \enc + movdqu (DST), TMP1 +.else + movdqu (SRC), TMP1 +.endif + pshufb BSWAP_MASK, TMP1 + + // Add the GHASH accumulator to the ciphertext block to get the block + // 'b' that needs to be multiplied with the hash key power 'a'. 
+ pxor TMP1, GHASH_ACC + + // b_L + b_H + pshufd $0x4e, GHASH_ACC, MI + pxor GHASH_ACC, MI + + // LO = a_L * b_L + _vpclmulqdq $0x00, TMP0, GHASH_ACC, LO + + // HI = a_H * b_H + pclmulqdq $0x11, TMP0, GHASH_ACC + + // MI = (a_L + a_H) * (b_L + b_H) + pclmulqdq $0x00, OFFSETOF_H_POWERS_XORED(KEY), MI +.endm + +// Continue the GHASH update of 8 ciphertext blocks as described above by doing +// an unreduced multiplication of the next ciphertext block by the next lowest +// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI. +.macro _ghash_update_continue_8x enc + add $8, %eax + + // Load the next lowest key power. + movdqa OFFSETOF_H_POWERS(KEY,%rax,2), TMP0 + + // Load the next ciphertext block and byte-reflect it. +.if \enc + movdqu (DST,%rax,2), TMP1 +.else + movdqu (SRC,%rax,2), TMP1 +.endif + pshufb BSWAP_MASK, TMP1 + + // LO += a_L * b_L + _vpclmulqdq $0x00, TMP0, TMP1, TMP2 + pxor TMP2, LO + + // b_L + b_H + pshufd $0x4e, TMP1, TMP2 + pxor TMP1, TMP2 + + // HI += a_H * b_H + pclmulqdq $0x11, TMP0, TMP1 + pxor TMP1, GHASH_ACC + + // MI += (a_L + a_H) * (b_L + b_H) + movq OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1 + pclmulqdq $0x00, TMP1, TMP2 + pxor TMP2, MI +.endm + +// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to +// _ghash_reduce, but it's hardcoded to use the registers of the main loop and +// it uses the same register for HI and the destination. It's also divided into +// two steps. TMP1 must be preserved across steps. +// +// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of +// shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would +// increase the critical path length, and it seems to slightly hurt performance. +.macro _ghash_update_end_8x_step i +.if \i == 0 + movq .Lgfpoly(%rip), TMP1 + pxor LO, MI + pxor GHASH_ACC, MI + pshufd $0x4e, LO, TMP2 + pclmulqdq $0x00, TMP1, LO + pxor TMP2, MI + pxor LO, MI +.elseif \i == 1 + pshufd $0x4e, MI, TMP2 + pclmulqdq $0x00, TMP1, MI + pxor TMP2, GHASH_ACC + pxor MI, GHASH_ACC +.endif +.endm + +// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key); +// +// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH +// related fields in the key struct. +.macro _aes_gcm_precompute + + // Function arguments + .set KEY, %rdi + + // Additional local variables. + // %xmm0-%xmm1 and %rax are used as temporaries. + .set RNDKEYLAST_PTR, %rsi + .set H_CUR, %xmm2 + .set H_POW1, %xmm3 // H^1 + .set H_POW1_X64, %xmm4 // H^1 * x^64 + .set GFPOLY, %xmm5 + + // Encrypt an all-zeroes block to get the raw hash subkey. + movl OFFSETOF_AESKEYLEN(KEY), %eax + lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR + movdqa (KEY), H_POW1 // Zero-th round key XOR all-zeroes block + lea 16(KEY), %rax +1: + aesenc (%rax), H_POW1 + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b + aesenclast (RNDKEYLAST_PTR), H_POW1 + + // Preprocess the raw hash subkey as needed to operate on GHASH's + // bit-reflected values directly: reflect its bytes, then multiply it by + // x^-1 (using the backwards interpretation of polynomial coefficients + // from the GCM spec) or equivalently x^1 (using the alternative, + // natural interpretation of polynomial coefficients). + pshufb .Lbswap_mask(%rip), H_POW1 + movdqa H_POW1, %xmm0 + pshufd $0xd3, %xmm0, %xmm0 + psrad $31, %xmm0 + paddq H_POW1, H_POW1 + pand .Lgfpoly_and_internal_carrybit(%rip), %xmm0 + pxor %xmm0, H_POW1 + + // Store H^1. + movdqa H_POW1, OFFSETOF_H_POWERS+7*16(KEY) + + // Compute and store H^1 * x^64. 
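+	// Per the reduction identity used by _ghash_mul_step, the value
+	// computed below is swap_halves(H^1) + (H^1)_L * (x^63 + x^62 + x^57).
+	// Multiplying by this value instead of by H^1 carries an implicit extra
+	// factor of x^64, which eliminates one step of the reduction as
+	// described in the file comment.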
+ movq .Lgfpoly(%rip), GFPOLY + pshufd $0x4e, H_POW1, %xmm0 + _vpclmulqdq $0x00, H_POW1, GFPOLY, H_POW1_X64 + pxor %xmm0, H_POW1_X64 + movdqa H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY) + + // Compute and store the halves of H^1 XOR'd together. + pxor H_POW1, %xmm0 + movq %xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY) + + // Compute and store the remaining key powers H^2 through H^8. + movdqa H_POW1, H_CUR + mov $6*8, %eax +.Lprecompute_next\@: + // Compute H^i = H^{i-1} * H^1. + _ghash_mul H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1 + // Store H^i. + movdqa H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2) + // Compute and store the halves of H^i XOR'd together. + pshufd $0x4e, H_CUR, %xmm0 + pxor H_CUR, %xmm0 + movq %xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax) + sub $8, %eax + jge .Lprecompute_next\@ + + RET +.endm + +// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key, +// u8 ghash_acc[16], const u8 *aad, int aadlen); +// +// This function processes the AAD (Additional Authenticated Data) in GCM. +// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the +// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all +// zeroes. |aadlen| must be a multiple of 16, except on the last call where it +// can be any length. The caller must do any buffering needed to ensure this. +.macro _aes_gcm_aad_update + + // Function arguments + .set KEY, %rdi + .set GHASH_ACC_PTR, %rsi + .set AAD, %rdx + .set AADLEN, %ecx + // Note: _load_partial_block relies on AADLEN being in %ecx. + + // Additional local variables. + // %rax, %r10, and %xmm0-%xmm1 are used as temporary registers. + .set BSWAP_MASK, %xmm2 + .set GHASH_ACC, %xmm3 + .set H_POW1, %xmm4 // H^1 + .set H_POW1_X64, %xmm5 // H^1 * x^64 + .set GFPOLY, %xmm6 + + movdqa .Lbswap_mask(%rip), BSWAP_MASK + movdqu (GHASH_ACC_PTR), GHASH_ACC + movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1 + movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64 + movq .Lgfpoly(%rip), GFPOLY + + // Process the AAD one full block at a time. + sub $16, AADLEN + jl .Laad_loop_1x_done\@ +.Laad_loop_1x\@: + movdqu (AAD), %xmm0 + pshufb BSWAP_MASK, %xmm0 + pxor %xmm0, GHASH_ACC + _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1 + add $16, AAD + sub $16, AADLEN + jge .Laad_loop_1x\@ +.Laad_loop_1x_done\@: + // Check whether there is a partial block at the end. + add $16, AADLEN + jz .Laad_done\@ + + // Process a partial block of length 1 <= AADLEN <= 15. + // _load_partial_block assumes that %ecx contains AADLEN. + _load_partial_block AAD, %xmm0, %r10, %r10d + pshufb BSWAP_MASK, %xmm0 + pxor %xmm0, GHASH_ACC + _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1 + +.Laad_done\@: + movdqu GHASH_ACC, (GHASH_ACC_PTR) + RET +.endm + +// Increment LE_CTR eight times to generate eight little-endian counter blocks, +// swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with +// the zero-th AES round key. Clobbers TMP0 and TMP1. +.macro _ctr_begin_8x + movq .Lone(%rip), TMP0 + movdqa (KEY), TMP1 // zero-th round key +.irp i, 0,1,2,3,4,5,6,7 + _vpshufb BSWAP_MASK, LE_CTR, AESDATA\i + pxor TMP1, AESDATA\i + paddd TMP0, LE_CTR +.endr +.endm + +// Do a non-last round of AES on AESDATA[0-7] using \round_key. +.macro _aesenc_8x round_key +.irp i, 0,1,2,3,4,5,6,7 + aesenc \round_key, AESDATA\i +.endr +.endm + +// Do the last round of AES on AESDATA[0-7] using \round_key. 
+.macro _aesenclast_8x round_key +.irp i, 0,1,2,3,4,5,6,7 + aesenclast \round_key, AESDATA\i +.endr +.endm + +// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and +// store the result to DST. Clobbers TMP0. +.macro _xor_data_8x +.irp i, 0,1,2,3,4,5,6,7 + _xor_mem_to_reg \i*16(SRC), AESDATA\i, tmp=TMP0 +.endr +.irp i, 0,1,2,3,4,5,6,7 + movdqu AESDATA\i, \i*16(DST) +.endr +.endm + +// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// const u8 *src, u8 *dst, int datalen); +// +// This macro generates a GCM encryption or decryption update function with the +// above prototype (with \enc selecting which one). +// +// This function computes the next portion of the CTR keystream, XOR's it with +// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted +// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the +// next |datalen| ciphertext bytes. +// +// |datalen| must be a multiple of 16, except on the last call where it can be +// any length. The caller must do any buffering needed to ensure this. Both +// in-place and out-of-place en/decryption are supported. +// +// |le_ctr| must give the current counter in little-endian format. For a new +// message, the low word of the counter must be 2. This function loads the +// counter from |le_ctr| and increments the loaded counter as needed, but it +// does *not* store the updated counter back to |le_ctr|. The caller must +// update |le_ctr| if any more data segments follow. Internally, only the low +// 32-bit word of the counter is incremented, following the GCM standard. +.macro _aes_gcm_update enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi // Note: overlaps with usage as temp reg + .set GHASH_ACC_PTR, %rdx + .set SRC, %rcx + .set DST, %r8 + .set DATALEN, %r9d + .set DATALEN64, %r9 // Zero-extend DATALEN before using! + // Note: the code setting up for _load_partial_block assumes that SRC is + // in %rcx (and that DATALEN is *not* in %rcx). + + // Additional local variables + + // %rax and %rsi are used as temporary registers. Note: %rsi overlaps + // with LE_CTR_PTR, which is used only at the beginning. + + .set AESKEYLEN, %r10d // AES key length in bytes + .set AESKEYLEN64, %r10 + .set RNDKEYLAST_PTR, %r11 // Pointer to last AES round key + + // Put the most frequently used values in %xmm0-%xmm7 to reduce code + // size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.) + .set TMP0, %xmm0 + .set TMP1, %xmm1 + .set TMP2, %xmm2 + .set LO, %xmm3 // Low part of unreduced product + .set MI, %xmm4 // Middle part of unreduced product + .set GHASH_ACC, %xmm5 // GHASH accumulator; in main loop also + // the high part of unreduced product + .set BSWAP_MASK, %xmm6 // Shuffle mask for reflecting bytes + .set LE_CTR, %xmm7 // Little-endian counter value + .set AESDATA0, %xmm8 + .set AESDATA1, %xmm9 + .set AESDATA2, %xmm10 + .set AESDATA3, %xmm11 + .set AESDATA4, %xmm12 + .set AESDATA5, %xmm13 + .set AESDATA6, %xmm14 + .set AESDATA7, %xmm15 + + movdqa .Lbswap_mask(%rip), BSWAP_MASK + movdqu (GHASH_ACC_PTR), GHASH_ACC + movdqu (LE_CTR_PTR), LE_CTR + + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR + + // If there are at least 8*16 bytes of data, then continue into the main + // loop, which processes 8*16 bytes of data per iteration. 
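+	//
+	// In rough pseudocode, the encryption-side structure (a sketch; the
+	// reasoning is explained in the following paragraph) is:
+	//
+	//	encrypt blocks 0..7;
+	//	while (at least 8 more full blocks remain)
+	//		encrypt the next 8 blocks while GHASHing the previous 8;
+	//	GHASH the final set of 8 ciphertext blocks;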
+ // + // The main loop interleaves AES and GHASH to improve performance on + // CPUs that can execute these instructions in parallel. When + // decrypting, the GHASH input (the ciphertext) is immediately + // available. When encrypting, we instead encrypt a set of 8 blocks + // first and then GHASH those blocks while encrypting the next set of 8, + // repeat that as needed, and finally GHASH the last set of 8 blocks. + // + // Code size optimization: Prefer adding or subtracting -8*16 over 8*16, + // as this makes the immediate fit in a signed byte, saving 3 bytes. + add $-8*16, DATALEN + jl .Lcrypt_loop_8x_done\@ +.if \enc + // Encrypt the first 8 plaintext blocks. + _ctr_begin_8x + lea 16(KEY), %rsi + .p2align 4 +1: + movdqa (%rsi), TMP0 + _aesenc_8x TMP0 + add $16, %rsi + cmp %rsi, RNDKEYLAST_PTR + jne 1b + movdqa (%rsi), TMP0 + _aesenclast_8x TMP0 + _xor_data_8x + // Don't increment DST until the ciphertext blocks have been hashed. + sub $-8*16, SRC + add $-8*16, DATALEN + jl .Lghash_last_ciphertext_8x\@ +.endif + + .p2align 4 +.Lcrypt_loop_8x\@: + + // Generate the next set of 8 counter blocks and start encrypting them. + _ctr_begin_8x + lea 16(KEY), %rsi + + // Do a round of AES, and start the GHASH update of 8 ciphertext blocks + // by doing the unreduced multiplication for the first ciphertext block. + movdqa (%rsi), TMP0 + add $16, %rsi + _aesenc_8x TMP0 + _ghash_update_begin_8x \enc + + // Do 7 more rounds of AES, and continue the GHASH update by doing the + // unreduced multiplication for the remaining ciphertext blocks. + .p2align 4 +1: + movdqa (%rsi), TMP0 + add $16, %rsi + _aesenc_8x TMP0 + _ghash_update_continue_8x \enc + cmp $7*8, %eax + jne 1b + + // Do the remaining AES rounds. + .p2align 4 +1: + movdqa (%rsi), TMP0 + add $16, %rsi + _aesenc_8x TMP0 + cmp %rsi, RNDKEYLAST_PTR + jne 1b + + // Do the GHASH reduction and the last round of AES. + movdqa (RNDKEYLAST_PTR), TMP0 + _ghash_update_end_8x_step 0 + _aesenclast_8x TMP0 + _ghash_update_end_8x_step 1 + + // XOR the data with the AES-CTR keystream blocks. +.if \enc + sub $-8*16, DST +.endif + _xor_data_8x + sub $-8*16, SRC +.if !\enc + sub $-8*16, DST +.endif + add $-8*16, DATALEN + jge .Lcrypt_loop_8x\@ + +.if \enc +.Lghash_last_ciphertext_8x\@: + // Update GHASH with the last set of 8 ciphertext blocks. + _ghash_update_begin_8x \enc + .p2align 4 +1: + _ghash_update_continue_8x \enc + cmp $7*8, %eax + jne 1b + _ghash_update_end_8x_step 0 + _ghash_update_end_8x_step 1 + sub $-8*16, DST +.endif + +.Lcrypt_loop_8x_done\@: + + sub $-8*16, DATALEN + jz .Ldone\@ + + // Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep + // things simple and keep the code size down by just going one block at + // a time, again taking advantage of hardware loop unrolling. Since + // there are enough key powers available for all remaining data, we do + // the GHASH multiplications unreduced, and only reduce at the very end. + + .set HI, TMP2 + .set H_POW, AESDATA0 + .set H_POW_XORED, AESDATA1 + .set ONE, AESDATA2 + + movq .Lone(%rip), ONE + + // Start collecting the unreduced GHASH intermediate value LO, MI, HI. + pxor LO, LO + pxor MI, MI + pxor HI, HI + + // Set up a block counter %rax to contain 8*(8-n), where n is the number + // of blocks that remain, counting any partial block. This will be used + // to access the key powers H^n through H^1. + mov DATALEN, %eax + neg %eax + and $~15, %eax + sar $1, %eax + add $64, %eax + + sub $16, DATALEN + jl .Lcrypt_loop_1x_done\@ + + // Process the data one full block at a time. 
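+	// In effect, each iteration of the loop below does (C-like sketch,
+	// where the hashed block is the ciphertext: dst if encrypting, src if
+	// decrypting):
+	//
+	//	keystream = AES_encrypt(bswap(le_ctr)); le_ctr++;
+	//	*dst = *src ^ keystream;		// one 16-byte block
+	//	ghash_acc ^= ciphertext_block;
+	//	accumulate ghash_acc * H^i into LO/MI/HI, unreduced;
+	//	ghash_acc = 0;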
+.Lcrypt_loop_1x\@: + + // Encrypt the next counter block. + _vpshufb BSWAP_MASK, LE_CTR, TMP0 + paddd ONE, LE_CTR + pxor (KEY), TMP0 + lea -6*16(RNDKEYLAST_PTR), %rsi // Reduce code size + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? + // AES-256 + aesenc -7*16(%rsi), TMP0 + aesenc -6*16(%rsi), TMP0 +192: + aesenc -5*16(%rsi), TMP0 + aesenc -4*16(%rsi), TMP0 +128: +.irp i, -3,-2,-1,0,1,2,3,4,5 + aesenc \i*16(%rsi), TMP0 +.endr + aesenclast (RNDKEYLAST_PTR), TMP0 + + // Load the next key power H^i. + movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW + movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED + + // XOR the keystream block that was just generated in TMP0 with the next + // source data block and store the resulting en/decrypted data to DST. +.if \enc + _xor_mem_to_reg (SRC), TMP0, tmp=TMP1 + movdqu TMP0, (DST) +.else + movdqu (SRC), TMP1 + pxor TMP1, TMP0 + movdqu TMP0, (DST) +.endif + + // Update GHASH with the ciphertext block. +.if \enc + pshufb BSWAP_MASK, TMP0 + pxor TMP0, GHASH_ACC +.else + pshufb BSWAP_MASK, TMP1 + pxor TMP1, GHASH_ACC +.endif + _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0 + pxor GHASH_ACC, GHASH_ACC + + add $8, %eax + add $16, SRC + add $16, DST + sub $16, DATALEN + jge .Lcrypt_loop_1x\@ +.Lcrypt_loop_1x_done\@: + // Check whether there is a partial block at the end. + add $16, DATALEN + jz .Lghash_reduce\@ + + // Process a partial block of length 1 <= DATALEN <= 15. + + // Encrypt a counter block for the last time. + pshufb BSWAP_MASK, LE_CTR + pxor (KEY), LE_CTR + lea 16(KEY), %rsi +1: + aesenc (%rsi), LE_CTR + add $16, %rsi + cmp %rsi, RNDKEYLAST_PTR + jne 1b + aesenclast (RNDKEYLAST_PTR), LE_CTR + + // Load the lowest key power, H^1. + movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW + movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED + + // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is + // in %rcx, but _load_partial_block needs DATALEN in %rcx instead. + // RNDKEYLAST_PTR is no longer needed, so reuse it for SRC. + mov SRC, RNDKEYLAST_PTR + mov DATALEN, %ecx + _load_partial_block RNDKEYLAST_PTR, TMP0, %rsi, %esi + + // XOR the keystream block that was just generated in LE_CTR with the + // source data block and store the resulting en/decrypted data to DST. + pxor TMP0, LE_CTR + mov DATALEN, %ecx + _store_partial_block LE_CTR, DST + + // If encrypting, zero-pad the final ciphertext block for GHASH. (If + // decrypting, this was already done by _load_partial_block.) +.if \enc + lea .Lzeropad_mask+16(%rip), %rax + sub DATALEN64, %rax + _vpand (%rax), LE_CTR, TMP0 +.endif + + // Update GHASH with the final ciphertext block. + pshufb BSWAP_MASK, TMP0 + pxor TMP0, GHASH_ACC + _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0 + +.Lghash_reduce\@: + // Finally, do the GHASH reduction. + _ghash_reduce LO, MI, HI, GHASH_ACC, TMP0 + +.Ldone\@: + // Store the updated GHASH accumulator back to memory. + movdqu GHASH_ACC, (GHASH_ACC_PTR) + + RET +.endm + +// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen); +// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key, +// const u32 le_ctr[4], const u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen, +// const u8 tag[16], int taglen); +// +// This macro generates one of the above two functions (with \enc selecting +// which one). 
Both functions finish computing the GCM authentication tag by +// updating GHASH with the lengths block and encrypting the GHASH accumulator. +// |total_aadlen| and |total_datalen| must be the total length of the additional +// authenticated data and the en/decrypted data in bytes, respectively. +// +// The encryption function then stores the full-length (16-byte) computed +// authentication tag to |ghash_acc|. The decryption function instead loads the +// expected authentication tag (the one that was transmitted) from the 16-byte +// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the +// computed tag in constant time, and returns true if and only if they match. +.macro _aes_gcm_final enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi + .set GHASH_ACC_PTR, %rdx + .set TOTAL_AADLEN, %rcx + .set TOTAL_DATALEN, %r8 + .set TAG, %r9 + .set TAGLEN, %r10d // Originally at 8(%rsp) + .set TAGLEN64, %r10 + + // Additional local variables. + // %rax and %xmm0-%xmm2 are used as temporary registers. + .set AESKEYLEN, %r11d + .set AESKEYLEN64, %r11 + .set BSWAP_MASK, %xmm3 + .set GHASH_ACC, %xmm4 + .set H_POW1, %xmm5 // H^1 + .set H_POW1_X64, %xmm6 // H^1 * x^64 + .set GFPOLY, %xmm7 + + movdqa .Lbswap_mask(%rip), BSWAP_MASK + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + + // Set up a counter block with 1 in the low 32-bit word. This is the + // counter that produces the ciphertext needed to encrypt the auth tag. + movdqu (LE_CTR_PTR), %xmm0 + mov $1, %eax + pinsrd $0, %eax, %xmm0 + + // Build the lengths block and XOR it into the GHASH accumulator. + movq TOTAL_DATALEN, GHASH_ACC + pinsrq $1, TOTAL_AADLEN, GHASH_ACC + psllq $3, GHASH_ACC // Bytes to bits + _xor_mem_to_reg (GHASH_ACC_PTR), GHASH_ACC, %xmm1 + + movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1 + movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64 + movq .Lgfpoly(%rip), GFPOLY + + // Make %rax point to the 6th from last AES round key. (Using signed + // byte offsets -7*16 through 6*16 decreases code size.) + lea (KEY,AESKEYLEN64,4), %rax + + // AES-encrypt the counter block and also multiply GHASH_ACC by H^1. + // Interleave the AES and GHASH instructions to improve performance. + pshufb BSWAP_MASK, %xmm0 + pxor (KEY), %xmm0 + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? + // AES-256 + aesenc -7*16(%rax), %xmm0 + aesenc -6*16(%rax), %xmm0 +192: + aesenc -5*16(%rax), %xmm0 + aesenc -4*16(%rax), %xmm0 +128: +.irp i, 0,1,2,3,4,5,6,7,8 + aesenc (\i-3)*16(%rax), %xmm0 + _ghash_mul_step \i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2 +.endr + aesenclast 6*16(%rax), %xmm0 + _ghash_mul_step 9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2 + + // Undo the byte reflection of the GHASH accumulator. + pshufb BSWAP_MASK, GHASH_ACC + + // Encrypt the GHASH accumulator. + pxor %xmm0, GHASH_ACC + +.if \enc + // Return the computed auth tag. + movdqu GHASH_ACC, (GHASH_ACC_PTR) +.else + .set ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN! + + // Verify the auth tag in constant time by XOR'ing the transmitted and + // computed auth tags together and using the ptest instruction to check + // whether the first TAGLEN bytes of the result are zero. 
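+	//
+	// The 16 bytes loaded from '.Lzeropad_mask + 16 - taglen' are 0xff for
+	// the first taglen bytes and zero for the rest, so in C-like terms this
+	// computes:
+	//
+	//	diff = computed_tag ^ transmitted_tag;
+	//	return (diff & mask) == 0;
+	//
+	// with no branch on the value of diff.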
+ _xor_mem_to_reg (TAG), GHASH_ACC, tmp=%xmm0 + movl 8(%rsp), TAGLEN + lea .Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR + sub TAGLEN64, ZEROPAD_MASK_PTR + xor %eax, %eax + _test_mem (ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0 + sete %al +.endif + RET +.endm + +.set USE_AVX, 0 +SYM_FUNC_START(aes_gcm_precompute_aesni) + _aes_gcm_precompute +SYM_FUNC_END(aes_gcm_precompute_aesni) +SYM_FUNC_START(aes_gcm_aad_update_aesni) + _aes_gcm_aad_update +SYM_FUNC_END(aes_gcm_aad_update_aesni) +SYM_FUNC_START(aes_gcm_enc_update_aesni) + _aes_gcm_update 1 +SYM_FUNC_END(aes_gcm_enc_update_aesni) +SYM_FUNC_START(aes_gcm_dec_update_aesni) + _aes_gcm_update 0 +SYM_FUNC_END(aes_gcm_dec_update_aesni) +SYM_FUNC_START(aes_gcm_enc_final_aesni) + _aes_gcm_final 1 +SYM_FUNC_END(aes_gcm_enc_final_aesni) +SYM_FUNC_START(aes_gcm_dec_final_aesni) + _aes_gcm_final 0 +SYM_FUNC_END(aes_gcm_dec_final_aesni) + +.set USE_AVX, 1 +SYM_FUNC_START(aes_gcm_precompute_aesni_avx) + _aes_gcm_precompute +SYM_FUNC_END(aes_gcm_precompute_aesni_avx) +SYM_FUNC_START(aes_gcm_aad_update_aesni_avx) + _aes_gcm_aad_update +SYM_FUNC_END(aes_gcm_aad_update_aesni_avx) +SYM_FUNC_START(aes_gcm_enc_update_aesni_avx) + _aes_gcm_update 1 +SYM_FUNC_END(aes_gcm_enc_update_aesni_avx) +SYM_FUNC_START(aes_gcm_dec_update_aesni_avx) + _aes_gcm_update 0 +SYM_FUNC_END(aes_gcm_dec_update_aesni_avx) +SYM_FUNC_START(aes_gcm_enc_final_aesni_avx) + _aes_gcm_final 1 +SYM_FUNC_END(aes_gcm_enc_final_aesni_avx) +SYM_FUNC_START(aes_gcm_dec_final_aesni_avx) + _aes_gcm_final 0 +SYM_FUNC_END(aes_gcm_dec_final_aesni_avx) diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S new file mode 100644 index 000000000000..02ee11083d4f --- /dev/null +++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S @@ -0,0 +1,1199 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// VAES and VPCLMULQDQ optimized AES-GCM for x86_64 +// +// Copyright 2024 Google LLC +// +// Author: Eric Biggers <ebiggers@google.com> +// +//------------------------------------------------------------------------------ +// +// This file is dual-licensed, meaning that you can use it under your choice of +// either of the following two licenses: +// +// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// or +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +//------------------------------------------------------------------------------ +// +// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that +// support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and +// either AVX512 or AVX10. Some of the functions, notably the encryption and +// decryption update functions which are the most performance-critical, are +// provided in two variants generated from a macro: one using 256-bit vectors +// (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The +// other, "shared" functions (vaes_avx10) use at most 256-bit vectors. +// +// The functions that use 512-bit vectors are intended for CPUs that support +// 512-bit vectors *and* where using them doesn't cause significant +// downclocking. They require the following CPU features: +// +// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512) +// +// The other functions require the following CPU features: +// +// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256) +// +// All functions use the "System V" ABI. The Windows ABI is not supported. +// +// Note that we use "avx10" in the names of the functions as a shorthand to +// really mean "AVX10 or a certain set of AVX512 features". Due to Intel's +// introduction of AVX512 and then its replacement by AVX10, there doesn't seem +// to be a simple way to name things that makes sense on all CPUs. +// +// Note that the macros that support both 256-bit and 512-bit vectors could +// fairly easily be changed to support 128-bit too. However, this would *not* +// be sufficient to allow the code to run on CPUs without AVX512 or AVX10, +// because the code heavily uses several features of these extensions other than +// the vector length: the increase in the number of SIMD registers from 16 to +// 32, masking support, and new instructions such as vpternlogd (which can do a +// three-argument XOR). These features are very useful for AES-GCM. + +#include <linux/linkage.h> + +.section .rodata +.p2align 6 + + // A shuffle mask that reflects the bytes of 16-byte blocks +.Lbswap_mask: + .octa 0x000102030405060708090a0b0c0d0e0f + + // This is the GHASH reducing polynomial without its constant term, i.e. + // x^128 + x^7 + x^2 + x, represented using the backwards mapping + // between bits and polynomial coefficients. + // + // Alternatively, it can be interpreted as the naturally-ordered + // representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the + // "reversed" GHASH reducing polynomial without its x^128 term. +.Lgfpoly: + .octa 0xc2000000000000000000000000000001 + + // Same as above, but with the (1 << 64) bit set. +.Lgfpoly_and_internal_carrybit: + .octa 0xc2000000000000010000000000000001 + + // The below constants are used for incrementing the counter blocks. + // ctr_pattern points to the four 128-bit values [0, 1, 2, 3]. + // inc_2blocks and inc_4blocks point to the single 128-bit values 2 and + // 4. 
Note that the same '2' is reused in ctr_pattern and inc_2blocks.
+.Lctr_pattern:
+	.octa	0
+	.octa	1
+.Linc_2blocks:
+	.octa	2
+	.octa	3
+.Linc_4blocks:
+	.octa	4
+
+// Number of powers of the hash key stored in the key struct. The powers are
+// stored from highest (H^NUM_H_POWERS) to lowest (H^1).
+#define NUM_H_POWERS		16
+
+// Offset to AES key length (in bytes) in the key struct
+#define OFFSETOF_AESKEYLEN	480
+
+// Offset to start of hash key powers array in the key struct
+#define OFFSETOF_H_POWERS	512
+
+// Offset to end of hash key powers array in the key struct.
+//
+// This is immediately followed by three zeroized padding blocks, which are
+// included so that partial vectors can be handled more easily. E.g. if VL=64
+// and two blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most
+// padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded.
+#define OFFSETOFEND_H_POWERS	(OFFSETOF_H_POWERS + (NUM_H_POWERS * 16))
+
+.text
+
+// Set the vector length in bytes. This sets the VL variable and defines
+// register aliases V0-V31 that map to the ymm or zmm registers.
+.macro	_set_veclen	vl
+	.set	VL, \vl
+.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
+	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+.if VL == 32
+	.set	V\i, %ymm\i
+.elseif VL == 64
+	.set	V\i, %zmm\i
+.else
+	.error "Unsupported vector length"
+.endif
+.endr
+.endm
+
+// The _ghash_mul_step macro does one step of GHASH multiplication of the
+// 128-bit lanes of \a by the corresponding 128-bit lanes of \b, storing the
+// reduced products in \dst. \t0, \t1, and \t2 are temporary registers of the
+// same size as \a and \b. To complete all steps, this must be invoked with
+// \i=0 through \i=9. The division into steps allows users of this macro to
+// optionally interleave the computation with other instructions. Users of this
+// macro must preserve the parameter registers across steps.
+//
+// The multiplications are done in GHASH's representation of the finite field
+// GF(2^128). Elements of GF(2^128) are represented as binary polynomials
+// (i.e. polynomials whose coefficients are bits) modulo a reducing polynomial
+// G. The GCM specification uses G = x^128 + x^7 + x^2 + x + 1. Addition is
+// just XOR, while multiplication is more complex and has two parts: (a) do
+// carryless multiplication of two 128-bit input polynomials to get a 256-bit
+// intermediate product polynomial, and (b) reduce the intermediate product to
+// 128 bits by adding multiples of G that cancel out terms in it. (Adding
+// multiples of G doesn't change which field element the polynomial represents.)
+//
+// Unfortunately, the GCM specification maps bits to/from polynomial
+// coefficients backwards from the natural order. In each byte it specifies the
+// highest bit to be the lowest order polynomial coefficient, *not* the highest!
+// This makes it nontrivial to work with the GHASH polynomials. We could
+// reflect the bits, but x86 doesn't have an instruction that does that.
+//
+// Instead, we operate on the values without bit-reflecting them. This *mostly*
+// just works, since XOR and carryless multiplication are symmetric with respect
+// to bit order, but it has some consequences. First, due to GHASH's byte
+// order, by skipping bit reflection, *byte* reflection becomes necessary to
+// give the polynomial terms a consistent order.
E.g., considering an N-bit +// value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0 +// through N-1 of the byte-reflected value represent the coefficients of x^(N-1) +// through x^0, whereas bits 0 through N-1 of the non-byte-reflected value +// represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8) which can't be worked +// with. Fortunately, x86's vpshufb instruction can do byte reflection. +// +// Second, forgoing the bit reflection causes an extra multiple of x (still +// using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each +// multiplication. This is because an M-bit by N-bit carryless multiplication +// really produces a (M+N-1)-bit product, but in practice it's zero-extended to +// M+N bits. In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits +// to polynomial coefficients backwards, this zero-extension actually changes +// the product by introducing an extra factor of x. Therefore, users of this +// macro must ensure that one of the inputs has an extra factor of x^-1, i.e. +// the multiplicative inverse of x, to cancel out the extra x. +// +// Third, the backwards coefficients convention is just confusing to work with, +// since it makes "low" and "high" in the polynomial math mean the opposite of +// their normal meaning in computer programming. This can be solved by using an +// alternative interpretation: the polynomial coefficients are understood to be +// in the natural order, and the multiplication is actually \a * \b * x^-128 mod +// x^128 + x^127 + x^126 + x^121 + 1. This doesn't change the inputs, outputs, +// or the implementation at all; it just changes the mathematical interpretation +// of what each instruction is doing. Starting from here, we'll use this +// alternative interpretation, as it's easier to understand the code that way. +// +// Moving onto the implementation, the vpclmulqdq instruction does 64 x 64 => +// 128-bit carryless multiplication, so we break the 128 x 128 multiplication +// into parts as follows (the _L and _H suffixes denote low and high 64 bits): +// +// LO = a_L * b_L +// MI = (a_L * b_H) + (a_H * b_L) +// HI = a_H * b_H +// +// The 256-bit product is x^128*HI + x^64*MI + LO. LO, MI, and HI are 128-bit. +// Note that MI "overlaps" with LO and HI. We don't consolidate MI into LO and +// HI right away, since the way the reduction works makes that unnecessary. +// +// For the reduction, we cancel out the low 128 bits by adding multiples of G = +// x^128 + x^127 + x^126 + x^121 + 1. This is done by two iterations, each of +// which cancels out the next lowest 64 bits. Consider a value x^64*A + B, +// where A and B are 128-bit. Adding B_L*G to that value gives: +// +// x^64*A + B + B_L*G +// = x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1) +// = x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L +// = x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L +// = x^64*(A + B_H + x^64*B_L + B_L*(x^63 + x^62 + x^57)) +// +// So: if we sum A, B with its halves swapped, and the low half of B times x^63 +// + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the +// original value x^64*A + B. I.e., the low 64 bits got canceled out. +// +// We just need to apply this twice: first to fold LO into MI, and second to +// fold the updated MI into HI. +// +// The needed three-argument XORs are done using the vpternlogd instruction with +// immediate 0x96, since this is faster than two vpxord instructions. 
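+//
+// Putting the two folds together, the reduction implemented by _ghash_reduce
+// (and by the folding steps of _ghash_mul_step) is, in pseudocode:
+//
+//	MI ^= swap_halves(LO) ^ (LO_L * (x^63 + x^62 + x^57))	// Fold LO into MI
+//	HI ^= swap_halves(MI) ^ (MI_L * (x^63 + x^62 + x^57))	// Fold MI into HI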
+//
+// A potential optimization, assuming that b is fixed per-key (if a is fixed
+// per-key it would work the other way around), is to use one iteration of the
+// reduction described above to precompute a value c such that x^64*c = b mod G,
+// and then multiply a_L by c (and implicitly by x^64) instead of by b:
+//
+//	MI = (a_L * c_L) + (a_H * b_L)
+//	HI = (a_L * c_H) + (a_H * b_H)
+//
+// This would eliminate the LO part of the intermediate product, which would
+// eliminate the need to fold LO into MI. This would save two instructions,
+// including a vpclmulqdq. However, we currently don't use this optimization
+// because it would require twice as many per-key precomputed values.
+//
+// Using Karatsuba multiplication instead of "schoolbook" multiplication
+// similarly would save a vpclmulqdq but does not seem to be worth it.
+.macro	_ghash_mul_step	i, a, b, dst, gfpoly, t0, t1, t2
+.if \i == 0
+	vpclmulqdq	$0x00, \a, \b, \t0	  // LO = a_L * b_L
+	vpclmulqdq	$0x01, \a, \b, \t1	  // MI_0 = a_L * b_H
+.elseif \i == 1
+	vpclmulqdq	$0x10, \a, \b, \t2	  // MI_1 = a_H * b_L
+.elseif \i == 2
+	vpxord		\t2, \t1, \t1		  // MI = MI_0 + MI_1
+.elseif \i == 3
+	vpclmulqdq	$0x01, \t0, \gfpoly, \t2  // LO_L*(x^63 + x^62 + x^57)
+.elseif \i == 4
+	vpshufd		$0x4e, \t0, \t0		  // Swap halves of LO
+.elseif \i == 5
+	vpternlogd	$0x96, \t2, \t0, \t1	  // Fold LO into MI
+.elseif \i == 6
+	vpclmulqdq	$0x11, \a, \b, \dst	  // HI = a_H * b_H
+.elseif \i == 7
+	vpclmulqdq	$0x01, \t1, \gfpoly, \t0  // MI_L*(x^63 + x^62 + x^57)
+.elseif \i == 8
+	vpshufd		$0x4e, \t1, \t1		  // Swap halves of MI
+.elseif \i == 9
+	vpternlogd	$0x96, \t0, \t1, \dst	  // Fold MI into HI
+.endif
+.endm
+
+// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
+// the reduced products in \dst. See _ghash_mul_step for full explanation.
+.macro	_ghash_mul	a, b, dst, gfpoly, t0, t1, t2
+.irp i, 0,1,2,3,4,5,6,7,8,9
+	_ghash_mul_step	\i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2
+.endr
+.endm
+
+// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
+// *unreduced* products to \lo, \mi, and \hi.
+.macro	_ghash_mul_noreduce	a, b, lo, mi, hi, t0, t1, t2, t3
+	vpclmulqdq	$0x00, \a, \b, \t0	// a_L * b_L
+	vpclmulqdq	$0x01, \a, \b, \t1	// a_L * b_H
+	vpclmulqdq	$0x10, \a, \b, \t2	// a_H * b_L
+	vpclmulqdq	$0x11, \a, \b, \t3	// a_H * b_H
+	vpxord		\t0, \lo, \lo
+	vpternlogd	$0x96, \t2, \t1, \mi
+	vpxord		\t3, \hi, \hi
+.endm
+
+// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
+// reduced products in \hi. See _ghash_mul_step for explanation of reduction.
+.macro	_ghash_reduce	lo, mi, hi, gfpoly, t0
+	vpclmulqdq	$0x01, \lo, \gfpoly, \t0
+	vpshufd		$0x4e, \lo, \lo
+	vpternlogd	$0x96, \t0, \lo, \mi
+	vpclmulqdq	$0x01, \mi, \gfpoly, \t0
+	vpshufd		$0x4e, \mi, \mi
+	vpternlogd	$0x96, \t0, \mi, \hi
+.endm
+
+// void aes_gcm_precompute_##suffix(struct aes_gcm_key_avx10 *key);
+//
+// Given the expanded AES key |key->aes_key|, this function derives the GHASH
+// subkey and initializes |key->ghash_key_powers| with powers of it.
+//
+// The number of key powers initialized is NUM_H_POWERS, and they are stored in
+// the order H^NUM_H_POWERS to H^1. The zeroized padding blocks after the key
+// powers themselves are also initialized.
+//
+// This macro supports both VL=32 and VL=64. _set_veclen must have been invoked
+// with the desired length. In the VL=32 case, the function computes twice as
+// many key powers as are actually used by the VL=32 GCM update functions.
+// This is done to keep the key format the same regardless of vector length. +.macro _aes_gcm_precompute + + // Function arguments + .set KEY, %rdi + + // Additional local variables. V0-V2 and %rax are used as temporaries. + .set POWERS_PTR, %rsi + .set RNDKEYLAST_PTR, %rdx + .set H_CUR, V3 + .set H_CUR_YMM, %ymm3 + .set H_CUR_XMM, %xmm3 + .set H_INC, V4 + .set H_INC_YMM, %ymm4 + .set H_INC_XMM, %xmm4 + .set GFPOLY, V5 + .set GFPOLY_YMM, %ymm5 + .set GFPOLY_XMM, %xmm5 + + // Get pointer to lowest set of key powers (located at end of array). + lea OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR + + // Encrypt an all-zeroes block to get the raw hash subkey. + movl OFFSETOF_AESKEYLEN(KEY), %eax + lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR + vmovdqu (KEY), %xmm0 // Zero-th round key XOR all-zeroes block + add $16, KEY +1: + vaesenc (KEY), %xmm0, %xmm0 + add $16, KEY + cmp KEY, RNDKEYLAST_PTR + jne 1b + vaesenclast (RNDKEYLAST_PTR), %xmm0, %xmm0 + + // Reflect the bytes of the raw hash subkey. + vpshufb .Lbswap_mask(%rip), %xmm0, H_CUR_XMM + + // Zeroize the padding blocks. + vpxor %xmm0, %xmm0, %xmm0 + vmovdqu %ymm0, VL(POWERS_PTR) + vmovdqu %xmm0, VL+2*16(POWERS_PTR) + + // Finish preprocessing the first key power, H^1. Since this GHASH + // implementation operates directly on values with the backwards bit + // order specified by the GCM standard, it's necessary to preprocess the + // raw key as follows. First, reflect its bytes. Second, multiply it + // by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards + // interpretation of polynomial coefficients), which can also be + // interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121 + // + 1 using the alternative, natural interpretation of polynomial + // coefficients. For details, see the comment above _ghash_mul_step. + // + // Either way, for the multiplication the concrete operation performed + // is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2 + // << 120) | 1 if a 1 bit was carried out. However, there's no 128-bit + // wide shift instruction, so instead double each of the two 64-bit + // halves and incorporate the internal carry bit into the value XOR'd. + vpshufd $0xd3, H_CUR_XMM, %xmm0 + vpsrad $31, %xmm0, %xmm0 + vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM + // H_CUR_XMM ^= xmm0 & gfpoly_and_internal_carrybit + vpternlogd $0x78, .Lgfpoly_and_internal_carrybit(%rip), %xmm0, H_CUR_XMM + + // Load the gfpoly constant. + vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY + + // Square H^1 to get H^2. + // + // Note that as with H^1, all higher key powers also need an extra + // factor of x^-1 (or x using the natural interpretation). Nothing + // special needs to be done to make this happen, though: H^1 * H^1 would + // end up with two factors of x^-1, but the multiplication consumes one. + // So the product H^2 ends up with the desired one factor of x^-1. + _ghash_mul H_CUR_XMM, H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, \ + %xmm0, %xmm1, %xmm2 + + // Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2]. + vinserti128 $1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM + vinserti128 $1, H_INC_XMM, H_INC_YMM, H_INC_YMM + +.if VL == 64 + // Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4]. + _ghash_mul H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \ + %ymm0, %ymm1, %ymm2 + vinserti64x4 $1, H_CUR_YMM, H_INC, H_CUR + vshufi64x2 $0, H_INC, H_INC, H_INC +.endif + + // Store the lowest set of key powers. + vmovdqu8 H_CUR, (POWERS_PTR) + + // Compute and store the remaining key powers. 
With VL=32, repeatedly + // multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)]. + // With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by + // [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)]. + mov $(NUM_H_POWERS*16/VL) - 1, %eax +.Lprecompute_next\@: + sub $VL, POWERS_PTR + _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, V0, V1, V2 + vmovdqu8 H_CUR, (POWERS_PTR) + dec %eax + jnz .Lprecompute_next\@ + + vzeroupper // This is needed after using ymm or zmm registers. + RET +.endm + +// XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store +// the result in \dst_xmm. This implicitly zeroizes the other lanes of dst. +.macro _horizontal_xor src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm + vextracti32x4 $1, \src, \t0_xmm +.if VL == 32 + vpxord \t0_xmm, \src_xmm, \dst_xmm +.elseif VL == 64 + vextracti32x4 $2, \src, \t1_xmm + vextracti32x4 $3, \src, \t2_xmm + vpxord \t0_xmm, \src_xmm, \dst_xmm + vpternlogd $0x96, \t1_xmm, \t2_xmm, \dst_xmm +.else + .error "Unsupported vector length" +.endif +.endm + +// Do one step of the GHASH update of the data blocks given in the vector +// registers GHASHDATA[0-3]. \i specifies the step to do, 0 through 9. The +// division into steps allows users of this macro to optionally interleave the +// computation with other instructions. This macro uses the vector register +// GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered; +// H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and +// GHASHTMP[0-2] as temporaries. This macro handles the byte-reflection of the +// data blocks. The parameter registers must be preserved across steps. +// +// The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) + +// H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the +// operations are vectorized operations on vectors of 16-byte blocks. E.g., +// with VL=32 there are 2 blocks per vector and the vectorized terms correspond +// to the following non-vectorized terms: +// +// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0) +// H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3 +// H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5 +// H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7 +// +// With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15. +// +// More concretely, this code does: +// - Do vectorized "schoolbook" multiplications to compute the intermediate +// 256-bit product of each block and its corresponding hash key power. +// There are 4*VL/16 of these intermediate products. +// - Sum (XOR) the intermediate 256-bit products across vectors. This leaves +// VL/16 256-bit intermediate values. +// - Do a vectorized reduction of these 256-bit intermediate values to +// 128-bits each. This leaves VL/16 128-bit intermediate values. +// - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM. +// +// See _ghash_mul_step for the full explanation of the operations performed for +// each individual finite field multiplication and reduction. 
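+//
+// As a hedged cross-check (a Python 3 reference sketch, not part of the
+// generated code), the identity that justifies multiplying each block by its
+// own key power and summing can be verified with the standard bit-reflected
+// GF(2^128) multiplication from the GCM specification:
+//
+//	import os
+//
+//	def gf128_mul(x, y):
+//	    R = 0xE1 << 120            # x^128 = x^7 + x^2 + x + 1, reflected
+//	    z, v = 0, x
+//	    for i in range(127, -1, -1):
+//	        if (y >> i) & 1:
+//	            z ^= v
+//	        v = (v >> 1) ^ (R if v & 1 else 0)
+//	    return z
+//
+//	rand_blk = lambda: int.from_bytes(os.urandom(16), 'big')
+//	acc, h = rand_blk(), rand_blk()
+//	blks = [rand_blk() for _ in range(4)]
+//	serial = acc                   # the usual one-block-at-a-time GHASH
+//	for b in blks:
+//	    serial = gf128_mul(serial ^ b, h)
+//	hp = [h]                       # h^1 .. h^4
+//	for _ in range(3):
+//	    hp.append(gf128_mul(hp[-1], h))
+//	parallel = (gf128_mul(blks[0] ^ acc, hp[3]) ^ gf128_mul(blks[1], hp[2]) ^
+//	            gf128_mul(blks[2], hp[1]) ^ gf128_mul(blks[3], hp[0]))
+//	assert serial == parallel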
+.macro _ghash_step_4x i +.if \i == 0 + vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0 + vpxord GHASH_ACC, GHASHDATA0, GHASHDATA0 + vpshufb BSWAP_MASK, GHASHDATA1, GHASHDATA1 + vpshufb BSWAP_MASK, GHASHDATA2, GHASHDATA2 +.elseif \i == 1 + vpshufb BSWAP_MASK, GHASHDATA3, GHASHDATA3 + vpclmulqdq $0x00, H_POW4, GHASHDATA0, GHASH_ACC // LO_0 + vpclmulqdq $0x00, H_POW3, GHASHDATA1, GHASHTMP0 // LO_1 + vpclmulqdq $0x00, H_POW2, GHASHDATA2, GHASHTMP1 // LO_2 +.elseif \i == 2 + vpxord GHASHTMP0, GHASH_ACC, GHASH_ACC // sum(LO_{1,0}) + vpclmulqdq $0x00, H_POW1, GHASHDATA3, GHASHTMP2 // LO_3 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASH_ACC // LO = sum(LO_{3,2,1,0}) + vpclmulqdq $0x01, H_POW4, GHASHDATA0, GHASHTMP0 // MI_0 +.elseif \i == 3 + vpclmulqdq $0x01, H_POW3, GHASHDATA1, GHASHTMP1 // MI_1 + vpclmulqdq $0x01, H_POW2, GHASHDATA2, GHASHTMP2 // MI_2 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{2,1,0}) + vpclmulqdq $0x01, H_POW1, GHASHDATA3, GHASHTMP1 // MI_3 +.elseif \i == 4 + vpclmulqdq $0x10, H_POW4, GHASHDATA0, GHASHTMP2 // MI_4 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{4,3,2,1,0}) + vpclmulqdq $0x10, H_POW3, GHASHDATA1, GHASHTMP1 // MI_5 + vpclmulqdq $0x10, H_POW2, GHASHDATA2, GHASHTMP2 // MI_6 +.elseif \i == 5 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{6,5,4,3,2,1,0}) + vpclmulqdq $0x01, GHASH_ACC, GFPOLY, GHASHTMP2 // LO_L*(x^63 + x^62 + x^57) + vpclmulqdq $0x10, H_POW1, GHASHDATA3, GHASHTMP1 // MI_7 + vpxord GHASHTMP1, GHASHTMP0, GHASHTMP0 // MI = sum(MI_{7,6,5,4,3,2,1,0}) +.elseif \i == 6 + vpshufd $0x4e, GHASH_ACC, GHASH_ACC // Swap halves of LO + vpclmulqdq $0x11, H_POW4, GHASHDATA0, GHASHDATA0 // HI_0 + vpclmulqdq $0x11, H_POW3, GHASHDATA1, GHASHDATA1 // HI_1 + vpclmulqdq $0x11, H_POW2, GHASHDATA2, GHASHDATA2 // HI_2 +.elseif \i == 7 + vpternlogd $0x96, GHASHTMP2, GHASH_ACC, GHASHTMP0 // Fold LO into MI + vpclmulqdq $0x11, H_POW1, GHASHDATA3, GHASHDATA3 // HI_3 + vpternlogd $0x96, GHASHDATA2, GHASHDATA1, GHASHDATA0 // sum(HI_{2,1,0}) + vpclmulqdq $0x01, GHASHTMP0, GFPOLY, GHASHTMP1 // MI_L*(x^63 + x^62 + x^57) +.elseif \i == 8 + vpxord GHASHDATA3, GHASHDATA0, GHASH_ACC // HI = sum(HI_{3,2,1,0}) + vpshufd $0x4e, GHASHTMP0, GHASHTMP0 // Swap halves of MI + vpternlogd $0x96, GHASHTMP1, GHASHTMP0, GHASH_ACC // Fold MI into HI +.elseif \i == 9 + _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \ + GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM +.endif +.endm + +// Do one non-last round of AES encryption on the counter blocks in V0-V3 using +// the round key that has been broadcast to all 128-bit lanes of \round_key. +.macro _vaesenc_4x round_key + vaesenc \round_key, V0, V0 + vaesenc \round_key, V1, V1 + vaesenc \round_key, V2, V2 + vaesenc \round_key, V3, V3 +.endm + +// Start the AES encryption of four vectors of counter blocks. +.macro _ctr_begin_4x + + // Increment LE_CTR four times to generate four vectors of little-endian + // counter blocks, swap each to big-endian, and store them in V0-V3. + vpshufb BSWAP_MASK, LE_CTR, V0 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + vpshufb BSWAP_MASK, LE_CTR, V1 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + vpshufb BSWAP_MASK, LE_CTR, V2 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + vpshufb BSWAP_MASK, LE_CTR, V3 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + + // AES "round zero": XOR in the zero-th round key. 
+ vpxord RNDKEY0, V0, V0 + vpxord RNDKEY0, V1, V1 + vpxord RNDKEY0, V2, V2 + vpxord RNDKEY0, V3, V3 +.endm + +// Do the last AES round for four vectors of counter blocks V0-V3, XOR source +// data with the resulting keystream, and write the result to DST and +// GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.) +.macro _aesenclast_and_xor_4x + // XOR the source data with the last round key, saving the result in + // GHASHDATA[0-3]. This reduces latency by taking advantage of the + // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). + vpxord 0*VL(SRC), RNDKEYLAST, GHASHDATA0 + vpxord 1*VL(SRC), RNDKEYLAST, GHASHDATA1 + vpxord 2*VL(SRC), RNDKEYLAST, GHASHDATA2 + vpxord 3*VL(SRC), RNDKEYLAST, GHASHDATA3 + + // Do the last AES round. This handles the XOR with the source data + // too, as per the optimization described above. + vaesenclast GHASHDATA0, V0, GHASHDATA0 + vaesenclast GHASHDATA1, V1, GHASHDATA1 + vaesenclast GHASHDATA2, V2, GHASHDATA2 + vaesenclast GHASHDATA3, V3, GHASHDATA3 + + // Store the en/decrypted data to DST. + vmovdqu8 GHASHDATA0, 0*VL(DST) + vmovdqu8 GHASHDATA1, 1*VL(DST) + vmovdqu8 GHASHDATA2, 2*VL(DST) + vmovdqu8 GHASHDATA3, 3*VL(DST) +.endm + +// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// const u8 *src, u8 *dst, int datalen); +// +// This macro generates a GCM encryption or decryption update function with the +// above prototype (with \enc selecting which one). This macro supports both +// VL=32 and VL=64. _set_veclen must have been invoked with the desired length. +// +// This function computes the next portion of the CTR keystream, XOR's it with +// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted +// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the +// next |datalen| ciphertext bytes. +// +// |datalen| must be a multiple of 16, except on the last call where it can be +// any length. The caller must do any buffering needed to ensure this. Both +// in-place and out-of-place en/decryption are supported. +// +// |le_ctr| must give the current counter in little-endian format. For a new +// message, the low word of the counter must be 2. This function loads the +// counter from |le_ctr| and increments the loaded counter as needed, but it +// does *not* store the updated counter back to |le_ctr|. The caller must +// update |le_ctr| if any more data segments follow. Internally, only the low +// 32-bit word of the counter is incremented, following the GCM standard. +.macro _aes_gcm_update enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi + .set GHASH_ACC_PTR, %rdx + .set SRC, %rcx + .set DST, %r8 + .set DATALEN, %r9d + .set DATALEN64, %r9 // Zero-extend DATALEN before using! + + // Additional local variables + + // %rax and %k1 are used as temporary registers. LE_CTR_PTR is also + // available as a temporary register after the counter is loaded. + + // AES key length in bytes + .set AESKEYLEN, %r10d + .set AESKEYLEN64, %r10 + + // Pointer to the last AES round key for the chosen AES variant + .set RNDKEYLAST_PTR, %r11 + + // In the main loop, V0-V3 are used as AES input and output. Elsewhere + // they are used as temporary registers. + + // GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data. 
+ .set GHASHDATA0, V4 + .set GHASHDATA0_XMM, %xmm4 + .set GHASHDATA1, V5 + .set GHASHDATA1_XMM, %xmm5 + .set GHASHDATA2, V6 + .set GHASHDATA2_XMM, %xmm6 + .set GHASHDATA3, V7 + + // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values + // using vpshufb, copied to all 128-bit lanes. + .set BSWAP_MASK, V8 + + // RNDKEY temporarily holds the next AES round key. + .set RNDKEY, V9 + + // GHASH_ACC is the accumulator variable for GHASH. When fully reduced, + // only the lowest 128-bit lane can be nonzero. When not fully reduced, + // more than one lane may be used, and they need to be XOR'd together. + .set GHASH_ACC, V10 + .set GHASH_ACC_XMM, %xmm10 + + // LE_CTR_INC is the vector of 32-bit words that need to be added to a + // vector of little-endian counter blocks to advance it forwards. + .set LE_CTR_INC, V11 + + // LE_CTR contains the next set of little-endian counter blocks. + .set LE_CTR, V12 + + // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys, + // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key, + // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last. + .set RNDKEY0, V13 + .set RNDKEYLAST, V14 + .set RNDKEY_M9, V15 + .set RNDKEY_M8, V16 + .set RNDKEY_M7, V17 + .set RNDKEY_M6, V18 + .set RNDKEY_M5, V19 + .set RNDKEY_M4, V20 + .set RNDKEY_M3, V21 + .set RNDKEY_M2, V22 + .set RNDKEY_M1, V23 + + // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These + // cannot coincide with anything used for AES encryption, since for + // performance reasons GHASH and AES encryption are interleaved. + .set GHASHTMP0, V24 + .set GHASHTMP1, V25 + .set GHASHTMP2, V26 + + // H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1. The + // descending numbering reflects the order of the key powers. + .set H_POW4, V27 + .set H_POW3, V28 + .set H_POW2, V29 + .set H_POW1, V30 + + // GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes. + .set GFPOLY, V31 + + // Load some constants. + vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK + vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY + + // Load the GHASH accumulator and the starting counter. + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM + vbroadcasti32x4 (LE_CTR_PTR), LE_CTR + + // Load the AES key length in bytes. + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + + // Make RNDKEYLAST_PTR point to the last AES round key. This is the + // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 + // respectively. Then load the zero-th and last round keys. + lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR + vbroadcasti32x4 (KEY), RNDKEY0 + vbroadcasti32x4 (RNDKEYLAST_PTR), RNDKEYLAST + + // Finish initializing LE_CTR by adding [0, 1, ...] to its low words. + vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR + + // Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes. +.if VL == 32 + vbroadcasti32x4 .Linc_2blocks(%rip), LE_CTR_INC +.elseif VL == 64 + vbroadcasti32x4 .Linc_4blocks(%rip), LE_CTR_INC +.else + .error "Unsupported vector length" +.endif + + // If there are at least 4*VL bytes of data, then continue into the loop + // that processes 4*VL bytes of data at a time. Otherwise skip it. + // + // Pre-subtracting 4*VL from DATALEN saves an instruction from the main + // loop and also ensures that at least one write always occurs to + // DATALEN, zero-extending it and allowing DATALEN64 to be used later. + add $-4*VL, DATALEN // shorter than 'sub 4*VL' when VL=32 + jl .Lcrypt_loop_4x_done\@ + + // Load powers of the hash key. 
+ vmovdqu8 OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4 + vmovdqu8 OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3 + vmovdqu8 OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2 + vmovdqu8 OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1 + + // Main loop: en/decrypt and hash 4 vectors at a time. + // + // When possible, interleave the AES encryption of the counter blocks + // with the GHASH update of the ciphertext blocks. This improves + // performance on many CPUs because the execution ports used by the VAES + // instructions often differ from those used by vpclmulqdq and other + // instructions used in GHASH. For example, many Intel CPUs dispatch + // vaesenc to ports 0 and 1 and vpclmulqdq to port 5. + // + // The interleaving is easiest to do during decryption, since during + // decryption the ciphertext blocks are immediately available. For + // encryption, instead encrypt the first set of blocks, then hash those + // blocks while encrypting the next set of blocks, repeat that as + // needed, and finally hash the last set of blocks. + +.if \enc + // Encrypt the first 4 vectors of plaintext blocks. Leave the resulting + // ciphertext in GHASHDATA[0-3] for GHASH. + _ctr_begin_4x + lea 16(KEY), %rax +1: + vbroadcasti32x4 (%rax), RNDKEY + _vaesenc_4x RNDKEY + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b + _aesenclast_and_xor_4x + sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 + sub $-4*VL, DST + add $-4*VL, DATALEN + jl .Lghash_last_ciphertext_4x\@ +.endif + + // Cache as many additional AES round keys as possible. +.irp i, 9,8,7,6,5,4,3,2,1 + vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY_M\i +.endr + +.Lcrypt_loop_4x\@: + + // If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If + // encrypting, GHASHDATA[0-3] already contain the previous ciphertext. +.if !\enc + vmovdqu8 0*VL(SRC), GHASHDATA0 + vmovdqu8 1*VL(SRC), GHASHDATA1 + vmovdqu8 2*VL(SRC), GHASHDATA2 + vmovdqu8 3*VL(SRC), GHASHDATA3 +.endif + + // Start the AES encryption of the counter blocks. + _ctr_begin_4x + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? + // AES-256 + vbroadcasti32x4 -13*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY + vbroadcasti32x4 -12*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY +192: + vbroadcasti32x4 -11*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY + vbroadcasti32x4 -10*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY +128: + + // Finish the AES encryption of the counter blocks in V0-V3, interleaved + // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3]. +.irp i, 9,8,7,6,5,4,3,2,1 + _ghash_step_4x (9 - \i) + _vaesenc_4x RNDKEY_M\i +.endr + _ghash_step_4x 9 + _aesenclast_and_xor_4x + sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 + sub $-4*VL, DST + add $-4*VL, DATALEN + jge .Lcrypt_loop_4x\@ + +.if \enc +.Lghash_last_ciphertext_4x\@: + // Update GHASH with the last set of ciphertext blocks. +.irp i, 0,1,2,3,4,5,6,7,8,9 + _ghash_step_4x \i +.endr +.endif + +.Lcrypt_loop_4x_done\@: + + // Undo the extra subtraction by 4*VL and check whether data remains. + sub $-4*VL, DATALEN // shorter than 'add 4*VL' when VL=32 + jz .Ldone\@ + + // The data length isn't a multiple of 4*VL. Process the remaining data + // of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time. + // Going one vector at a time may seem inefficient compared to having + // separate code paths for each possible number of vectors remaining. 
+ // However, using a loop keeps the code size down, and it performs
+ // surprisingly well; modern CPUs will start executing the next iteration
+ // before the previous one finishes and also predict the number of loop
+ // iterations. For a similar reason, we roll up the AES rounds.
+ //
+ // On the last iteration, the remaining length may be less than VL.
+ // Handle this using masking.
+ //
+ // Since there are enough key powers available for all remaining data,
+ // there is no need to do a GHASH reduction after each iteration.
+ // Instead, multiply each remaining block by its own key power, and only
+ // do a GHASH reduction at the very end.
+
+ // Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
+ // is the number of blocks that remain.
+ .set POWERS_PTR, LE_CTR_PTR // LE_CTR_PTR is free to be reused.
+ mov DATALEN, %eax
+ neg %rax
+ and $~15, %rax // -round_up(DATALEN, 16)
+ lea OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR
+
+ // Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+ .set LO, GHASHDATA0
+ .set LO_XMM, GHASHDATA0_XMM
+ .set MI, GHASHDATA1
+ .set MI_XMM, GHASHDATA1_XMM
+ .set HI, GHASHDATA2
+ .set HI_XMM, GHASHDATA2_XMM
+ vpxor LO_XMM, LO_XMM, LO_XMM
+ vpxor MI_XMM, MI_XMM, MI_XMM
+ vpxor HI_XMM, HI_XMM, HI_XMM
+
+.Lcrypt_loop_1x\@:
+
+ // Select the appropriate mask for this iteration: all 1's if
+ // DATALEN >= VL, otherwise DATALEN 1's. Do this branchlessly using the
+ // bzhi instruction from BMI2. (This relies on DATALEN <= 255.)
+.if VL < 64
+ mov $-1, %eax
+ bzhi DATALEN, %eax, %eax
+ kmovd %eax, %k1
+.else
+ mov $-1, %rax
+ bzhi DATALEN64, %rax, %rax
+ kmovq %rax, %k1
+.endif
+
+ // Encrypt a vector of counter blocks. This does not need to be masked.
+ vpshufb BSWAP_MASK, LE_CTR, V0
+ vpaddd LE_CTR_INC, LE_CTR, LE_CTR
+ vpxord RNDKEY0, V0, V0
+ lea 16(KEY), %rax
+1:
+ vbroadcasti32x4 (%rax), RNDKEY
+ vaesenc RNDKEY, V0, V0
+ add $16, %rax
+ cmp %rax, RNDKEYLAST_PTR
+ jne 1b
+ vaesenclast RNDKEYLAST, V0, V0
+
+ // XOR the data with the appropriate number of keystream bytes.
+ vmovdqu8 (SRC), V1{%k1}{z}
+ vpxord V1, V0, V0
+ vmovdqu8 V0, (DST){%k1}
+
+ // Update GHASH with the ciphertext block(s), without reducing.
+ //
+ // In the case of DATALEN < VL, the ciphertext is zero-padded to VL.
+ // (If decrypting, it's done by the above masked load. If encrypting,
+ // it's done by the below masked register-to-register move.) Note that
+ // if DATALEN <= VL - 16, there will be additional padding beyond the
+ // padding of the last block specified by GHASH itself; i.e., there may
+ // be whole block(s) that get processed by the GHASH multiplication and
+ // reduction instructions but should not actually be included in the
+ // GHASH. However, any such blocks are all-zeroes, and the values that
+ // they're multiplied with are also all-zeroes. Therefore they just add
+ // 0 * 0 = 0 to the final GHASH result, which makes no difference.
+ vmovdqu8 (POWERS_PTR), H_POW1
+.if \enc
+ vmovdqu8 V0, V1{%k1}{z}
+.endif
+ vpshufb BSWAP_MASK, V1, V0
+ vpxord GHASH_ACC, V0, V0
+ _ghash_mul_noreduce H_POW1, V0, LO, MI, HI, GHASHDATA3, V1, V2, V3
+ vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+
+ add $VL, POWERS_PTR
+ add $VL, SRC
+ add $VL, DST
+ sub $VL, DATALEN
+ jg .Lcrypt_loop_1x\@
+
+ // Finally, do the GHASH reduction.
+ _ghash_reduce LO, MI, HI, GFPOLY, V0
+ _horizontal_xor HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2
+
+.Ldone\@:
+ // Store the updated GHASH accumulator back to memory. 
+ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) + + vzeroupper // This is needed after using ymm or zmm registers. + RET +.endm + +// void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen); +// bool aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, +// const u32 le_ctr[4], +// const u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen, +// const u8 tag[16], int taglen); +// +// This macro generates one of the above two functions (with \enc selecting +// which one). Both functions finish computing the GCM authentication tag by +// updating GHASH with the lengths block and encrypting the GHASH accumulator. +// |total_aadlen| and |total_datalen| must be the total length of the additional +// authenticated data and the en/decrypted data in bytes, respectively. +// +// The encryption function then stores the full-length (16-byte) computed +// authentication tag to |ghash_acc|. The decryption function instead loads the +// expected authentication tag (the one that was transmitted) from the 16-byte +// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the +// computed tag in constant time, and returns true if and only if they match. +.macro _aes_gcm_final enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi + .set GHASH_ACC_PTR, %rdx + .set TOTAL_AADLEN, %rcx + .set TOTAL_DATALEN, %r8 + .set TAG, %r9 + .set TAGLEN, %r10d // Originally at 8(%rsp) + + // Additional local variables. + // %rax, %xmm0-%xmm3, and %k1 are used as temporary registers. + .set AESKEYLEN, %r11d + .set AESKEYLEN64, %r11 + .set GFPOLY, %xmm4 + .set BSWAP_MASK, %xmm5 + .set LE_CTR, %xmm6 + .set GHASH_ACC, %xmm7 + .set H_POW1, %xmm8 + + // Load some constants. + vmovdqa .Lgfpoly(%rip), GFPOLY + vmovdqa .Lbswap_mask(%rip), BSWAP_MASK + + // Load the AES key length in bytes. + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + + // Set up a counter block with 1 in the low 32-bit word. This is the + // counter that produces the ciphertext needed to encrypt the auth tag. + // GFPOLY has 1 in the low word, so grab the 1 from there using a blend. + vpblendd $0xe, (LE_CTR_PTR), GFPOLY, LE_CTR + + // Build the lengths block and XOR it with the GHASH accumulator. + // Although the lengths block is defined as the AAD length followed by + // the en/decrypted data length, both in big-endian byte order, a byte + // reflection of the full block is needed because of the way we compute + // GHASH (see _ghash_mul_step). By using little-endian values in the + // opposite order, we avoid having to reflect any bytes here. + vmovq TOTAL_DATALEN, %xmm0 + vpinsrq $1, TOTAL_AADLEN, %xmm0, %xmm0 + vpsllq $3, %xmm0, %xmm0 // Bytes to bits + vpxor (GHASH_ACC_PTR), %xmm0, GHASH_ACC + + // Load the first hash key power (H^1), which is stored last. + vmovdqu8 OFFSETOFEND_H_POWERS-16(KEY), H_POW1 + +.if !\enc + // Prepare a mask of TAGLEN one bits. + movl 8(%rsp), TAGLEN + mov $-1, %eax + bzhi TAGLEN, %eax, %eax + kmovd %eax, %k1 +.endif + + // Make %rax point to the last AES round key for the chosen AES variant. + lea 6*16(KEY,AESKEYLEN64,4), %rax + + // Start the AES encryption of the counter block by swapping the counter + // block to big-endian and XOR-ing it with the zero-th AES round key. + vpshufb BSWAP_MASK, LE_CTR, %xmm0 + vpxor (KEY), %xmm0, %xmm0 + + // Complete the AES encryption and multiply GHASH_ACC by H^1. + // Interleave the AES and GHASH instructions to improve performance. 
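+ //
+ // As a conceptual reference only (not the code emitted here), what
+ // follows computes tag = AES-Encrypt(key, J0) XOR S, where J0 is the
+ // counter block built above and S is the final GHASH value. A hedged
+ // Python sketch, assuming the third-party 'cryptography' package and a
+ // hypothetical helper name gcm_tag:
+ //
+ //	from cryptography.hazmat.primitives.ciphers import (
+ //	    Cipher, algorithms, modes)
+ //
+ //	def gcm_tag(key: bytes, j0: bytes, s: bytes) -> bytes:
+ //	    ks = Cipher(algorithms.AES(key), modes.ECB()).encryptor()
+ //	    return bytes(a ^ b for a, b in zip(ks.update(j0), s))
+ //
+ // The assembly below interleaves these two computations and operates on
+ // the byte-reflected form of S.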
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ vaesenc -13*16(%rax), %xmm0, %xmm0
+ vaesenc -12*16(%rax), %xmm0, %xmm0
+192:
+ vaesenc -11*16(%rax), %xmm0, %xmm0
+ vaesenc -10*16(%rax), %xmm0, %xmm0
+128:
+.irp i, 0,1,2,3,4,5,6,7,8
+ _ghash_mul_step \i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %xmm1, %xmm2, %xmm3
+ vaesenc (\i-9)*16(%rax), %xmm0, %xmm0
+.endr
+ _ghash_mul_step 9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %xmm1, %xmm2, %xmm3
+
+ // Undo the byte reflection of the GHASH accumulator.
+ vpshufb BSWAP_MASK, GHASH_ACC, GHASH_ACC
+
+ // Do the last AES round and XOR the resulting keystream block with the
+ // GHASH accumulator to produce the full computed authentication tag.
+ //
+ // Reduce latency by taking advantage of the property vaesenclast(key,
+ // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last
+ // round key, instead of XOR'ing the final AES output with GHASH_ACC.
+ //
+ // enc_final then returns the computed auth tag, while dec_final
+ // compares it with the transmitted one and returns a bool. To compare
+ // the tags, dec_final XORs them together and uses vptest to check
+ // whether the result is all-zeroes. This should be constant-time.
+ // dec_final applies the vaesenclast optimization to this additional
+ // value XOR'd too, using vpternlogd to XOR the last round key, GHASH
+ // accumulator, and transmitted auth tag together in one instruction.
+.if \enc
+ vpxor (%rax), GHASH_ACC, %xmm1
+ vaesenclast %xmm1, %xmm0, GHASH_ACC
+ vmovdqu GHASH_ACC, (GHASH_ACC_PTR)
+.else
+ vmovdqu (TAG), %xmm1
+ vpternlogd $0x96, (%rax), GHASH_ACC, %xmm1
+ vaesenclast %xmm1, %xmm0, %xmm0
+ xor %eax, %eax
+ vmovdqu8 %xmm0, %xmm0{%k1}{z} // Truncate to TAGLEN bytes
+ vptest %xmm0, %xmm0
+ sete %al
+.endif
+ // No need for vzeroupper here, since only xmm registers were used.
+ RET
+.endm
+
+_set_veclen 32
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_256)
+ _aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_256)
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_256)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_256)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_256)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_256)
+
+_set_veclen 64
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_512)
+ _aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_512)
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_512)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_512)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_512)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_512)
+
+// void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+// u8 ghash_acc[16],
+// const u8 *aad, int aadlen);
+//
+// This function processes the AAD (Additional Authenticated Data) in GCM.
+// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
+// data given by |aad| and |aadlen|. |key->ghash_key_powers| must have been
+// initialized. On the first call, |ghash_acc| must be all zeroes. |aadlen|
+// must be a multiple of 16, except on the last call where it can be any length.
+// The caller must do any buffering needed to ensure this.
+//
+// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes.
+// Therefore, for AAD processing we currently only provide this implementation
+// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This
+// keeps the code size down, and it enables some micro-optimizations, e.g. 
using +// VEX-coded instructions instead of EVEX-coded to save some instruction bytes. +// To optimize for large amounts of AAD, we could implement a 4x-wide loop and +// provide a version using 512-bit vectors, but that doesn't seem to be useful. +SYM_FUNC_START(aes_gcm_aad_update_vaes_avx10) + + // Function arguments + .set KEY, %rdi + .set GHASH_ACC_PTR, %rsi + .set AAD, %rdx + .set AADLEN, %ecx + .set AADLEN64, %rcx // Zero-extend AADLEN before using! + + // Additional local variables. + // %rax, %ymm0-%ymm3, and %k1 are used as temporary registers. + .set BSWAP_MASK, %ymm4 + .set GFPOLY, %ymm5 + .set GHASH_ACC, %ymm6 + .set GHASH_ACC_XMM, %xmm6 + .set H_POW1, %ymm7 + + // Load some constants. + vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK + vbroadcasti128 .Lgfpoly(%rip), GFPOLY + + // Load the GHASH accumulator. + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM + + // Update GHASH with 32 bytes of AAD at a time. + // + // Pre-subtracting 32 from AADLEN saves an instruction from the loop and + // also ensures that at least one write always occurs to AADLEN, + // zero-extending it and allowing AADLEN64 to be used later. + sub $32, AADLEN + jl .Laad_loop_1x_done + vmovdqu8 OFFSETOFEND_H_POWERS-32(KEY), H_POW1 // [H^2, H^1] +.Laad_loop_1x: + vmovdqu (AAD), %ymm0 + vpshufb BSWAP_MASK, %ymm0, %ymm0 + vpxor %ymm0, GHASH_ACC, GHASH_ACC + _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + %ymm0, %ymm1, %ymm2 + vextracti128 $1, GHASH_ACC, %xmm0 + vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM + add $32, AAD + sub $32, AADLEN + jge .Laad_loop_1x +.Laad_loop_1x_done: + add $32, AADLEN + jz .Laad_done + + // Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD. + mov $-1, %eax + bzhi AADLEN, %eax, %eax + kmovd %eax, %k1 + vmovdqu8 (AAD), %ymm0{%k1}{z} + neg AADLEN64 + and $~15, AADLEN64 // -round_up(AADLEN, 16) + vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1 + vpshufb BSWAP_MASK, %ymm0, %ymm0 + vpxor %ymm0, GHASH_ACC, GHASH_ACC + _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + %ymm0, %ymm1, %ymm2 + vextracti128 $1, GHASH_ACC, %xmm0 + vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM + +.Laad_done: + // Store the updated GHASH accumulator back to memory. + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) + + vzeroupper // This is needed after using ymm or zmm registers. + RET +SYM_FUNC_END(aes_gcm_aad_update_vaes_avx10) + +SYM_FUNC_START(aes_gcm_enc_final_vaes_avx10) + _aes_gcm_final 1 +SYM_FUNC_END(aes_gcm_enc_final_vaes_avx10) +SYM_FUNC_START(aes_gcm_dec_final_vaes_avx10) + _aes_gcm_final 0 +SYM_FUNC_END(aes_gcm_dec_final_vaes_avx10) diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S new file mode 100644 index 000000000000..db79cdf81588 --- /dev/null +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -0,0 +1,907 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// AES-XTS for modern x86_64 CPUs +// +// Copyright 2024 Google LLC +// +// Author: Eric Biggers <ebiggers@google.com> +// +//------------------------------------------------------------------------------ +// +// This file is dual-licensed, meaning that you can use it under your choice of +// either of the following two licenses: +// +// Licensed under the Apache License 2.0 (the "License"). 
You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// or +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/* + * This file implements AES-XTS for modern x86_64 CPUs. To handle the + * complexities of coding for x86 SIMD, e.g. where every vector length needs + * different code, it uses a macro to generate several implementations that + * share similar source code but are targeted at different CPUs, listed below: + * + * AES-NI && AVX + * - 128-bit vectors (1 AES block per vector) + * - VEX-coded instructions + * - xmm0-xmm15 + * - This is for older CPUs that lack VAES but do have AVX. + * + * VAES && VPCLMULQDQ && AVX2 + * - 256-bit vectors (2 AES blocks per vector) + * - VEX-coded instructions + * - ymm0-ymm15 + * - This is for CPUs that have VAES but either lack AVX512 (e.g. Intel's + * Alder Lake and AMD's Zen 3) or downclock too eagerly when using zmm + * registers (e.g. Intel's Ice Lake). + * + * VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2 + * - 512-bit vectors (4 AES blocks per vector) + * - EVEX-coded instructions + * - zmm0-zmm31 + * - This is for CPUs that have good AVX512 support. + * + * This file doesn't have an implementation for AES-NI alone (without AVX), as + * the lack of VEX would make all the assembly code different. + * + * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of + * the XTS tweaks. This avoids a bottleneck. Currently there don't seem to be + * any CPUs that support VAES but not VPCLMULQDQ. If that changes, we might + * need to start also providing an implementation using VAES alone. + * + * The AES-XTS implementations in this file support everything required by the + * crypto API, including support for arbitrary input lengths and multi-part + * processing. However, they are most heavily optimized for the common case of + * power-of-2 length inputs that are processed in a single part (disk sectors). 
+ */ + +#include <linux/linkage.h> +#include <linux/cfi_types.h> + +.section .rodata +.p2align 4 +.Lgf_poly: + // The low 64 bits of this value represent the polynomial x^7 + x^2 + x + // + 1. It is the value that must be XOR'd into the low 64 bits of the + // tweak each time a 1 is carried out of the high 64 bits. + // + // The high 64 bits of this value is just the internal carry bit that + // exists when there's a carry out of the low 64 bits of the tweak. + .quad 0x87, 1 + + // These are the shift amounts that are needed when multiplying by [x^0, + // x^1, x^2, x^3] to compute the first vector of tweaks when VL=64. + // + // The right shifts by 64 are expected to zeroize the destination. + // 'vpsrlvq' is indeed defined to do that; i.e. it doesn't truncate the + // amount to 64 & 63 = 0 like the 'shr' scalar shift instruction would. +.Lrshift_amounts: + .byte 64, 64, 63, 63, 62, 62, 61, 61 +.Llshift_amounts: + .byte 0, 0, 1, 1, 2, 2, 3, 3 + + // This table contains constants for vpshufb and vpblendvb, used to + // handle variable byte shifts and blending during ciphertext stealing + // on CPUs that don't support AVX512-style masking. +.Lcts_permute_table: + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 +.text + +.macro _define_Vi i +.if VL == 16 + .set V\i, %xmm\i +.elseif VL == 32 + .set V\i, %ymm\i +.elseif VL == 64 + .set V\i, %zmm\i +.else + .error "Unsupported Vector Length (VL)" +.endif +.endm + +.macro _define_aliases + // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers + // are available, that map to the xmm, ymm, or zmm registers according + // to the selected Vector Length (VL). +.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + _define_Vi \i +.endr +.if USE_AVX512 +.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 + _define_Vi \i +.endr +.endif + + // Function parameters + .set KEY, %rdi // Initially points to crypto_aes_ctx, then is + // advanced to point to 7th-from-last round key + .set SRC, %rsi // Pointer to next source data + .set DST, %rdx // Pointer to next destination data + .set LEN, %ecx // Remaining length in bytes + .set LEN8, %cl + .set LEN64, %rcx + .set TWEAK, %r8 // Pointer to next tweak + + // %rax holds the AES key length in bytes. + .set KEYLEN, %eax + .set KEYLEN64, %rax + + // %r9-r11 are available as temporaries. + + // V0-V3 hold the data blocks during the main loop, or temporary values + // otherwise. V4-V5 hold temporary values. + + // V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak. + .set TWEAK0_XMM, %xmm6 + .set TWEAK0, V6 + .set TWEAK1_XMM, %xmm7 + .set TWEAK1, V7 + .set TWEAK2, V8 + .set TWEAK3, V9 + + // V10-V13 are used for computing the next values of TWEAK[0-3]. + .set NEXT_TWEAK0, V10 + .set NEXT_TWEAK1, V11 + .set NEXT_TWEAK2, V12 + .set NEXT_TWEAK3, V13 + + // V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes. + .set GF_POLY_XMM, %xmm14 + .set GF_POLY, V14 + + // V15 holds the key for AES "round 0", copied to all 128-bit lanes. + .set KEY0_XMM, %xmm15 + .set KEY0, V15 + + // If 32 SIMD registers are available, then V16-V29 hold the remaining + // AES round keys, copied to all 128-bit lanes. + // + // AES-128, AES-192, and AES-256 use different numbers of round keys. 
+ // To allow handling all three variants efficiently, we align the round + // keys to the *end* of this register range. I.e., AES-128 uses + // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14. + // (All also use KEY0 for the XOR-only "round" at the beginning.) +.if USE_AVX512 + .set KEY1_XMM, %xmm16 + .set KEY1, V16 + .set KEY2_XMM, %xmm17 + .set KEY2, V17 + .set KEY3_XMM, %xmm18 + .set KEY3, V18 + .set KEY4_XMM, %xmm19 + .set KEY4, V19 + .set KEY5_XMM, %xmm20 + .set KEY5, V20 + .set KEY6_XMM, %xmm21 + .set KEY6, V21 + .set KEY7_XMM, %xmm22 + .set KEY7, V22 + .set KEY8_XMM, %xmm23 + .set KEY8, V23 + .set KEY9_XMM, %xmm24 + .set KEY9, V24 + .set KEY10_XMM, %xmm25 + .set KEY10, V25 + .set KEY11_XMM, %xmm26 + .set KEY11, V26 + .set KEY12_XMM, %xmm27 + .set KEY12, V27 + .set KEY13_XMM, %xmm28 + .set KEY13, V28 + .set KEY14_XMM, %xmm29 + .set KEY14, V29 +.endif + // V30-V31 are currently unused. +.endm + +// Move a vector between memory and a register. +.macro _vmovdqu src, dst +.if VL < 64 + vmovdqu \src, \dst +.else + vmovdqu8 \src, \dst +.endif +.endm + +// Broadcast a 128-bit value into a vector. +.macro _vbroadcast128 src, dst +.if VL == 16 + vmovdqu \src, \dst +.elseif VL == 32 + vbroadcasti128 \src, \dst +.else + vbroadcasti32x4 \src, \dst +.endif +.endm + +// XOR two vectors together. +.macro _vpxor src1, src2, dst +.if VL < 64 + vpxor \src1, \src2, \dst +.else + vpxord \src1, \src2, \dst +.endif +.endm + +// XOR three vectors together. +.macro _xor3 src1, src2, src3_and_dst +.if USE_AVX512 + // vpternlogd with immediate 0x96 is a three-argument XOR. + vpternlogd $0x96, \src1, \src2, \src3_and_dst +.else + vpxor \src1, \src3_and_dst, \src3_and_dst + vpxor \src2, \src3_and_dst, \src3_and_dst +.endif +.endm + +// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak +// (by multiplying by the polynomial 'x') and write it to \dst. +.macro _next_tweak src, tmp, dst + vpshufd $0x13, \src, \tmp + vpaddq \src, \src, \dst + vpsrad $31, \tmp, \tmp +.if USE_AVX512 + vpternlogd $0x78, GF_POLY_XMM, \tmp, \dst +.else + vpand GF_POLY_XMM, \tmp, \tmp + vpxor \tmp, \dst, \dst +.endif +.endm + +// Given the XTS tweak(s) in the vector \src, compute the next vector of +// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst. +// +// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute +// all tweaks in the vector in parallel. If VL=16, we just do the regular +// computation without vpclmulqdq, as it's the faster method for a single tweak. +.macro _next_tweakvec src, tmp1, tmp2, dst +.if VL == 16 + _next_tweak \src, \tmp1, \dst +.else + vpsrlq $64 - VL/16, \src, \tmp1 + vpclmulqdq $0x01, GF_POLY, \tmp1, \tmp2 + vpslldq $8, \tmp1, \tmp1 + vpsllq $VL/16, \src, \dst + _xor3 \tmp1, \tmp2, \dst +.endif +.endm + +// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and +// store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5. +.macro _compute_first_set_of_tweaks +.if VL == 16 + vmovdqu (TWEAK), TWEAK0_XMM + vmovdqu .Lgf_poly(%rip), GF_POLY + _next_tweak TWEAK0, %xmm0, TWEAK1 + _next_tweak TWEAK1, %xmm0, TWEAK2 + _next_tweak TWEAK2, %xmm0, TWEAK3 +.elseif VL == 32 + vmovdqu (TWEAK), TWEAK0_XMM + vbroadcasti128 .Lgf_poly(%rip), GF_POLY + + // Compute the first vector of tweaks. 
+ _next_tweak TWEAK0_XMM, %xmm0, %xmm1 + vinserti128 $1, %xmm1, TWEAK0, TWEAK0 + + // Compute the next three vectors of tweaks: + // TWEAK1 = TWEAK0 * [x^2, x^2] + // TWEAK2 = TWEAK0 * [x^4, x^4] + // TWEAK3 = TWEAK0 * [x^6, x^6] + vpsrlq $64 - 2, TWEAK0, V0 + vpsrlq $64 - 4, TWEAK0, V2 + vpsrlq $64 - 6, TWEAK0, V4 + vpclmulqdq $0x01, GF_POLY, V0, V1 + vpclmulqdq $0x01, GF_POLY, V2, V3 + vpclmulqdq $0x01, GF_POLY, V4, V5 + vpslldq $8, V0, V0 + vpslldq $8, V2, V2 + vpslldq $8, V4, V4 + vpsllq $2, TWEAK0, TWEAK1 + vpsllq $4, TWEAK0, TWEAK2 + vpsllq $6, TWEAK0, TWEAK3 + vpxor V0, TWEAK1, TWEAK1 + vpxor V2, TWEAK2, TWEAK2 + vpxor V4, TWEAK3, TWEAK3 + vpxor V1, TWEAK1, TWEAK1 + vpxor V3, TWEAK2, TWEAK2 + vpxor V5, TWEAK3, TWEAK3 +.else + vbroadcasti32x4 (TWEAK), TWEAK0 + vbroadcasti32x4 .Lgf_poly(%rip), GF_POLY + + // Compute the first vector of tweaks: + // TWEAK0 = broadcast128(TWEAK) * [x^0, x^1, x^2, x^3] + vpmovzxbq .Lrshift_amounts(%rip), V4 + vpsrlvq V4, TWEAK0, V0 + vpclmulqdq $0x01, GF_POLY, V0, V1 + vpmovzxbq .Llshift_amounts(%rip), V4 + vpslldq $8, V0, V0 + vpsllvq V4, TWEAK0, TWEAK0 + vpternlogd $0x96, V0, V1, TWEAK0 + + // Compute the next three vectors of tweaks: + // TWEAK1 = TWEAK0 * [x^4, x^4, x^4, x^4] + // TWEAK2 = TWEAK0 * [x^8, x^8, x^8, x^8] + // TWEAK3 = TWEAK0 * [x^12, x^12, x^12, x^12] + // x^8 only needs byte-aligned shifts, so optimize accordingly. + vpsrlq $64 - 4, TWEAK0, V0 + vpsrldq $(64 - 8) / 8, TWEAK0, V2 + vpsrlq $64 - 12, TWEAK0, V4 + vpclmulqdq $0x01, GF_POLY, V0, V1 + vpclmulqdq $0x01, GF_POLY, V2, V3 + vpclmulqdq $0x01, GF_POLY, V4, V5 + vpslldq $8, V0, V0 + vpslldq $8, V4, V4 + vpsllq $4, TWEAK0, TWEAK1 + vpslldq $8 / 8, TWEAK0, TWEAK2 + vpsllq $12, TWEAK0, TWEAK3 + vpternlogd $0x96, V0, V1, TWEAK1 + vpxord V3, TWEAK2, TWEAK2 + vpternlogd $0x96, V4, V5, TWEAK3 +.endif +.endm + +// Do one step in computing the next set of tweaks using the method of just +// multiplying by x repeatedly (the same method _next_tweak uses). +.macro _tweak_step_mulx i +.if \i == 0 + .set PREV_TWEAK, TWEAK3 + .set NEXT_TWEAK, NEXT_TWEAK0 +.elseif \i == 5 + .set PREV_TWEAK, NEXT_TWEAK0 + .set NEXT_TWEAK, NEXT_TWEAK1 +.elseif \i == 10 + .set PREV_TWEAK, NEXT_TWEAK1 + .set NEXT_TWEAK, NEXT_TWEAK2 +.elseif \i == 15 + .set PREV_TWEAK, NEXT_TWEAK2 + .set NEXT_TWEAK, NEXT_TWEAK3 +.endif +.if \i >= 0 && \i < 20 && \i % 5 == 0 + vpshufd $0x13, PREV_TWEAK, V5 +.elseif \i >= 0 && \i < 20 && \i % 5 == 1 + vpaddq PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK +.elseif \i >= 0 && \i < 20 && \i % 5 == 2 + vpsrad $31, V5, V5 +.elseif \i >= 0 && \i < 20 && \i % 5 == 3 + vpand GF_POLY, V5, V5 +.elseif \i >= 0 && \i < 20 && \i % 5 == 4 + vpxor V5, NEXT_TWEAK, NEXT_TWEAK +.elseif \i == 1000 + vmovdqa NEXT_TWEAK0, TWEAK0 + vmovdqa NEXT_TWEAK1, TWEAK1 + vmovdqa NEXT_TWEAK2, TWEAK2 + vmovdqa NEXT_TWEAK3, TWEAK3 +.endif +.endm + +// Do one step in computing the next set of tweaks using the VPCLMULQDQ method +// (the same method _next_tweakvec uses for VL > 16). This means multiplying +// each tweak by x^(4*VL/16) independently. +// +// Since 4*VL/16 is a multiple of 8 when VL > 16 (which it is here), the needed +// shift amounts are byte-aligned, which allows the use of vpsrldq and vpslldq +// to do 128-bit wide shifts. The 128-bit left shift (vpslldq) saves +// instructions directly. The 128-bit right shift (vpsrldq) performs better +// than a 64-bit right shift on Intel CPUs in the context where it is used here, +// because it runs on a different execution port from the AES instructions. 
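+//
+// As a hedged reference (a Python 3 sketch, not the code generated here), the
+// shift-and-fold method can be checked against plain repeated doubling.
+// Tweaks are modeled as 128-bit ints in the XTS convention where bit i is the
+// coefficient of x^i:
+//
+//	import os
+//
+//	def clmul(a, b):               # carryless multiplication
+//	    p = 0
+//	    while b:
+//	        if b & 1:
+//	            p ^= a
+//	        a <<= 1
+//	        b >>= 1
+//	    return p
+//
+//	def mul_x_slow(t, k):          # multiply by x, k times
+//	    for _ in range(k):
+//	        carry = t >> 127
+//	        t = ((t << 1) & ((1 << 128) - 1)) ^ (0x87 * carry)
+//	    return t
+//
+//	def mul_x_fast(t, k):          # shift by k, fold the k carried-out
+//	    hi = t >> (128 - k)        # bits back in via x^128 = 0x87
+//	    return ((t << k) & ((1 << 128) - 1)) ^ clmul(hi, 0x87)
+//
+//	t = int.from_bytes(os.urandom(16), 'little')
+//	assert all(mul_x_slow(t, k) == mul_x_fast(t, k) for k in (2, 4, 8, 16))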
+.macro _tweak_step_pclmul i +.if \i == 0 + vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0 +.elseif \i == 2 + vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1 +.elseif \i == 4 + vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2 +.elseif \i == 6 + vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3 +.elseif \i == 8 + vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0 +.elseif \i == 10 + vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1 +.elseif \i == 12 + vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2 +.elseif \i == 14 + vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3 +.elseif \i == 1000 + vpslldq $(4*VL/16) / 8, TWEAK0, TWEAK0 + vpslldq $(4*VL/16) / 8, TWEAK1, TWEAK1 + vpslldq $(4*VL/16) / 8, TWEAK2, TWEAK2 + vpslldq $(4*VL/16) / 8, TWEAK3, TWEAK3 + _vpxor NEXT_TWEAK0, TWEAK0, TWEAK0 + _vpxor NEXT_TWEAK1, TWEAK1, TWEAK1 + _vpxor NEXT_TWEAK2, TWEAK2, TWEAK2 + _vpxor NEXT_TWEAK3, TWEAK3, TWEAK3 +.endif +.endm + +// _tweak_step does one step of the computation of the next set of tweaks from +// TWEAK[0-3]. To complete all steps, this is invoked with increasing values of +// \i that include at least 0 through 19, then 1000 which signals the last step. +// +// This is used to interleave the computation of the next set of tweaks with the +// AES en/decryptions, which increases performance in some cases. Clobbers V5. +.macro _tweak_step i +.if VL == 16 + _tweak_step_mulx \i +.else + _tweak_step_pclmul \i +.endif +.endm + +.macro _setup_round_keys enc + + // Select either the encryption round keys or the decryption round keys. +.if \enc + .set OFFS, 0 +.else + .set OFFS, 240 +.endif + + // Load the round key for "round 0". + _vbroadcast128 OFFS(KEY), KEY0 + + // Increment KEY to make it so that 7*16(KEY) is the last round key. + // For AES-128, increment by 3*16, resulting in the 10 round keys (not + // counting the zero-th round key which was just loaded into KEY0) being + // -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use + // 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment + // by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY). + // + // This rebasing provides two benefits. First, it makes the offset to + // any round key be in the range [-96, 112], fitting in a signed byte. + // This shortens VEX-encoded instructions that access the later round + // keys which otherwise would need 4-byte offsets. Second, it makes it + // easy to do AES-128 and AES-192 by skipping irrelevant rounds at the + // beginning. Skipping rounds at the end doesn't work as well because + // the last round needs different instructions. + // + // An alternative approach would be to roll up all the round loops. We + // don't do that because (a) it isn't compatible with caching the round + // keys in registers which we do when possible (see below), (b) we + // interleave the AES rounds with the XTS tweak computation, and (c) it + // seems unwise to rely *too* heavily on the CPU's branch predictor. + lea OFFS-16(KEY, KEYLEN64, 4), KEY + + // If all 32 SIMD registers are available, cache all the round keys. 
+.if USE_AVX512 + cmp $24, KEYLEN + jl .Laes128\@ + je .Laes192\@ + vbroadcasti32x4 -6*16(KEY), KEY1 + vbroadcasti32x4 -5*16(KEY), KEY2 +.Laes192\@: + vbroadcasti32x4 -4*16(KEY), KEY3 + vbroadcasti32x4 -3*16(KEY), KEY4 +.Laes128\@: + vbroadcasti32x4 -2*16(KEY), KEY5 + vbroadcasti32x4 -1*16(KEY), KEY6 + vbroadcasti32x4 0*16(KEY), KEY7 + vbroadcasti32x4 1*16(KEY), KEY8 + vbroadcasti32x4 2*16(KEY), KEY9 + vbroadcasti32x4 3*16(KEY), KEY10 + vbroadcasti32x4 4*16(KEY), KEY11 + vbroadcasti32x4 5*16(KEY), KEY12 + vbroadcasti32x4 6*16(KEY), KEY13 + vbroadcasti32x4 7*16(KEY), KEY14 +.endif +.endm + +// Do a single non-last round of AES encryption (if \enc==1) or decryption (if +// \enc==0) on the block(s) in \data using the round key(s) in \key. The +// register length determines the number of AES blocks en/decrypted. +.macro _vaes enc, key, data +.if \enc + vaesenc \key, \data, \data +.else + vaesdec \key, \data, \data +.endif +.endm + +// Same as _vaes, but does the last round. +.macro _vaeslast enc, key, data +.if \enc + vaesenclast \key, \data, \data +.else + vaesdeclast \key, \data, \data +.endif +.endm + +// Do a single non-last round of AES en/decryption on the block(s) in \data, +// using the same key for all block(s). The round key is loaded from the +// appropriate register or memory location for round \i. May clobber \tmp. +.macro _vaes_1x enc, i, xmm_suffix, data, tmp +.if USE_AVX512 + _vaes \enc, KEY\i\xmm_suffix, \data +.else +.ifnb \xmm_suffix + _vaes \enc, (\i-7)*16(KEY), \data +.else + _vbroadcast128 (\i-7)*16(KEY), \tmp + _vaes \enc, \tmp, \data +.endif +.endif +.endm + +// Do a single non-last round of AES en/decryption on the blocks in registers +// V0-V3, using the same key for all blocks. The round key is loaded from the +// appropriate register or memory location for round \i. In addition, does two +// steps of the computation of the next set of tweaks. May clobber V4 and V5. +.macro _vaes_4x enc, i +.if USE_AVX512 + _tweak_step (2*(\i-5)) + _vaes \enc, KEY\i, V0 + _vaes \enc, KEY\i, V1 + _tweak_step (2*(\i-5) + 1) + _vaes \enc, KEY\i, V2 + _vaes \enc, KEY\i, V3 +.else + _vbroadcast128 (\i-7)*16(KEY), V4 + _tweak_step (2*(\i-5)) + _vaes \enc, V4, V0 + _vaes \enc, V4, V1 + _tweak_step (2*(\i-5) + 1) + _vaes \enc, V4, V2 + _vaes \enc, V4, V3 +.endif +.endm + +// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt, +// then XOR with \tweak again) of the block(s) in \data. To process a single +// block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of +// length VL, use V* registers and leave \xmm_suffix empty. Clobbers \tmp. +.macro _aes_crypt enc, xmm_suffix, tweak, data, tmp + _xor3 KEY0\xmm_suffix, \tweak, \data + cmp $24, KEYLEN + jl .Laes128\@ + je .Laes192\@ + _vaes_1x \enc, 1, \xmm_suffix, \data, tmp=\tmp + _vaes_1x \enc, 2, \xmm_suffix, \data, tmp=\tmp +.Laes192\@: + _vaes_1x \enc, 3, \xmm_suffix, \data, tmp=\tmp + _vaes_1x \enc, 4, \xmm_suffix, \data, tmp=\tmp +.Laes128\@: +.irp i, 5,6,7,8,9,10,11,12,13 + _vaes_1x \enc, \i, \xmm_suffix, \data, tmp=\tmp +.endr +.if USE_AVX512 + vpxord KEY14\xmm_suffix, \tweak, \tmp +.else +.ifnb \xmm_suffix + vpxor 7*16(KEY), \tweak, \tmp +.else + _vbroadcast128 7*16(KEY), \tmp + vpxor \tweak, \tmp, \tmp +.endif +.endif + _vaeslast \enc, \tmp, \data +.endm + +.macro _aes_xts_crypt enc + _define_aliases + +.if !\enc + // When decrypting a message whose length isn't a multiple of the AES + // block length, exclude the last full block from the main loop by + // subtracting 16 from LEN. 
This is needed because ciphertext stealing
+ // decryption uses the last two tweaks in reverse order. We'll handle
+ // the last full block and the partial block specially at the end.
+ lea -16(LEN), %eax
+ test $15, LEN8
+ cmovnz %eax, LEN
+.endif
+
+ // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
+ movl 480(KEY), KEYLEN
+
+ // Set up the pointer to the round keys and cache as many as possible.
+ _setup_round_keys \enc
+
+ // Compute the first set of tweaks TWEAK[0-3].
+ _compute_first_set_of_tweaks
+
+ add $-4*VL, LEN // shorter than 'sub 4*VL' when VL=32
+ jl .Lhandle_remainder\@
+
+.Lmain_loop\@:
+ // This is the main loop, en/decrypting 4*VL bytes per iteration.
+
+ // XOR each source block with its tweak and the zero-th round key.
+.if USE_AVX512
+ vmovdqu8 0*VL(SRC), V0
+ vmovdqu8 1*VL(SRC), V1
+ vmovdqu8 2*VL(SRC), V2
+ vmovdqu8 3*VL(SRC), V3
+ vpternlogd $0x96, TWEAK0, KEY0, V0
+ vpternlogd $0x96, TWEAK1, KEY0, V1
+ vpternlogd $0x96, TWEAK2, KEY0, V2
+ vpternlogd $0x96, TWEAK3, KEY0, V3
+.else
+ vpxor 0*VL(SRC), KEY0, V0
+ vpxor 1*VL(SRC), KEY0, V1
+ vpxor 2*VL(SRC), KEY0, V2
+ vpxor 3*VL(SRC), KEY0, V3
+ vpxor TWEAK0, V0, V0
+ vpxor TWEAK1, V1, V1
+ vpxor TWEAK2, V2, V2
+ vpxor TWEAK3, V3, V3
+.endif
+ cmp $24, KEYLEN
+ jl .Laes128\@
+ je .Laes192\@
+ // Do all the AES rounds on the data blocks, interleaved with
+ // the computation of the next set of tweaks.
+ _vaes_4x \enc, 1
+ _vaes_4x \enc, 2
+.Laes192\@:
+ _vaes_4x \enc, 3
+ _vaes_4x \enc, 4
+.Laes128\@:
+.irp i, 5,6,7,8,9,10,11,12,13
+ _vaes_4x \enc, \i
+.endr
+ // Do the last AES round, then XOR the results with the tweaks again.
+ // Reduce latency by doing the XOR before the vaesenclast, utilizing the
+ // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a)
+ // (and likewise for vaesdeclast).
+.if USE_AVX512
+ _tweak_step 18
+ _tweak_step 19
+ vpxord TWEAK0, KEY14, V4
+ vpxord TWEAK1, KEY14, V5
+ _vaeslast \enc, V4, V0
+ _vaeslast \enc, V5, V1
+ vpxord TWEAK2, KEY14, V4
+ vpxord TWEAK3, KEY14, V5
+ _vaeslast \enc, V4, V2
+ _vaeslast \enc, V5, V3
+.else
+ _vbroadcast128 7*16(KEY), V4
+ _tweak_step 18 // uses V5
+ _tweak_step 19 // uses V5
+ vpxor TWEAK0, V4, V5
+ _vaeslast \enc, V5, V0
+ vpxor TWEAK1, V4, V5
+ _vaeslast \enc, V5, V1
+ vpxor TWEAK2, V4, V5
+ vpxor TWEAK3, V4, V4
+ _vaeslast \enc, V5, V2
+ _vaeslast \enc, V4, V3
+.endif
+
+ // Store the destination blocks.
+ _vmovdqu V0, 0*VL(DST)
+ _vmovdqu V1, 1*VL(DST)
+ _vmovdqu V2, 2*VL(DST)
+ _vmovdqu V3, 3*VL(DST)
+
+ // Finish computing the next set of tweaks.
+ _tweak_step 1000
+
+ sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32
+ sub $-4*VL, DST
+ add $-4*VL, LEN
+ jge .Lmain_loop\@
+
+ // Check for the uncommon case where the data length isn't a multiple of
+ // 4*VL. Handle it out-of-line in order to optimize for the common
+ // case. In the common case, just fall through to the ret.
+ test $4*VL-1, LEN8
+ jnz .Lhandle_remainder\@
+.Ldone\@:
+ // Store the next tweak back to *TWEAK to support continuation calls.
+ vmovdqu TWEAK0_XMM, (TWEAK)
+.if VL > 16
+ vzeroupper
+.endif
+ RET
+
+.Lhandle_remainder\@:
+
+ // En/decrypt any remaining full blocks, one vector at a time.
+.if VL > 16
+ add $3*VL, LEN // Undo extra sub of 4*VL, then sub VL. 
+ jl .Lvec_at_a_time_done\@ +.Lvec_at_a_time\@: + _vmovdqu (SRC), V0 + _aes_crypt \enc, , TWEAK0, V0, tmp=V1 + _vmovdqu V0, (DST) + _next_tweakvec TWEAK0, V0, V1, TWEAK0 + add $VL, SRC + add $VL, DST + sub $VL, LEN + jge .Lvec_at_a_time\@ +.Lvec_at_a_time_done\@: + add $VL-16, LEN // Undo extra sub of VL, then sub 16. +.else + add $4*VL-16, LEN // Undo extra sub of 4*VL, then sub 16. +.endif + + // En/decrypt any remaining full blocks, one at a time. + jl .Lblock_at_a_time_done\@ +.Lblock_at_a_time\@: + vmovdqu (SRC), %xmm0 + _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1 + vmovdqu %xmm0, (DST) + _next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM + add $16, SRC + add $16, DST + sub $16, LEN + jge .Lblock_at_a_time\@ +.Lblock_at_a_time_done\@: + add $16, LEN // Undo the extra sub of 16. + // Now 0 <= LEN <= 15. If LEN is zero, we're done. + jz .Ldone\@ + + // Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN. + // Do ciphertext stealing to process the last 16 + LEN bytes. + +.if \enc + // If encrypting, the main loop already encrypted the last full block to + // create the CTS intermediate ciphertext. Prepare for the rest of CTS + // by rewinding the pointers and loading the intermediate ciphertext. + sub $16, SRC + sub $16, DST + vmovdqu (DST), %xmm0 +.else + // If decrypting, the main loop didn't decrypt the last full block + // because CTS decryption uses the last two tweaks in reverse order. + // Do it now by advancing the tweak and decrypting the last full block. + _next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM + vmovdqu (SRC), %xmm0 + _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1 +.endif + +.if USE_AVX512 + // Create a mask that has the first LEN bits set. + mov $-1, %r9d + bzhi LEN, %r9d, %r9d + kmovd %r9d, %k1 + + // Swap the first LEN bytes of the en/decryption of the last full block + // with the partial block. Note that to support in-place en/decryption, + // the load from the src partial block must happen before the store to + // the dst partial block. + vmovdqa %xmm0, %xmm1 + vmovdqu8 16(SRC), %xmm0{%k1} + vmovdqu8 %xmm1, 16(DST){%k1} +.else + lea .Lcts_permute_table(%rip), %r9 + + // Load the src partial block, left-aligned. Note that to support + // in-place en/decryption, this must happen before the store to the dst + // partial block. + vmovdqu (SRC, LEN64, 1), %xmm1 + + // Shift the first LEN bytes of the en/decryption of the last full block + // to the end of a register, then store it to DST+LEN. This stores the + // dst partial block. It also writes to the second part of the dst last + // full block, but that part is overwritten later. + vpshufb (%r9, LEN64, 1), %xmm0, %xmm2 + vmovdqu %xmm2, (DST, LEN64, 1) + + // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...]. + sub LEN64, %r9 + vmovdqu 32(%r9), %xmm3 + + // Shift the src partial block to the beginning of its register. + vpshufb %xmm3, %xmm1, %xmm1 + + // Do a blend to generate the src partial block followed by the second + // part of the en/decryption of the last full block. + vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 +.endif + // En/decrypt again and store the last full block. + _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1 + vmovdqu %xmm0, (DST) + jmp .Ldone\@ +.endm + +// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, +// u8 iv[AES_BLOCK_SIZE]); +// +// Encrypt |iv| using the AES key |tweak_key| to get the first tweak. Assumes +// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX512. 
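+//
+// Functionally this is a single AES-ECB encryption of the IV with the tweak
+// key. A hedged Python sketch of the same computation, assuming the
+// third-party 'cryptography' package and a hypothetical helper name
+// first_tweak:
+//
+//	from cryptography.hazmat.primitives.ciphers import (
+//	    Cipher, algorithms, modes)
+//
+//	def first_tweak(tweak_key: bytes, iv: bytes) -> bytes:
+//	    enc = Cipher(algorithms.AES(tweak_key), modes.ECB()).encryptor()
+//	    return enc.update(iv)      # 16 bytes: the first XTS tweak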
+SYM_TYPED_FUNC_START(aes_xts_encrypt_iv) + .set TWEAK_KEY, %rdi + .set IV, %rsi + .set KEYLEN, %eax + .set KEYLEN64, %rax + + vmovdqu (IV), %xmm0 + vpxor (TWEAK_KEY), %xmm0, %xmm0 + movl 480(TWEAK_KEY), KEYLEN + lea -16(TWEAK_KEY, KEYLEN64, 4), TWEAK_KEY + cmp $24, KEYLEN + jl .Lencrypt_iv_aes128 + je .Lencrypt_iv_aes192 + vaesenc -6*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc -5*16(TWEAK_KEY), %xmm0, %xmm0 +.Lencrypt_iv_aes192: + vaesenc -4*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc -3*16(TWEAK_KEY), %xmm0, %xmm0 +.Lencrypt_iv_aes128: +.irp i, -2,-1,0,1,2,3,4,5,6 + vaesenc \i*16(TWEAK_KEY), %xmm0, %xmm0 +.endr + vaesenclast 7*16(TWEAK_KEY), %xmm0, %xmm0 + vmovdqu %xmm0, (IV) + RET +SYM_FUNC_END(aes_xts_encrypt_iv) + +// Below are the actual AES-XTS encryption and decryption functions, +// instantiated from the above macro. They all have the following prototype: +// +// void (*xts_crypt_func)(const struct crypto_aes_ctx *key, +// const u8 *src, u8 *dst, int len, +// u8 tweak[AES_BLOCK_SIZE]); +// +// |key| is the data key. |tweak| contains the next tweak; the encryption of +// the original IV with the tweak key was already done. This function supports +// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and +// |len| must be a multiple of 16 except on the last call. If |len| is a +// multiple of 16, then this function updates |tweak| to contain the next tweak. + +.set VL, 16 +.set USE_AVX512, 0 +SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx) + _aes_xts_crypt 1 +SYM_FUNC_END(aes_xts_encrypt_aesni_avx) +SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx) + _aes_xts_crypt 0 +SYM_FUNC_END(aes_xts_decrypt_aesni_avx) + +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +.set VL, 32 +.set USE_AVX512, 0 +SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2) + _aes_xts_crypt 1 +SYM_FUNC_END(aes_xts_encrypt_vaes_avx2) +SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2) + _aes_xts_crypt 0 +SYM_FUNC_END(aes_xts_decrypt_vaes_avx2) + +.set VL, 64 +.set USE_AVX512, 1 +SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx512) + _aes_xts_crypt 1 +SYM_FUNC_END(aes_xts_encrypt_vaes_avx512) +SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx512) + _aes_xts_crypt 0 +SYM_FUNC_END(aes_xts_decrypt_vaes_avx512) +#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S deleted file mode 100644 index 2402b9418cd7..000000000000 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ /dev/null @@ -1,597 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */ -/* - * AES CTR mode by8 optimization with AVX instructions. (x86_64) - * - * Copyright(c) 2014 Intel Corporation. - * - * Contact Information: - * James Guilford <james.guilford@intel.com> - * Sean Gulley <sean.m.gulley@intel.com> - * Chandramouli Narayanan <mouli@linux.intel.com> - */ -/* - * This is AES128/192/256 CTR mode optimization implementation. It requires - * the support of Intel(R) AESNI and AVX instructions. - * - * This work was inspired by the AES CTR mode optimization published - * in Intel Optimized IPSEC Cryptographic library. - * Additional information on it can be found at: - * https://github.com/intel/intel-ipsec-mb - */ - -#include <linux/linkage.h> - -#define VMOVDQ vmovdqu - -/* - * Note: the "x" prefix in these aliases means "this is an xmm register". The - * alias prefixes have no relation to XCTR where the "X" prefix means "XOR - * counter". 
- */ -#define xdata0 %xmm0 -#define xdata1 %xmm1 -#define xdata2 %xmm2 -#define xdata3 %xmm3 -#define xdata4 %xmm4 -#define xdata5 %xmm5 -#define xdata6 %xmm6 -#define xdata7 %xmm7 -#define xcounter %xmm8 // CTR mode only -#define xiv %xmm8 // XCTR mode only -#define xbyteswap %xmm9 // CTR mode only -#define xtmp %xmm9 // XCTR mode only -#define xkey0 %xmm10 -#define xkey4 %xmm11 -#define xkey8 %xmm12 -#define xkey12 %xmm13 -#define xkeyA %xmm14 -#define xkeyB %xmm15 - -#define p_in %rdi -#define p_iv %rsi -#define p_keys %rdx -#define p_out %rcx -#define num_bytes %r8 -#define counter %r9 // XCTR mode only -#define tmp %r10 -#define DDQ_DATA 0 -#define XDATA 1 -#define KEY_128 1 -#define KEY_192 2 -#define KEY_256 3 - -.section .rodata -.align 16 - -byteswap_const: - .octa 0x000102030405060708090A0B0C0D0E0F -ddq_low_msk: - .octa 0x0000000000000000FFFFFFFFFFFFFFFF -ddq_high_add_1: - .octa 0x00000000000000010000000000000000 -ddq_add_1: - .octa 0x00000000000000000000000000000001 -ddq_add_2: - .octa 0x00000000000000000000000000000002 -ddq_add_3: - .octa 0x00000000000000000000000000000003 -ddq_add_4: - .octa 0x00000000000000000000000000000004 -ddq_add_5: - .octa 0x00000000000000000000000000000005 -ddq_add_6: - .octa 0x00000000000000000000000000000006 -ddq_add_7: - .octa 0x00000000000000000000000000000007 -ddq_add_8: - .octa 0x00000000000000000000000000000008 - -.text - -/* generate a unique variable for ddq_add_x */ - -/* generate a unique variable for xmm register */ -.macro setxdata n - var_xdata = %xmm\n -.endm - -/* club the numeric 'id' to the symbol 'name' */ - -.macro club name, id -.altmacro - .if \name == XDATA - setxdata %\id - .endif -.noaltmacro -.endm - -/* - * do_aes num_in_par load_keys key_len - * This increments p_in, but not p_out - */ -.macro do_aes b, k, key_len, xctr - .set by, \b - .set load_keys, \k - .set klen, \key_len - - .if (load_keys) - vmovdqa 0*16(p_keys), xkey0 - .endif - - .if \xctr - movq counter, xtmp - .set i, 0 - .rept (by) - club XDATA, i - vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata - .set i, (i +1) - .endr - .set i, 0 - .rept (by) - club XDATA, i - vpxor xiv, var_xdata, var_xdata - .set i, (i +1) - .endr - .else - vpshufb xbyteswap, xcounter, xdata0 - .set i, 1 - .rept (by - 1) - club XDATA, i - vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata - vptest ddq_low_msk(%rip), var_xdata - jnz 1f - vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata - vpaddq ddq_high_add_1(%rip), xcounter, xcounter - 1: - vpshufb xbyteswap, var_xdata, var_xdata - .set i, (i +1) - .endr - .endif - - vmovdqa 1*16(p_keys), xkeyA - - vpxor xkey0, xdata0, xdata0 - .if \xctr - add $by, counter - .else - vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter - vptest ddq_low_msk(%rip), xcounter - jnz 1f - vpaddq ddq_high_add_1(%rip), xcounter, xcounter - 1: - .endif - - .set i, 1 - .rept (by - 1) - club XDATA, i - vpxor xkey0, var_xdata, var_xdata - .set i, (i +1) - .endr - - vmovdqa 2*16(p_keys), xkeyB - - .set i, 0 - .rept by - club XDATA, i - vaesenc xkeyA, var_xdata, var_xdata /* key 1 */ - .set i, (i +1) - .endr - - .if (klen == KEY_128) - .if (load_keys) - vmovdqa 3*16(p_keys), xkey4 - .endif - .else - vmovdqa 3*16(p_keys), xkeyA - .endif - - .set i, 0 - .rept by - club XDATA, i - vaesenc xkeyB, var_xdata, var_xdata /* key 2 */ - .set i, (i +1) - .endr - - add $(16*by), p_in - - .if (klen == KEY_128) - vmovdqa 4*16(p_keys), xkeyB - .else - .if (load_keys) - vmovdqa 4*16(p_keys), xkey4 - .endif - .endif - - .set i, 0 - .rept by - club XDATA, i - /* key 3 */ 
- .if (klen == KEY_128) - vaesenc xkey4, var_xdata, var_xdata - .else - vaesenc xkeyA, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - vmovdqa 5*16(p_keys), xkeyA - - .set i, 0 - .rept by - club XDATA, i - /* key 4 */ - .if (klen == KEY_128) - vaesenc xkeyB, var_xdata, var_xdata - .else - vaesenc xkey4, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - .if (klen == KEY_128) - .if (load_keys) - vmovdqa 6*16(p_keys), xkey8 - .endif - .else - vmovdqa 6*16(p_keys), xkeyB - .endif - - .set i, 0 - .rept by - club XDATA, i - vaesenc xkeyA, var_xdata, var_xdata /* key 5 */ - .set i, (i +1) - .endr - - vmovdqa 7*16(p_keys), xkeyA - - .set i, 0 - .rept by - club XDATA, i - /* key 6 */ - .if (klen == KEY_128) - vaesenc xkey8, var_xdata, var_xdata - .else - vaesenc xkeyB, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - .if (klen == KEY_128) - vmovdqa 8*16(p_keys), xkeyB - .else - .if (load_keys) - vmovdqa 8*16(p_keys), xkey8 - .endif - .endif - - .set i, 0 - .rept by - club XDATA, i - vaesenc xkeyA, var_xdata, var_xdata /* key 7 */ - .set i, (i +1) - .endr - - .if (klen == KEY_128) - .if (load_keys) - vmovdqa 9*16(p_keys), xkey12 - .endif - .else - vmovdqa 9*16(p_keys), xkeyA - .endif - - .set i, 0 - .rept by - club XDATA, i - /* key 8 */ - .if (klen == KEY_128) - vaesenc xkeyB, var_xdata, var_xdata - .else - vaesenc xkey8, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - vmovdqa 10*16(p_keys), xkeyB - - .set i, 0 - .rept by - club XDATA, i - /* key 9 */ - .if (klen == KEY_128) - vaesenc xkey12, var_xdata, var_xdata - .else - vaesenc xkeyA, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - .if (klen != KEY_128) - vmovdqa 11*16(p_keys), xkeyA - .endif - - .set i, 0 - .rept by - club XDATA, i - /* key 10 */ - .if (klen == KEY_128) - vaesenclast xkeyB, var_xdata, var_xdata - .else - vaesenc xkeyB, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - .if (klen != KEY_128) - .if (load_keys) - vmovdqa 12*16(p_keys), xkey12 - .endif - - .set i, 0 - .rept by - club XDATA, i - vaesenc xkeyA, var_xdata, var_xdata /* key 11 */ - .set i, (i +1) - .endr - - .if (klen == KEY_256) - vmovdqa 13*16(p_keys), xkeyA - .endif - - .set i, 0 - .rept by - club XDATA, i - .if (klen == KEY_256) - /* key 12 */ - vaesenc xkey12, var_xdata, var_xdata - .else - vaesenclast xkey12, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - .if (klen == KEY_256) - vmovdqa 14*16(p_keys), xkeyB - - .set i, 0 - .rept by - club XDATA, i - /* key 13 */ - vaesenc xkeyA, var_xdata, var_xdata - .set i, (i +1) - .endr - - .set i, 0 - .rept by - club XDATA, i - /* key 14 */ - vaesenclast xkeyB, var_xdata, var_xdata - .set i, (i +1) - .endr - .endif - .endif - - .set i, 0 - .rept (by / 2) - .set j, (i+1) - VMOVDQ (i*16 - 16*by)(p_in), xkeyA - VMOVDQ (j*16 - 16*by)(p_in), xkeyB - club XDATA, i - vpxor xkeyA, var_xdata, var_xdata - club XDATA, j - vpxor xkeyB, var_xdata, var_xdata - .set i, (i+2) - .endr - - .if (i < by) - VMOVDQ (i*16 - 16*by)(p_in), xkeyA - club XDATA, i - vpxor xkeyA, var_xdata, var_xdata - .endif - - .set i, 0 - .rept by - club XDATA, i - VMOVDQ var_xdata, i*16(p_out) - .set i, (i+1) - .endr -.endm - -.macro do_aes_load val, key_len, xctr - do_aes \val, 1, \key_len, \xctr -.endm - -.macro do_aes_noload val, key_len, xctr - do_aes \val, 0, \key_len, \xctr -.endm - -/* main body of aes ctr load */ - -.macro do_aes_ctrmain key_len, xctr - cmp $16, num_bytes - jb .Ldo_return2\xctr\key_len - - .if \xctr - shr $4, counter - vmovdqu (p_iv), xiv - .else - vmovdqa 
byteswap_const(%rip), xbyteswap - vmovdqu (p_iv), xcounter - vpshufb xbyteswap, xcounter, xcounter - .endif - - mov num_bytes, tmp - and $(7*16), tmp - jz .Lmult_of_8_blks\xctr\key_len - - /* 1 <= tmp <= 7 */ - cmp $(4*16), tmp - jg .Lgt4\xctr\key_len - je .Leq4\xctr\key_len - -.Llt4\xctr\key_len: - cmp $(2*16), tmp - jg .Leq3\xctr\key_len - je .Leq2\xctr\key_len - -.Leq1\xctr\key_len: - do_aes_load 1, \key_len, \xctr - add $(1*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Leq2\xctr\key_len: - do_aes_load 2, \key_len, \xctr - add $(2*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - - -.Leq3\xctr\key_len: - do_aes_load 3, \key_len, \xctr - add $(3*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Leq4\xctr\key_len: - do_aes_load 4, \key_len, \xctr - add $(4*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Lgt4\xctr\key_len: - cmp $(6*16), tmp - jg .Leq7\xctr\key_len - je .Leq6\xctr\key_len - -.Leq5\xctr\key_len: - do_aes_load 5, \key_len, \xctr - add $(5*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Leq6\xctr\key_len: - do_aes_load 6, \key_len, \xctr - add $(6*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Leq7\xctr\key_len: - do_aes_load 7, \key_len, \xctr - add $(7*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Lmult_of_8_blks\xctr\key_len: - .if (\key_len != KEY_128) - vmovdqa 0*16(p_keys), xkey0 - vmovdqa 4*16(p_keys), xkey4 - vmovdqa 8*16(p_keys), xkey8 - vmovdqa 12*16(p_keys), xkey12 - .else - vmovdqa 0*16(p_keys), xkey0 - vmovdqa 3*16(p_keys), xkey4 - vmovdqa 6*16(p_keys), xkey8 - vmovdqa 9*16(p_keys), xkey12 - .endif -.align 16 -.Lmain_loop2\xctr\key_len: - /* num_bytes is a multiple of 8 and >0 */ - do_aes_noload 8, \key_len, \xctr - add $(8*16), p_out - sub $(8*16), num_bytes - jne .Lmain_loop2\xctr\key_len - -.Ldo_return2\xctr\key_len: - .if !\xctr - /* return updated IV */ - vpshufb xbyteswap, xcounter, xcounter - vmovdqu xcounter, (p_iv) - .endif - RET -.endm - -/* - * routine to do AES128 CTR enc/decrypt "by8" - * XMM registers are clobbered. - * Saving/restoring must be done at a higher level - * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out, - * unsigned int num_bytes) - */ -SYM_FUNC_START(aes_ctr_enc_128_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_128 0 - -SYM_FUNC_END(aes_ctr_enc_128_avx_by8) - -/* - * routine to do AES192 CTR enc/decrypt "by8" - * XMM registers are clobbered. - * Saving/restoring must be done at a higher level - * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out, - * unsigned int num_bytes) - */ -SYM_FUNC_START(aes_ctr_enc_192_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_192 0 - -SYM_FUNC_END(aes_ctr_enc_192_avx_by8) - -/* - * routine to do AES256 CTR enc/decrypt "by8" - * XMM registers are clobbered. - * Saving/restoring must be done at a higher level - * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out, - * unsigned int num_bytes) - */ -SYM_FUNC_START(aes_ctr_enc_256_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_256 0 - -SYM_FUNC_END(aes_ctr_enc_256_avx_by8) - -/* - * routine to do AES128 XCTR enc/decrypt "by8" - * XMM registers are clobbered. 
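Two counter conventions coexist in this file. Plain CTR maintains a big-endian 128-bit counter, which is why the code byte-swaps through xbyteswap and propagates the carry from the low qword into the high one by hand (the ddq_low_msk/ddq_high_add_1 checks). XCTR, as used by HCTR2, instead XORs a little-endian block counter into the IV, so it needs neither byte swap nor carry handling. A sketch of the keystream-block inputs each path feeds into AES (function names illustrative):

def ctr_counter_blocks(iv: bytes, nblocks: int):
    """CTR: successive big-endian 128-bit increments of the IV."""
    c = int.from_bytes(iv, "big")
    for i in range(nblocks):
        yield ((c + i) % (1 << 128)).to_bytes(16, "big")

def xctr_counter_blocks(iv: bytes, blocks_done: int, nblocks: int):
    """XCTR: block i is IV XOR le128(i), with the counter starting at 1."""
    n = int.from_bytes(iv, "little")
    for i in range(1, nblocks + 1):
        yield (n ^ (blocks_done + i)).to_bytes(16, "little")

blocks_done models the counter register after the 'shr $4, counter' above: the number of 16-byte blocks already produced in earlier calls.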
- * Saving/restoring must be done at a higher level - * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys, - * u8* out, unsigned int num_bytes, unsigned int byte_ctr) - */ -SYM_FUNC_START(aes_xctr_enc_128_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_128 1 - -SYM_FUNC_END(aes_xctr_enc_128_avx_by8) - -/* - * routine to do AES192 XCTR enc/decrypt "by8" - * XMM registers are clobbered. - * Saving/restoring must be done at a higher level - * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys, - * u8* out, unsigned int num_bytes, unsigned int byte_ctr) - */ -SYM_FUNC_START(aes_xctr_enc_192_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_192 1 - -SYM_FUNC_END(aes_xctr_enc_192_avx_by8) - -/* - * routine to do AES256 XCTR enc/decrypt "by8" - * XMM registers are clobbered. - * Saving/restoring must be done at a higher level - * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys, - * u8* out, unsigned int num_bytes, unsigned int byte_ctr) - */ -SYM_FUNC_START(aes_xctr_enc_256_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_256 1 - -SYM_FUNC_END(aes_xctr_enc_256_avx_by8) diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 411d8c83e88a..b37881bb9f15 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -10,120 +10,15 @@ * Vinodh Gopal <vinodh.gopal@intel.com> * Kahraman Akdemir * - * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD - * interface for 64-bit kernels. - * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) - * Aidan O'Mahony (aidan.o.mahony@intel.com) - * Adrian Hoban <adrian.hoban@intel.com> - * James Guilford (james.guilford@intel.com) - * Gabriele Paoloni <gabriele.paoloni@intel.com> - * Tadeusz Struk (tadeusz.struk@intel.com) - * Wajdi Feghali (wajdi.k.feghali@intel.com) - * Copyright (c) 2010, Intel Corporation. + * Copyright (c) 2010, Intel Corporation. * * Ported x86_64 version to x86: * Author: Mathias Krause <minipli@googlemail.com> */ #include <linux/linkage.h> +#include <linux/objtool.h> #include <asm/frame.h> -#include <asm/nospec-branch.h> - -/* - * The following macros are used to move an (un)aligned 16 byte value to/from - * an XMM register. This can done for either FP or integer values, for FP use - * movaps (move aligned packed single) or integer use movdqa (move double quad - * aligned). It doesn't make a performance difference which instruction is used - * since Nehalem (original Core i7) was released. However, the movaps is a byte - * shorter, so that is the one we'll use for now. (same for unaligned). 
- */ -#define MOVADQ movaps -#define MOVUDQ movups - -#ifdef __x86_64__ - -# constants in mergeable sections, linker can reorder and merge -.section .rodata.cst16.POLY, "aM", @progbits, 16 -.align 16 -POLY: .octa 0xC2000000000000000000000000000001 -.section .rodata.cst16.TWOONE, "aM", @progbits, 16 -.align 16 -TWOONE: .octa 0x00000001000000000000000000000001 - -.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 -.align 16 -SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F -.section .rodata.cst16.MASK1, "aM", @progbits, 16 -.align 16 -MASK1: .octa 0x0000000000000000ffffffffffffffff -.section .rodata.cst16.MASK2, "aM", @progbits, 16 -.align 16 -MASK2: .octa 0xffffffffffffffff0000000000000000 -.section .rodata.cst16.ONE, "aM", @progbits, 16 -.align 16 -ONE: .octa 0x00000000000000000000000000000001 -.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16 -.align 16 -F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 -.section .rodata.cst16.dec, "aM", @progbits, 16 -.align 16 -dec: .octa 0x1 -.section .rodata.cst16.enc, "aM", @progbits, 16 -.align 16 -enc: .octa 0x2 - -# order of these constants should not change. -# more specifically, ALL_F should follow SHIFT_MASK, -# and zero should follow ALL_F -.section .rodata, "a", @progbits -.align 16 -SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 -ALL_F: .octa 0xffffffffffffffffffffffffffffffff - .octa 0x00000000000000000000000000000000 - -.text - - -#define STACK_OFFSET 8*3 - -#define AadHash 16*0 -#define AadLen 16*1 -#define InLen (16*1)+8 -#define PBlockEncKey 16*2 -#define OrigIV 16*3 -#define CurCount 16*4 -#define PBlockLen 16*5 -#define HashKey 16*6 // store HashKey <<1 mod poly here -#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here -#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here -#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here -#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64 - // bits of HashKey <<1 mod poly here - //(for Karatsuba purposes) -#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64 - // bits of HashKey^2 <<1 mod poly here - // (for Karatsuba purposes) -#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64 - // bits of HashKey^3 <<1 mod poly here - // (for Karatsuba purposes) -#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64 - // bits of HashKey^4 <<1 mod poly here - // (for Karatsuba purposes) - -#define arg1 rdi -#define arg2 rsi -#define arg3 rdx -#define arg4 rcx -#define arg5 r8 -#define arg6 r9 -#define arg7 STACK_OFFSET+8(%rsp) -#define arg8 STACK_OFFSET+16(%rsp) -#define arg9 STACK_OFFSET+24(%rsp) -#define arg10 STACK_OFFSET+32(%rsp) -#define arg11 STACK_OFFSET+40(%rsp) -#define keysize 2*15*16(%arg1) -#endif - #define STATE1 %xmm0 #define STATE2 %xmm4 @@ -170,1587 +65,6 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff #define TKEYP T1 #endif -.macro FUNC_SAVE - push %r12 - push %r13 - push %r14 -# -# states of %xmm registers %xmm6:%xmm15 not saved -# all %xmm registers are clobbered -# -.endm - - -.macro FUNC_RESTORE - pop %r14 - pop %r13 - pop %r12 -.endm - -# Precompute hashkeys. -# Input: Hash subkey. -# Output: HashKeys stored in gcm_context_data. Only needs to be called -# once per key. -# clobbers r12, and tmp xmm registers. 
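What the PRECOMPUTE macro below derives from the raw hash subkey is HashKey<<1 mod poly in the bit-reflected representation: byte-swap the subkey, shift the 128-bit value left by one, and fold the carried-out bit back in via the POLY constant. A Python model of the net effect (the macro does the conditional reduction branch-free with pcmpeqd/pand; the byte swap is folded into the big-endian load here; name illustrative):

def hashkey_shl1_mod_poly(subkey: bytes) -> int:
    """HashKey<<1 mod poly, as stored in the HashKey slot."""
    v = int.from_bytes(subkey, "big")   # pshufb SHUF_MASK == big-endian view
    carry = v >> 127                    # bit about to shift out
    v = (v << 1) & ((1 << 128) - 1)
    if carry:
        v ^= (0xC2 << 120) | 1          # POLY: 0xC2000000...0001
    return v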
-.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7 - mov \SUBKEY, %r12 - movdqu (%r12), \TMP3 - movdqa SHUF_MASK(%rip), \TMP2 - pshufb \TMP2, \TMP3 - - # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) - - movdqa \TMP3, \TMP2 - psllq $1, \TMP3 - psrlq $63, \TMP2 - movdqa \TMP2, \TMP1 - pslldq $8, \TMP2 - psrldq $8, \TMP1 - por \TMP2, \TMP3 - - # reduce HashKey<<1 - - pshufd $0x24, \TMP1, \TMP2 - pcmpeqd TWOONE(%rip), \TMP2 - pand POLY(%rip), \TMP2 - pxor \TMP2, \TMP3 - movdqu \TMP3, HashKey(%arg2) - - movdqa \TMP3, \TMP5 - pshufd $78, \TMP3, \TMP1 - pxor \TMP3, \TMP1 - movdqu \TMP1, HashKey_k(%arg2) - - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^2<<1 (mod poly) - movdqu \TMP5, HashKey_2(%arg2) -# HashKey_2 = HashKey^2<<1 (mod poly) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqu \TMP1, HashKey_2_k(%arg2) - - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^3<<1 (mod poly) - movdqu \TMP5, HashKey_3(%arg2) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqu \TMP1, HashKey_3_k(%arg2) - - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^3<<1 (mod poly) - movdqu \TMP5, HashKey_4(%arg2) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqu \TMP1, HashKey_4_k(%arg2) -.endm - -# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding. -# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13 -.macro GCM_INIT Iv SUBKEY AAD AADLEN - mov \AADLEN, %r11 - mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length - xor %r11d, %r11d - mov %r11, InLen(%arg2) # ctx_data.in_length = 0 - mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0 - mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0 - mov \Iv, %rax - movdqu (%rax), %xmm0 - movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv - - movdqa SHUF_MASK(%rip), %xmm2 - pshufb %xmm2, %xmm0 - movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv - - PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7 - movdqu HashKey(%arg2), %xmm13 - - CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \ - %xmm4, %xmm5, %xmm6 -.endm - -# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context -# struct has been initialized by GCM_INIT. 
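The HashKey_k/HashKey_2_k/... values stored by PRECOMPUTE above are the XOR of each key power's 64-bit halves. That is the precomputed half of Karatsuba: a 128x128 carry-less product then costs three pclmulqdq instead of four. Sketched in Python, with clmul64 modeling one pclmulqdq:

def clmul64(a: int, b: int) -> int:
    """Carry-less 64x64 -> 128-bit multiply (one pclmulqdq)."""
    r = 0
    for i in range(64):
        if (b >> i) & 1:
            r ^= a << i
    return r

def clmul128_karatsuba(a: int, b: int) -> int:
    """256-bit carry-less product of two 128-bit values via Karatsuba."""
    a1, a0 = a >> 64, a & ((1 << 64) - 1)
    b1, b0 = b >> 64, b & ((1 << 64) - 1)
    hi, lo = clmul64(a1, b1), clmul64(a0, b0)
    mid = clmul64(a1 ^ a0, b1 ^ b0) ^ hi ^ lo
    return (hi << 128) ^ (mid << 64) ^ lo

Keeping b1^b0 precomputed per hash key is what the *_k slots buy: one pxor saved in every multiply.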
-# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK -# Clobbers rax, r10-r13, and xmm0-xmm15 -.macro GCM_ENC_DEC operation - movdqu AadHash(%arg2), %xmm8 - movdqu HashKey(%arg2), %xmm13 - add %arg5, InLen(%arg2) - - xor %r11d, %r11d # initialise the data pointer offset as zero - PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation - - sub %r11, %arg5 # sub partial block data used - mov %arg5, %r13 # save the number of bytes - - and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) - mov %r13, %r12 - # Encrypt/Decrypt first few blocks - - and $(3<<4), %r12 - jz .L_initial_num_blocks_is_0_\@ - cmp $(2<<4), %r12 - jb .L_initial_num_blocks_is_1_\@ - je .L_initial_num_blocks_is_2_\@ -.L_initial_num_blocks_is_3_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation - sub $48, %r13 - jmp .L_initial_blocks_\@ -.L_initial_num_blocks_is_2_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation - sub $32, %r13 - jmp .L_initial_blocks_\@ -.L_initial_num_blocks_is_1_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation - sub $16, %r13 - jmp .L_initial_blocks_\@ -.L_initial_num_blocks_is_0_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation -.L_initial_blocks_\@: - - # Main loop - Encrypt/Decrypt remaining blocks - - test %r13, %r13 - je .L_zero_cipher_left_\@ - sub $64, %r13 - je .L_four_cipher_left_\@ -.L_crypt_by_4_\@: - GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \ - %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \ - %xmm7, %xmm8, enc - add $64, %r11 - sub $64, %r13 - jne .L_crypt_by_4_\@ -.L_four_cipher_left_\@: - GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ -%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 -.L_zero_cipher_left_\@: - movdqu %xmm8, AadHash(%arg2) - movdqu %xmm0, CurCount(%arg2) - - mov %arg5, %r13 - and $15, %r13 # %r13 = arg5 (mod 16) - je .L_multiple_of_16_bytes_\@ - - mov %r13, PBlockLen(%arg2) - - # Handle the last <16 Byte block separately - paddd ONE(%rip), %xmm0 # INCR CNT to get Yn - movdqu %xmm0, CurCount(%arg2) - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10, %xmm0 - - ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) - movdqu %xmm0, PBlockEncKey(%arg2) - - cmp $16, %arg5 - jge .L_large_enough_update_\@ - - lea (%arg4,%r11,1), %r10 - mov %r13, %r12 - READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 - jmp .L_data_read_\@ - -.L_large_enough_update_\@: - sub $16, %r11 - add %r13, %r11 - - # receive the last <16 Byte block - movdqu (%arg4, %r11, 1), %xmm1 - - sub %r13, %r11 - add $16, %r11 - - lea SHIFT_MASK+16(%rip), %r12 - # adjust the shuffle mask pointer to be able to shift 16-r13 bytes - # (r13 is the number of bytes in plaintext mod 16) - sub %r13, %r12 - # get the appropriate shuffle mask - movdqu (%r12), %xmm2 - # shift right 16-r13 bytes - pshufb %xmm2, %xmm1 - -.L_data_read_\@: - lea ALL_F+16(%rip), %r12 - sub %r13, %r12 - -.ifc \operation, dec - movdqa %xmm1, %xmm2 -.endif - pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn) - movdqu (%r12), %xmm1 - # get the appropriate mask to mask out top 16-r13 bytes of xmm0 - pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 -.ifc \operation, dec - pand %xmm1, %xmm2 - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10 
,%xmm2 - - pxor %xmm2, %xmm8 -.else - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10,%xmm0 - - pxor %xmm0, %xmm8 -.endif - - movdqu %xmm8, AadHash(%arg2) -.ifc \operation, enc - # GHASH computation for the last <16 byte block - movdqa SHUF_MASK(%rip), %xmm10 - # shuffle xmm0 back to output as ciphertext - pshufb %xmm10, %xmm0 -.endif - - # Output %r13 bytes - movq %xmm0, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left_\@ - mov %rax, (%arg3 , %r11, 1) - add $8, %r11 - psrldq $8, %xmm0 - movq %xmm0, %rax - sub $8, %r13 -.L_less_than_8_bytes_left_\@: - mov %al, (%arg3, %r11, 1) - add $1, %r11 - shr $8, %rax - sub $1, %r13 - jne .L_less_than_8_bytes_left_\@ -.L_multiple_of_16_bytes_\@: -.endm - -# GCM_COMPLETE Finishes update of tag of last partial block -# Output: Authorization Tag (AUTH_TAG) -# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 -.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN - movdqu AadHash(%arg2), %xmm8 - movdqu HashKey(%arg2), %xmm13 - - mov PBlockLen(%arg2), %r12 - - test %r12, %r12 - je .L_partial_done\@ - - GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 - -.L_partial_done\@: - mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes) - shl $3, %r12 # convert into number of bits - movd %r12d, %xmm15 # len(A) in %xmm15 - mov InLen(%arg2), %r12 - shl $3, %r12 # len(C) in bits (*128) - movq %r12, %xmm1 - - pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 - pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) - pxor %xmm15, %xmm8 - GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 - # final GHASH computation - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10, %xmm8 - - movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0 - ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) - pxor %xmm8, %xmm0 -.L_return_T_\@: - mov \AUTHTAG, %r10 # %r10 = authTag - mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len - cmp $16, %r11 - je .L_T_16_\@ - cmp $8, %r11 - jl .L_T_4_\@ -.L_T_8_\@: - movq %xmm0, %rax - mov %rax, (%r10) - add $8, %r10 - sub $8, %r11 - psrldq $8, %xmm0 - test %r11, %r11 - je .L_return_T_done_\@ -.L_T_4_\@: - movd %xmm0, %eax - mov %eax, (%r10) - add $4, %r10 - sub $4, %r11 - psrldq $4, %xmm0 - test %r11, %r11 - je .L_return_T_done_\@ -.L_T_123_\@: - movd %xmm0, %eax - cmp $2, %r11 - jl .L_T_1_\@ - mov %ax, (%r10) - cmp $2, %r11 - je .L_return_T_done_\@ - add $2, %r10 - sar $16, %eax -.L_T_1_\@: - mov %al, (%r10) - jmp .L_return_T_done_\@ -.L_T_16_\@: - movdqu %xmm0, (%r10) -.L_return_T_done_\@: -.endm - -#ifdef __x86_64__ -/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -* -* -* Input: A and B (128-bits each, bit-reflected) -* Output: C = A*B*x mod poly, (i.e. >>1 ) -* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input -* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 
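A bit-serial reference for the multiply this banner describes, together with the length-block finalization GCM_COMPLETE performs above, in Python. Blocks are modeled as big-endian 128-bit integers and h is the spec's plain H = E_K(0^128); the assembly instead stores H<<1 mod poly to streamline the pclmulqdq reduction (names illustrative):

def gf128_mul(x: int, y: int) -> int:
    """GHASH multiply per the GCM spec; the 0xE1 constant and right shifts
    encode GCM's reflected bit order."""
    R = 0xE1 << 120
    z, v = 0, x
    for i in range(128):
        if (y >> (127 - i)) & 1:
            z ^= v
        v = (v >> 1) ^ (R if v & 1 else 0)
    return z

def gcm_tag(ghash: int, h: int, aad_len: int, msg_len: int,
            ek_y0: bytes, tag_len: int) -> bytes:
    """GCM_COMPLETE: fold in len(A)||len(C) in bits, one more multiply by H,
    XOR with E(K, Y0), truncate to 16, 12 or 8 bytes."""
    lens = ((aad_len * 8) << 64) | (msg_len * 8)
    s = gf128_mul(ghash ^ lens, h)
    return bytes(a ^ b for a, b in zip(s.to_bytes(16, "big"), ek_y0))[:tag_len]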
-* -*/ -.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 - movdqa \GH, \TMP1 - pshufd $78, \GH, \TMP2 - pshufd $78, \HK, \TMP3 - pxor \GH, \TMP2 # TMP2 = a1+a0 - pxor \HK, \TMP3 # TMP3 = b1+b0 - pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \HK, \GH # GH = a0*b0 - pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) - pxor \GH, \TMP2 - pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) - movdqa \TMP2, \TMP3 - pslldq $8, \TMP3 # left shift TMP3 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP3, \GH - pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK - - # first phase of the reduction - - movdqa \GH, \TMP2 - movdqa \GH, \TMP3 - movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 - # in in order to perform - # independent shifts - pslld $31, \TMP2 # packed right shift <<31 - pslld $30, \TMP3 # packed right shift <<30 - pslld $25, \TMP4 # packed right shift <<25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP5 - psrldq $4, \TMP5 # right shift TMP5 1 DW - pslldq $12, \TMP2 # left shift TMP2 3 DWs - pxor \TMP2, \GH - - # second phase of the reduction - - movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 - # in in order to perform - # independent shifts - movdqa \GH,\TMP3 - movdqa \GH,\TMP4 - psrld $1,\TMP2 # packed left shift >>1 - psrld $2,\TMP3 # packed left shift >>2 - psrld $7,\TMP4 # packed left shift >>7 - pxor \TMP3,\TMP2 # xor the shifted versions - pxor \TMP4,\TMP2 - pxor \TMP5, \TMP2 - pxor \TMP2, \GH - pxor \TMP1, \GH # result is in TMP1 -.endm - -# Reads DLEN bytes starting at DPTR and stores in XMMDst -# where 0 < DLEN < 16 -# Clobbers %rax, DLEN and XMM1 -.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst - cmp $8, \DLEN - jl .L_read_lt8_\@ - mov (\DPTR), %rax - movq %rax, \XMMDst - sub $8, \DLEN - jz .L_done_read_partial_block_\@ - xor %eax, %eax -.L_read_next_byte_\@: - shl $8, %rax - mov 7(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_\@ - movq %rax, \XMM1 - pslldq $8, \XMM1 - por \XMM1, \XMMDst - jmp .L_done_read_partial_block_\@ -.L_read_lt8_\@: - xor %eax, %eax -.L_read_next_byte_lt8_\@: - shl $8, %rax - mov -1(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_lt8_\@ - movq %rax, \XMMDst -.L_done_read_partial_block_\@: -.endm - -# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. -# clobbers r10-11, xmm14 -.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \ - TMP6 TMP7 - MOVADQ SHUF_MASK(%rip), %xmm14 - mov \AAD, %r10 # %r10 = AAD - mov \AADLEN, %r11 # %r11 = aadLen - pxor \TMP7, \TMP7 - pxor \TMP6, \TMP6 - - cmp $16, %r11 - jl .L_get_AAD_rest\@ -.L_get_AAD_blocks\@: - movdqu (%r10), \TMP7 - pshufb %xmm14, \TMP7 # byte-reflect the AAD data - pxor \TMP7, \TMP6 - GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 - add $16, %r10 - sub $16, %r11 - cmp $16, %r11 - jge .L_get_AAD_blocks\@ - - movdqu \TMP6, \TMP7 - - /* read the last <16B of AAD */ -.L_get_AAD_rest\@: - test %r11, %r11 - je .L_get_AAD_done\@ - - READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 - pshufb %xmm14, \TMP7 # byte-reflect the AAD data - pxor \TMP6, \TMP7 - GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 - movdqu \TMP7, \TMP6 - -.L_get_AAD_done\@: - movdqu \TMP6, AadHash(%arg2) -.endm - -# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks -# between update calls. 
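READ_PARTIAL_BLOCK above is careful never to read past the end of the buffer: one 8-byte load when at least a qword is available, then a descending byte-at-a-time loop for the remainder. Its effect modeled in Python (name illustrative):

def read_partial_block(buf: bytes) -> int:
    """Load 1..15 bytes as the low bytes of a 16-byte little-endian value
    without touching memory past len(buf)."""
    n = len(buf)
    assert 1 <= n <= 15
    if n < 8:
        val = 0
        for byte in reversed(buf):           # the .L_read_next_byte_lt8 loop
            val = (val << 8) | byte
        return val
    val = int.from_bytes(buf[:8], "little")  # single qword load
    hi = 0
    for byte in reversed(buf[8:]):           # remaining 0..7 bytes, high to low
        hi = (hi << 8) | byte
    return val | (hi << 64)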
-# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK -# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context -# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 -.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ - AAD_HASH operation - mov PBlockLen(%arg2), %r13 - test %r13, %r13 - je .L_partial_block_done_\@ # Leave Macro if no partial blocks - # Read in input data without over reading - cmp $16, \PLAIN_CYPH_LEN - jl .L_fewer_than_16_bytes_\@ - movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm - jmp .L_data_read_\@ - -.L_fewer_than_16_bytes_\@: - lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 - mov \PLAIN_CYPH_LEN, %r12 - READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1 - - mov PBlockLen(%arg2), %r13 - -.L_data_read_\@: # Finished reading in data - - movdqu PBlockEncKey(%arg2), %xmm9 - movdqu HashKey(%arg2), %xmm13 - - lea SHIFT_MASK(%rip), %r12 - - # adjust the shuffle mask pointer to be able to shift r13 bytes - # r16-r13 is the number of bytes in plaintext mod 16) - add %r13, %r12 - movdqu (%r12), %xmm2 # get the appropriate shuffle mask - pshufb %xmm2, %xmm9 # shift right r13 bytes - -.ifc \operation, dec - movdqa %xmm1, %xmm3 - pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_1_\@ - sub %r10, %r12 -.L_no_extra_mask_1_\@: - - movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9 - - pand %xmm1, %xmm3 - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10, %xmm3 - pshufb %xmm2, %xmm3 - pxor %xmm3, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_1_\@ - - # GHASH computation for the last <16 Byte block - GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %eax, %eax - - mov %rax, PBlockLen(%arg2) - jmp .L_dec_done_\@ -.L_partial_incomplete_1_\@: - add \PLAIN_CYPH_LEN, PBlockLen(%arg2) -.L_dec_done_\@: - movdqu \AAD_HASH, AadHash(%arg2) -.else - pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_2_\@ - sub %r10, %r12 -.L_no_extra_mask_2_\@: - - movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - pand %xmm1, %xmm9 - - movdqa SHUF_MASK(%rip), %xmm1 - pshufb %xmm1, %xmm9 - pshufb %xmm2, %xmm9 - pxor %xmm9, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_2_\@ - - # GHASH computation for the last <16 Byte block - GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %eax, %eax - - mov %rax, PBlockLen(%arg2) - jmp .L_encode_done_\@ -.L_partial_incomplete_2_\@: - add \PLAIN_CYPH_LEN, PBlockLen(%arg2) -.L_encode_done_\@: - movdqu \AAD_HASH, AadHash(%arg2) - - movdqa SHUF_MASK(%rip), %xmm10 - # shuffle xmm9 back to output as ciphertext - pshufb %xmm10, %xmm9 - pshufb %xmm2, %xmm9 -.endif - # output encrypted Bytes - test %r10, %r10 - jl .L_partial_fill_\@ - mov %r13, %r12 - mov $16, %r13 - # Set r13 to be the number of bytes to write out - sub %r12, %r13 - jmp .L_count_set_\@ -.L_partial_fill_\@: - mov \PLAIN_CYPH_LEN, %r13 -.L_count_set_\@: - movdqa 
%xmm9, %xmm0 - movq %xmm0, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left_\@ - - mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $8, \DATA_OFFSET - psrldq $8, %xmm0 - movq %xmm0, %rax - sub $8, %r13 -.L_less_than_8_bytes_left_\@: - movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $1, \DATA_OFFSET - shr $8, %rax - sub $1, %r13 - jne .L_less_than_8_bytes_left_\@ -.L_partial_block_done_\@: -.endm # PARTIAL_BLOCK - -/* -* if a = number of total plaintext bytes -* b = floor(a/16) -* num_initial_blocks = b mod 4 -* encrypt the initial num_initial_blocks blocks and apply ghash on -* the ciphertext -* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers -* are clobbered -* arg1, %arg2, %arg3 are used as a pointer only, not modified -*/ - - -.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ - XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation - MOVADQ SHUF_MASK(%rip), %xmm14 - - movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0 - - # start AES for num_initial_blocks blocks - - movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0 - -.if (\i == 5) || (\i == 6) || (\i == 7) - - MOVADQ ONE(%RIP),\TMP1 - MOVADQ 0(%arg1),\TMP2 -.irpc index, \i_seq - paddd \TMP1, \XMM0 # INCR Y0 -.ifc \operation, dec - movdqa \XMM0, %xmm\index -.else - MOVADQ \XMM0, %xmm\index -.endif - pshufb %xmm14, %xmm\index # perform a 16 byte swap - pxor \TMP2, %xmm\index -.endr - lea 0x10(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - add $5,%eax # 128->9, 192->11, 256->13 - -.Laes_loop_initial_\@: - MOVADQ (%r10),\TMP1 -.irpc index, \i_seq - aesenc \TMP1, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_initial_\@ - - MOVADQ (%r10), \TMP1 -.irpc index, \i_seq - aesenclast \TMP1, %xmm\index # Last Round -.endr -.irpc index, \i_seq - movdqu (%arg4 , %r11, 1), \TMP1 - pxor \TMP1, %xmm\index - movdqu %xmm\index, (%arg3 , %r11, 1) - # write back plaintext/ciphertext for num_initial_blocks - add $16, %r11 - -.ifc \operation, dec - movdqa \TMP1, %xmm\index -.endif - pshufb %xmm14, %xmm\index - - # prepare plaintext/ciphertext for GHASH computation -.endr -.endif - - # apply GHASH on num_initial_blocks blocks - -.if \i == 5 - pxor %xmm5, %xmm6 - GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm6, %xmm7 - GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 -.elseif \i == 6 - pxor %xmm6, %xmm7 - GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 -.elseif \i == 7 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 -.endif - cmp $64, %r13 - jl .L_initial_blocks_done\@ - # no need for precomputed values -/* -* -* Precomputations for HashKey parallel with encryption of first 4 blocks. 
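The 'shr $2; add $5' arithmetic in .Laes_loop_initial_ above turns the key length in bytes (read from the same offset-480 field the XTS code at the top of this diff uses) into the number of aesenc rounds before the closing aesenclast:

def aesenc_rounds(key_len_bytes: int) -> int:
    """16 -> 9, 24 -> 11, 32 -> 13: AES-128/192/256 run 10/12/14 rounds in
    total, the last being aesenclast rather than aesenc."""
    assert key_len_bytes in (16, 24, 32)
    return (key_len_bytes >> 2) + 5

The 'sub $4' variant used in the parallel loops yields the count of extra looped rounds beyond the nine that are unrolled (0, 2 or 4).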
-* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i -*/ - MOVADQ ONE(%RIP),\TMP1 - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM1 - pshufb %xmm14, \XMM1 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM2 - pshufb %xmm14, \XMM2 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM3 - pshufb %xmm14, \XMM3 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM4 - pshufb %xmm14, \XMM4 # perform a 16 byte swap - - MOVADQ 0(%arg1),\TMP1 - pxor \TMP1, \XMM1 - pxor \TMP1, \XMM2 - pxor \TMP1, \XMM3 - pxor \TMP1, \XMM4 -.irpc index, 1234 # do 4 rounds - movaps 0x10*\index(%arg1), \TMP1 - aesenc \TMP1, \XMM1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 -.endr -.irpc index, 56789 # do next 5 rounds - movaps 0x10*\index(%arg1), \TMP1 - aesenc \TMP1, \XMM1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 -.endr - lea 0xa0(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - sub $4,%eax # 128->0, 192->2, 256->4 - jz .Laes_loop_pre_done\@ - -.Laes_loop_pre_\@: - MOVADQ (%r10),\TMP2 -.irpc index, 1234 - aesenc \TMP2, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_pre_\@ - -.Laes_loop_pre_done\@: - MOVADQ (%r10), \TMP2 - aesenclast \TMP2, \XMM1 - aesenclast \TMP2, \XMM2 - aesenclast \TMP2, \XMM3 - aesenclast \TMP2, \XMM4 - movdqu 16*0(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM1 -.ifc \operation, dec - movdqu \XMM1, 16*0(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM1 -.endif - movdqu 16*1(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM2 -.ifc \operation, dec - movdqu \XMM2, 16*1(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM2 -.endif - movdqu 16*2(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM3 -.ifc \operation, dec - movdqu \XMM3, 16*2(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM3 -.endif - movdqu 16*3(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM4 -.ifc \operation, dec - movdqu \XMM4, 16*3(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM4 -.else - movdqu \XMM1, 16*0(%arg3 , %r11 , 1) - movdqu \XMM2, 16*1(%arg3 , %r11 , 1) - movdqu \XMM3, 16*2(%arg3 , %r11 , 1) - movdqu \XMM4, 16*3(%arg3 , %r11 , 1) -.endif - - add $64, %r11 - pshufb %xmm14, \XMM1 # perform a 16 byte swap - pxor \XMMDst, \XMM1 -# combine GHASHed value with the corresponding ciphertext - pshufb %xmm14, \XMM2 # perform a 16 byte swap - pshufb %xmm14, \XMM3 # perform a 16 byte swap - pshufb %xmm14, \XMM4 # perform a 16 byte swap - -.L_initial_blocks_done\@: - -.endm - -/* -* encrypt 4 blocks at a time -* ghash the 4 previously encrypted ciphertext blocks -* arg1, %arg3, %arg4 are used as pointers only, not modified -* %r11 is the data offset value -*/ -.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \ -TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation - - movdqa \XMM1, \XMM5 - movdqa \XMM2, \XMM6 - movdqa \XMM3, \XMM7 - movdqa \XMM4, \XMM8 - - movdqa SHUF_MASK(%rip), %xmm15 - # multiply TMP5 * HashKey using karatsuba - - movdqa \XMM5, \TMP4 - pshufd $78, \XMM5, \TMP6 - pxor \XMM5, \TMP6 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqu HashKey_4(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 - movdqa \XMM0, \XMM1 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM2 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM3 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM4 - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - 
pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor (%arg1), \XMM1 - pxor (%arg1), \XMM2 - pxor (%arg1), \XMM3 - pxor (%arg1), \XMM4 - movdqu HashKey_4_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) - movaps 0x10(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movaps 0x20(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 2 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movdqa \XMM6, \TMP1 - pshufd $78, \XMM6, \TMP2 - pxor \XMM6, \TMP2 - movdqu HashKey_3(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 - movaps 0x30(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 3 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 - movaps 0x40(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 4 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - movdqu HashKey_3_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x50(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 5 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM6, \XMM5 - pxor \TMP2, \TMP6 - movdqa \XMM7, \TMP1 - pshufd $78, \XMM7, \TMP2 - pxor \XMM7, \TMP2 - movdqu HashKey_2(%arg2), \TMP5 - - # Multiply TMP5 * HashKey using karatsuba - - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x60(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 6 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 - movaps 0x70(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 7 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - movdqu HashKey_2_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x80(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 8 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM7, \XMM5 - pxor \TMP2, \TMP6 - - # Multiply XMM8 * HashKey - # XMM8 and TMP5 hold the values for the two operands - - movdqa \XMM8, \TMP1 - pshufd $78, \XMM8, \TMP2 - pxor \XMM8, \TMP2 - movdqu HashKey(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x90(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 9 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 - lea 0xa0(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - sub $4,%eax # 128->0, 192->2, 256->4 - jz .Laes_loop_par_enc_done\@ - -.Laes_loop_par_enc\@: - MOVADQ (%r10),\TMP3 -.irpc index, 1234 - aesenc \TMP3, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_par_enc\@ - -.Laes_loop_par_enc_done\@: - MOVADQ (%r10), \TMP3 - aesenclast \TMP3, \XMM1 # Round 10 - aesenclast \TMP3, \XMM2 - aesenclast \TMP3, \XMM3 - aesenclast \TMP3, \XMM4 - movdqu HashKey_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movdqu (%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK - movdqu 16(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK - movdqu 32(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK - movdqu 48(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK - movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer - movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer - 
movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer - movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor \TMP4, \TMP1 - pxor \XMM8, \XMM5 - pxor \TMP6, \TMP2 - pxor \TMP1, \TMP2 - pxor \XMM5, \TMP2 - movdqa \TMP2, \TMP3 - pslldq $8, \TMP3 # left shift TMP3 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP3, \XMM5 - pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 - - # first phase of reduction - - movdqa \XMM5, \TMP2 - movdqa \XMM5, \TMP3 - movdqa \XMM5, \TMP4 -# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently - pslld $31, \TMP2 # packed right shift << 31 - pslld $30, \TMP3 # packed right shift << 30 - pslld $25, \TMP4 # packed right shift << 25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP5 - psrldq $4, \TMP5 # right shift T5 1 DW - pslldq $12, \TMP2 # left shift T2 3 DWs - pxor \TMP2, \XMM5 - - # second phase of reduction - - movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 - movdqa \XMM5,\TMP3 - movdqa \XMM5,\TMP4 - psrld $1, \TMP2 # packed left shift >>1 - psrld $2, \TMP3 # packed left shift >>2 - psrld $7, \TMP4 # packed left shift >>7 - pxor \TMP3,\TMP2 # xor the shifted versions - pxor \TMP4,\TMP2 - pxor \TMP5, \TMP2 - pxor \TMP2, \XMM5 - pxor \TMP1, \XMM5 # result is in TMP1 - - pxor \XMM5, \XMM1 -.endm - -/* -* decrypt 4 blocks at a time -* ghash the 4 previously decrypted ciphertext blocks -* arg1, %arg3, %arg4 are used as pointers only, not modified -* %r11 is the data offset value -*/ -.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \ -TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation - - movdqa \XMM1, \XMM5 - movdqa \XMM2, \XMM6 - movdqa \XMM3, \XMM7 - movdqa \XMM4, \XMM8 - - movdqa SHUF_MASK(%rip), %xmm15 - # multiply TMP5 * HashKey using karatsuba - - movdqa \XMM5, \TMP4 - pshufd $78, \XMM5, \TMP6 - pxor \XMM5, \TMP6 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqu HashKey_4(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 - movdqa \XMM0, \XMM1 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM2 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM3 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM4 - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor (%arg1), \XMM1 - pxor (%arg1), \XMM2 - pxor (%arg1), \XMM3 - pxor (%arg1), \XMM4 - movdqu HashKey_4_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) - movaps 0x10(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movaps 0x20(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 2 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movdqa \XMM6, \TMP1 - pshufd $78, \XMM6, \TMP2 - pxor \XMM6, \TMP2 - movdqu HashKey_3(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 - movaps 0x30(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 3 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 - movaps 0x40(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 4 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - 
movdqu HashKey_3_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x50(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 5 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM6, \XMM5 - pxor \TMP2, \TMP6 - movdqa \XMM7, \TMP1 - pshufd $78, \XMM7, \TMP2 - pxor \XMM7, \TMP2 - movdqu HashKey_2(%arg2), \TMP5 - - # Multiply TMP5 * HashKey using karatsuba - - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x60(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 6 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 - movaps 0x70(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 7 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - movdqu HashKey_2_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x80(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 8 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM7, \XMM5 - pxor \TMP2, \TMP6 - - # Multiply XMM8 * HashKey - # XMM8 and TMP5 hold the values for the two operands - - movdqa \XMM8, \TMP1 - pshufd $78, \XMM8, \TMP2 - pxor \XMM8, \TMP2 - movdqu HashKey(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x90(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 9 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 - lea 0xa0(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - sub $4,%eax # 128->0, 192->2, 256->4 - jz .Laes_loop_par_dec_done\@ - -.Laes_loop_par_dec\@: - MOVADQ (%r10),\TMP3 -.irpc index, 1234 - aesenc \TMP3, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_par_dec\@ - -.Laes_loop_par_dec_done\@: - MOVADQ (%r10), \TMP3 - aesenclast \TMP3, \XMM1 # last round - aesenclast \TMP3, \XMM2 - aesenclast \TMP3, \XMM3 - aesenclast \TMP3, \XMM4 - movdqu HashKey_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movdqu (%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK - movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM1 - movdqu 16(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK - movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM2 - movdqu 32(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK - movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM3 - movdqu 48(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK - movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM4 - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor \TMP4, \TMP1 - pxor \XMM8, \XMM5 - pxor \TMP6, \TMP2 - pxor \TMP1, \TMP2 - pxor \XMM5, \TMP2 - movdqa \TMP2, \TMP3 - pslldq $8, \TMP3 # left shift TMP3 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP3, \XMM5 - pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 - - # first phase of reduction - - movdqa \XMM5, \TMP2 - movdqa \XMM5, \TMP3 - movdqa \XMM5, \TMP4 -# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently - pslld $31, \TMP2 # packed right shift << 31 - pslld $30, \TMP3 # packed right 
shift << 30 - pslld $25, \TMP4 # packed right shift << 25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP5 - psrldq $4, \TMP5 # right shift T5 1 DW - pslldq $12, \TMP2 # left shift T2 3 DWs - pxor \TMP2, \XMM5 - - # second phase of reduction - - movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 - movdqa \XMM5,\TMP3 - movdqa \XMM5,\TMP4 - psrld $1, \TMP2 # packed left shift >>1 - psrld $2, \TMP3 # packed left shift >>2 - psrld $7, \TMP4 # packed left shift >>7 - pxor \TMP3,\TMP2 # xor the shifted versions - pxor \TMP4,\TMP2 - pxor \TMP5, \TMP2 - pxor \TMP2, \XMM5 - pxor \TMP1, \XMM5 # result is in TMP1 - - pxor \XMM5, \XMM1 -.endm - -/* GHASH the last 4 ciphertext blocks. */ -.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ -TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst - - # Multiply TMP6 * HashKey (using Karatsuba) - - movdqa \XMM1, \TMP6 - pshufd $78, \XMM1, \TMP2 - pxor \XMM1, \TMP2 - movdqu HashKey_4(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0 - movdqu HashKey_4_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movdqa \XMM1, \XMMDst - movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 - - # Multiply TMP1 * HashKey (using Karatsuba) - - movdqa \XMM2, \TMP1 - pshufd $78, \XMM2, \TMP2 - pxor \XMM2, \TMP2 - movdqu HashKey_3(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0 - movdqu HashKey_3_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - pxor \TMP1, \TMP6 - pxor \XMM2, \XMMDst - pxor \TMP2, \XMM1 -# results accumulated in TMP6, XMMDst, XMM1 - - # Multiply TMP1 * HashKey (using Karatsuba) - - movdqa \XMM3, \TMP1 - pshufd $78, \XMM3, \TMP2 - pxor \XMM3, \TMP2 - movdqu HashKey_2(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0 - movdqu HashKey_2_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - pxor \TMP1, \TMP6 - pxor \XMM3, \XMMDst - pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 - - # Multiply TMP1 * HashKey (using Karatsuba) - movdqa \XMM4, \TMP1 - pshufd $78, \XMM4, \TMP2 - pxor \XMM4, \TMP2 - movdqu HashKey(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0 - movdqu HashKey_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - pxor \TMP1, \TMP6 - pxor \XMM4, \XMMDst - pxor \XMM1, \TMP2 - pxor \TMP6, \TMP2 - pxor \XMMDst, \TMP2 - # middle section of the temp results combined as in karatsuba algorithm - movdqa \TMP2, \TMP4 - pslldq $8, \TMP4 # left shift TMP4 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP4, \XMMDst - pxor \TMP2, \TMP6 -# TMP6:XMMDst holds the result of the accumulated carry-less multiplications - # first phase of the reduction - movdqa \XMMDst, \TMP2 - movdqa \XMMDst, \TMP3 - movdqa \XMMDst, \TMP4 -# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently - pslld $31, \TMP2 # packed right shifting << 31 - pslld $30, \TMP3 # packed right shifting << 30 - pslld $25, \TMP4 # packed right shifting << 25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP7 - psrldq $4, \TMP7 # right shift TMP7 1 DW - pslldq $12, \TMP2 # left shift TMP2 3 DWs - pxor \TMP2, \XMMDst - - # second phase of the reduction - movdqa \XMMDst, \TMP2 - # make 3 copies of XMMDst for doing 3 shift operations - movdqa \XMMDst, \TMP3 - movdqa 
\XMMDst, \TMP4 - psrld $1, \TMP2 # packed left shift >> 1 - psrld $2, \TMP3 # packed left shift >> 2 - psrld $7, \TMP4 # packed left shift >> 7 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - pxor \TMP7, \TMP2 - pxor \TMP2, \XMMDst - pxor \TMP6, \XMMDst # reduced result is in XMMDst -.endm - - -/* Encryption of a single block -* uses eax & r10 -*/ - -.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 - - pxor (%arg1), \XMM0 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - add $5,%eax # 128->9, 192->11, 256->13 - lea 16(%arg1), %r10 # get first expanded key address - -_esb_loop_\@: - MOVADQ (%r10),\TMP1 - aesenc \TMP1,\XMM0 - add $16,%r10 - sub $1,%eax - jnz _esb_loop_\@ - - MOVADQ (%r10),\TMP1 - aesenclast \TMP1,\XMM0 -.endm -/***************************************************************************** -* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data -* // Context data -* u8 *out, // Plaintext output. Encrypt in-place is allowed. -* const u8 *in, // Ciphertext input -* u64 plaintext_len, // Length of data in bytes for decryption. -* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) -* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) -* // concatenated with 0x00000001. 16-byte aligned pointer. -* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. -* const u8 *aad, // Additional Authentication Data (AAD) -* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes -* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the -* // given authentication tag and only return the plaintext if they match. -* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 -* // (most likely), 12 or 8. -* -* Assumptions: -* -* keys: -* keys are pre-expanded and aligned to 16 bytes. 
we are using the first -* set of 11 keys in the data structure void *aes_ctx -* -* iv: -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | Salt (From the SA) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | Initialization Vector | -* | (This is the sequence number from IPSec header) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x1 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* -* -* AAD: -* AAD padded to 128 bits with 0 -* for example, assume AAD is a u32 vector -* -* if AAD is 8 bytes: -* AAD[3] = {A0, A1}; -* padded AAD in xmm register = {A1 A0 0 0} -* -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | SPI (A1) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 32-bit Sequence Number (A0) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x0 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* AAD Format with 32-bit Sequence Number -* -* if AAD is 12 bytes: -* AAD[3] = {A0, A1, A2}; -* padded AAD in xmm register = {A2 A1 A0 0} -* -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | SPI (A2) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 64-bit Extended Sequence Number {A1,A0} | -* | | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x0 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* AAD Format with 64-bit Extended Sequence Number -* -* poly = x^128 + x^127 + x^126 + x^121 + 1 -* -*****************************************************************************/ -SYM_FUNC_START(aesni_gcm_dec) - FUNC_SAVE - - GCM_INIT %arg6, arg7, arg8, arg9 - GCM_ENC_DEC dec - GCM_COMPLETE arg10, arg11 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_dec) - - -/***************************************************************************** -* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data -* // Context data -* u8 *out, // Ciphertext output. Encrypt in-place is allowed. -* const u8 *in, // Plaintext input -* u64 plaintext_len, // Length of data in bytes for encryption. -* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) -* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) -* // concatenated with 0x00000001. 16-byte aligned pointer. -* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. -* const u8 *aad, // Additional Authentication Data (AAD) -* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes -* u8 *auth_tag, // Authenticated Tag output. -* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), -* // 12 or 8. -* -* Assumptions: -* -* keys: -* keys are pre-expanded and aligned to 16 bytes. 
we are using the -* first set of 11 keys in the data structure void *aes_ctx -* -* -* iv: -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | Salt (From the SA) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | Initialization Vector | -* | (This is the sequence number from IPSec header) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x1 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* -* -* AAD: -* AAD padded to 128 bits with 0 -* for example, assume AAD is a u32 vector -* -* if AAD is 8 bytes: -* AAD[3] = {A0, A1}; -* padded AAD in xmm register = {A1 A0 0 0} -* -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | SPI (A1) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 32-bit Sequence Number (A0) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x0 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* AAD Format with 32-bit Sequence Number -* -* if AAD is 12 bytes: -* AAD[3] = {A0, A1, A2}; -* padded AAD in xmm register = {A2 A1 A0 0} -* -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | SPI (A2) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 64-bit Extended Sequence Number {A1,A0} | -* | | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x0 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* AAD Format with 64-bit Extended Sequence Number -* -* poly = x^128 + x^127 + x^126 + x^121 + 1 -***************************************************************************/ -SYM_FUNC_START(aesni_gcm_enc) - FUNC_SAVE - - GCM_INIT %arg6, arg7, arg8, arg9 - GCM_ENC_DEC enc - - GCM_COMPLETE arg10, arg11 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_enc) - -/***************************************************************************** -* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) -* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) -* // concatenated with 0x00000001. 16-byte aligned pointer. -* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. -* const u8 *aad, // Additional Authentication Data (AAD) -* u64 aad_len) // Length of AAD in bytes. -*/ -SYM_FUNC_START(aesni_gcm_init) - FUNC_SAVE - GCM_INIT %arg3, %arg4,%arg5, %arg6 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_init) - -/***************************************************************************** -* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *out, // Ciphertext output. Encrypt in-place is allowed. -* const u8 *in, // Plaintext input -* u64 plaintext_len, // Length of data in bytes for encryption. -*/ -SYM_FUNC_START(aesni_gcm_enc_update) - FUNC_SAVE - GCM_ENC_DEC enc - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_enc_update) - -/***************************************************************************** -* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. 
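These entry points also come in a streaming form: aesni_gcm_init once, then aesni_gcm_enc_update (or dec_update, below) any number of times, then aesni_gcm_finalize. A caller-side sketch of that sequence, with the kernel's u8/u64 written as stdint types and gcm_context_data left opaque:

#include <stdint.h>

struct gcm_context_data;            /* opaque per-request GCM state */

void aesni_gcm_init(void *aes_ctx, struct gcm_context_data *data,
                    uint8_t *iv, uint8_t *hash_subkey,
                    const uint8_t *aad, uint64_t aad_len);
void aesni_gcm_enc_update(void *aes_ctx, struct gcm_context_data *data,
                          uint8_t *out, const uint8_t *in, uint64_t len);
void aesni_gcm_finalize(void *aes_ctx, struct gcm_context_data *data,
                        uint8_t *auth_tag, uint64_t auth_tag_len);

/* One-shot encryption expressed through the streaming interface;
 * a sketch of the calling convention, not code from this patch. */
static void gcm_encrypt_oneshot(void *aes_ctx, struct gcm_context_data *data,
                                uint8_t *out, const uint8_t *in, uint64_t len,
                                uint8_t *iv, uint8_t *hash_subkey,
                                const uint8_t *aad, uint64_t aad_len,
                                uint8_t *tag, uint64_t tag_len)
{
    aesni_gcm_init(aes_ctx, data, iv, hash_subkey, aad, aad_len);
    aesni_gcm_enc_update(aes_ctx, data, out, in, len); /* may repeat */
    aesni_gcm_finalize(aes_ctx, data, tag, tag_len);
}

aesni_gcm_enc and aesni_gcm_dec are this same GCM_INIT / GCM_ENC_DEC / GCM_COMPLETE sequence fused into one call.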
Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *out, // Ciphertext output. Encrypt in-place is allowed. -* const u8 *in, // Plaintext input -* u64 plaintext_len, // Length of data in bytes for encryption. -*/ -SYM_FUNC_START(aesni_gcm_dec_update) - FUNC_SAVE - GCM_ENC_DEC dec - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_dec_update) - -/***************************************************************************** -* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *auth_tag, // Authenticated Tag output. -* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), -* // 12 or 8. -*/ -SYM_FUNC_START(aesni_gcm_finalize) - FUNC_SAVE - GCM_COMPLETE %arg3 %arg4 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_finalize) - -#endif - SYM_FUNC_START_LOCAL(_key_expansion_256a) pshufd $0b11111111, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 @@ -1820,8 +134,8 @@ SYM_FUNC_START_LOCAL(_key_expansion_256b) SYM_FUNC_END(_key_expansion_256b) /* - * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, - * unsigned int key_len) + * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, + * unsigned int key_len) */ SYM_FUNC_START(aesni_set_key) FRAME_BEGIN @@ -1926,7 +240,6 @@ SYM_FUNC_START(aesni_set_key) sub $0x10, UKEYP cmp TKEYP, KEYP jb .Ldec_key_loop - xor AREG, AREG #ifndef __x86_64__ popl KEYP #endif @@ -2759,6 +1072,7 @@ SYM_FUNC_END(_aesni_inc) * size_t len, u8 *iv) */ SYM_FUNC_START(aesni_ctr_enc) + ANNOTATE_NOENDBR FRAME_BEGIN cmp $16, LEN jb .Lctr_enc_just_ret @@ -2826,28 +1140,24 @@ SYM_FUNC_END(aesni_ctr_enc) .previous /* - * _aesni_gf128mul_x_ble: internal ABI - * Multiply in GF(2^128) for XTS IVs + * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs * input: * IV: current IV * GF128MUL_MASK == mask with 0x87 and 0x01 * output: * IV: next IV * changed: - * CTR: == temporary value + * KEY: == temporary value */ -#define _aesni_gf128mul_x_ble() \ - pshufd $0x13, IV, KEY; \ - paddq IV, IV; \ - psrad $31, KEY; \ - pand GF128MUL_MASK, KEY; \ - pxor KEY, IV; +.macro _aesni_gf128mul_x_ble + pshufd $0x13, IV, KEY + paddq IV, IV + psrad $31, KEY + pand GF128MUL_MASK, KEY + pxor KEY, IV +.endm -/* - * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst, - * const u8 *src, unsigned int len, le128 *iv) - */ -SYM_FUNC_START(aesni_xts_encrypt) +.macro _aesni_xts_crypt enc FRAME_BEGIN #ifndef __x86_64__ pushl IVP @@ -2866,35 +1176,46 @@ SYM_FUNC_START(aesni_xts_encrypt) movups (IVP), IV mov 480(KEYP), KLEN +.if !\enc + add $240, KEYP + + test $15, LEN + jz .Lxts_loop4\@ + sub $16, LEN +.endif -.Lxts_enc_loop4: +.Lxts_loop4\@: sub $64, LEN - jl .Lxts_enc_1x + jl .Lxts_1x\@ movdqa IV, STATE1 movdqu 0x00(INP), IN pxor IN, STATE1 movdqu IV, 0x00(OUTP) - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble movdqa IV, STATE2 movdqu 0x10(INP), IN pxor IN, STATE2 movdqu IV, 0x10(OUTP) - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble movdqa IV, STATE3 movdqu 0x20(INP), IN pxor IN, STATE3 movdqu IV, 0x20(OUTP) - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble movdqa IV, STATE4 movdqu 0x30(INP), IN pxor IN, STATE4 movdqu IV, 0x30(OUTP) +.if \enc call _aesni_enc4 +.else + call _aesni_dec4 +.endif movdqu 0x00(OUTP), IN pxor IN, STATE1 @@ -2912,17 +1233,17 @@ SYM_FUNC_START(aesni_xts_encrypt) pxor IN, STATE4 movdqu STATE4, 0x30(OUTP) - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble add $64, INP add $64, OUTP test 
LEN, LEN - jnz .Lxts_enc_loop4 + jnz .Lxts_loop4\@ -.Lxts_enc_ret_iv: +.Lxts_ret_iv\@: movups IV, (IVP) -.Lxts_enc_ret: +.Lxts_ret\@: #ifndef __x86_64__ popl KLEN popl KEYP @@ -2932,201 +1253,60 @@ SYM_FUNC_START(aesni_xts_encrypt) FRAME_END RET -.Lxts_enc_1x: +.Lxts_1x\@: add $64, LEN - jz .Lxts_enc_ret_iv + jz .Lxts_ret_iv\@ +.if \enc sub $16, LEN - jl .Lxts_enc_cts4 + jl .Lxts_cts4\@ +.endif -.Lxts_enc_loop1: +.Lxts_loop1\@: movdqu (INP), STATE +.if \enc pxor IV, STATE call _aesni_enc1 - pxor IV, STATE - _aesni_gf128mul_x_ble() - - test LEN, LEN - jz .Lxts_enc_out - +.else add $16, INP sub $16, LEN - jl .Lxts_enc_cts1 - - movdqu STATE, (OUTP) - add $16, OUTP - jmp .Lxts_enc_loop1 - -.Lxts_enc_out: - movdqu STATE, (OUTP) - jmp .Lxts_enc_ret_iv - -.Lxts_enc_cts4: - movdqa STATE4, STATE - sub $16, OUTP - -.Lxts_enc_cts1: -#ifndef __x86_64__ - lea .Lcts_permute_table, T1 -#else - lea .Lcts_permute_table(%rip), T1 -#endif - add LEN, INP /* rewind input pointer */ - add $16, LEN /* # bytes in final block */ - movups (INP), IN1 - - mov T1, IVP - add $32, IVP - add LEN, T1 - sub LEN, IVP - add OUTP, LEN - - movups (T1), %xmm4 - movaps STATE, IN2 - pshufb %xmm4, STATE - movups STATE, (LEN) - - movups (IVP), %xmm0 - pshufb %xmm0, IN1 - pblendvb IN2, IN1 - movaps IN1, STATE - + jl .Lxts_cts1\@ pxor IV, STATE - call _aesni_enc1 + call _aesni_dec1 +.endif pxor IV, STATE + _aesni_gf128mul_x_ble - movups STATE, (OUTP) - jmp .Lxts_enc_ret -SYM_FUNC_END(aesni_xts_encrypt) - -/* - * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst, - * const u8 *src, unsigned int len, le128 *iv) - */ -SYM_FUNC_START(aesni_xts_decrypt) - FRAME_BEGIN -#ifndef __x86_64__ - pushl IVP - pushl LEN - pushl KEYP - pushl KLEN - movl (FRAME_OFFSET+20)(%esp), KEYP # ctx - movl (FRAME_OFFSET+24)(%esp), OUTP # dst - movl (FRAME_OFFSET+28)(%esp), INP # src - movl (FRAME_OFFSET+32)(%esp), LEN # len - movl (FRAME_OFFSET+36)(%esp), IVP # iv - movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK -#else - movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK -#endif - movups (IVP), IV - - mov 480(KEYP), KLEN - add $240, KEYP - - test $15, LEN - jz .Lxts_dec_loop4 - sub $16, LEN - -.Lxts_dec_loop4: - sub $64, LEN - jl .Lxts_dec_1x - - movdqa IV, STATE1 - movdqu 0x00(INP), IN - pxor IN, STATE1 - movdqu IV, 0x00(OUTP) - - _aesni_gf128mul_x_ble() - movdqa IV, STATE2 - movdqu 0x10(INP), IN - pxor IN, STATE2 - movdqu IV, 0x10(OUTP) - - _aesni_gf128mul_x_ble() - movdqa IV, STATE3 - movdqu 0x20(INP), IN - pxor IN, STATE3 - movdqu IV, 0x20(OUTP) - - _aesni_gf128mul_x_ble() - movdqa IV, STATE4 - movdqu 0x30(INP), IN - pxor IN, STATE4 - movdqu IV, 0x30(OUTP) - - call _aesni_dec4 - - movdqu 0x00(OUTP), IN - pxor IN, STATE1 - movdqu STATE1, 0x00(OUTP) - - movdqu 0x10(OUTP), IN - pxor IN, STATE2 - movdqu STATE2, 0x10(OUTP) - - movdqu 0x20(OUTP), IN - pxor IN, STATE3 - movdqu STATE3, 0x20(OUTP) - - movdqu 0x30(OUTP), IN - pxor IN, STATE4 - movdqu STATE4, 0x30(OUTP) - - _aesni_gf128mul_x_ble() - - add $64, INP - add $64, OUTP test LEN, LEN - jnz .Lxts_dec_loop4 - -.Lxts_dec_ret_iv: - movups IV, (IVP) - -.Lxts_dec_ret: -#ifndef __x86_64__ - popl KLEN - popl KEYP - popl LEN - popl IVP -#endif - FRAME_END - RET - -.Lxts_dec_1x: - add $64, LEN - jz .Lxts_dec_ret_iv - -.Lxts_dec_loop1: - movdqu (INP), STATE + jz .Lxts_out\@ +.if \enc add $16, INP sub $16, LEN - jl .Lxts_dec_cts1 - - pxor IV, STATE - call _aesni_dec1 - pxor IV, STATE - _aesni_gf128mul_x_ble() - - test LEN, LEN - jz .Lxts_dec_out + jl .Lxts_cts1\@ +.endif movdqu STATE, (OUTP) add $16, OUTP - 
jmp .Lxts_dec_loop1 + jmp .Lxts_loop1\@ -.Lxts_dec_out: +.Lxts_out\@: movdqu STATE, (OUTP) - jmp .Lxts_dec_ret_iv + jmp .Lxts_ret_iv\@ -.Lxts_dec_cts1: +.if \enc +.Lxts_cts4\@: + movdqa STATE4, STATE + sub $16, OUTP +.Lxts_cts1\@: +.else +.Lxts_cts1\@: movdqa IV, STATE4 - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble pxor IV, STATE call _aesni_dec1 pxor IV, STATE - +.endif #ifndef __x86_64__ lea .Lcts_permute_table, T1 #else @@ -3152,10 +1332,32 @@ SYM_FUNC_START(aesni_xts_decrypt) pblendvb IN2, IN1 movaps IN1, STATE +.if \enc + pxor IV, STATE + call _aesni_enc1 + pxor IV, STATE +.else pxor STATE4, STATE call _aesni_dec1 pxor STATE4, STATE +.endif movups STATE, (OUTP) - jmp .Lxts_dec_ret -SYM_FUNC_END(aesni_xts_decrypt) + jmp .Lxts_ret\@ +.endm + +/* + * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst, + * const u8 *src, unsigned int len, le128 *iv) + */ +SYM_FUNC_START(aesni_xts_enc) + _aesni_xts_crypt 1 +SYM_FUNC_END(aesni_xts_enc) + +/* + * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst, + * const u8 *src, unsigned int len, le128 *iv) + */ +SYM_FUNC_START(aesni_xts_dec) + _aesni_xts_crypt 0 +SYM_FUNC_END(aesni_xts_dec) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S deleted file mode 100644 index 8c9749ed0651..000000000000 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ /dev/null @@ -1,2804 +0,0 @@ -######################################################################## -# Copyright (c) 2013, Intel Corporation -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# * Neither the name of the Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# -# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR -# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
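The _aesni_xts_crypt macro above steps the XTS tweak between blocks with _aesni_gf128mul_x_ble, which is multiplication by x in GF(2^128) over a little-endian 128-bit value. A C sketch of the same update, assuming the tweak is held as two 64-bit words:

#include <stdint.h>

/* Multiply an XTS tweak by x in GF(2^128), little-endian ("ble")
 * convention: shift the 128-bit value left one bit and, when a bit
 * falls off the top, fold in 0x87, the low byte of the XTS
 * reduction polynomial. t[0] holds the low 64 bits. */
static void gf128mul_x_ble(uint64_t t[2])
{
    uint64_t carry = t[1] >> 63;

    t[1] = (t[1] << 1) | (t[0] >> 63);
    t[0] <<= 1;
    if (carry)
        t[0] ^= 0x87;
}

The pshufd/psrad/pand sequence in the macro performs the same conditional fold branchlessly, broadcasting the sign bits and masking with the 0x87/0x01 constants in GF128MUL_MASK.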
-######################################################################## -## -## Authors: -## Erdinc Ozturk <erdinc.ozturk@intel.com> -## Vinodh Gopal <vinodh.gopal@intel.com> -## James Guilford <james.guilford@intel.com> -## Tim Chen <tim.c.chen@linux.intel.com> -## -## References: -## This code was derived and highly optimized from the code described in paper: -## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation -## on Intel Architecture Processors. August, 2010 -## The details of the implementation is explained in: -## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode -## on Intel Architecture Processors. October, 2012. -## -## Assumptions: -## -## -## -## iv: -## 0 1 2 3 -## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | Salt (From the SA) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | Initialization Vector | -## | (This is the sequence number from IPSec header) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 0x1 | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## -## -## -## AAD: -## AAD padded to 128 bits with 0 -## for example, assume AAD is a u32 vector -## -## if AAD is 8 bytes: -## AAD[3] = {A0, A1}# -## padded AAD in xmm register = {A1 A0 0 0} -## -## 0 1 2 3 -## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | SPI (A1) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 32-bit Sequence Number (A0) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 0x0 | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## -## AAD Format with 32-bit Sequence Number -## -## if AAD is 12 bytes: -## AAD[3] = {A0, A1, A2}# -## padded AAD in xmm register = {A2 A1 A0 0} -## -## 0 1 2 3 -## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | SPI (A2) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 64-bit Extended Sequence Number {A1,A0} | -## | | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 0x0 | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## -## AAD Format with 64-bit Extended Sequence Number -## -## -## aadLen: -## from the definition of the spec, aadLen can only be 8 or 12 bytes. -## The code additionally supports aadLen of length 16 bytes. -## -## TLen: -## from the definition of the spec, TLen can only be 8, 12 or 16 bytes. -## -## poly = x^128 + x^127 + x^126 + x^121 + 1 -## throughout the code, one tab and two tab indentations are used. one tab is -## for GHASH part, two tabs is for AES part. 
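As the diagrams above show, an 8- or 12-byte AAD (16 is also accepted here) is zero-padded to a full 128-bit block before it enters GHASH. A minimal C sketch of that padding step:

#include <stdint.h>
#include <string.h>

/* Zero-pad a short AAD into one 16-byte GHASH block, matching the
 * layouts pictured above; aad_len is 8, 12, or 16 in this code. */
static void gcm_pad_aad(uint8_t block[16], const uint8_t *aad,
                        size_t aad_len)
{
    memset(block, 0, 16);
    memcpy(block, aad, aad_len);
}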
-## - -#include <linux/linkage.h> - -# constants in mergeable sections, linker can reorder and merge -.section .rodata.cst16.POLY, "aM", @progbits, 16 -.align 16 -POLY: .octa 0xC2000000000000000000000000000001 - -.section .rodata.cst16.POLY2, "aM", @progbits, 16 -.align 16 -POLY2: .octa 0xC20000000000000000000001C2000000 - -.section .rodata.cst16.TWOONE, "aM", @progbits, 16 -.align 16 -TWOONE: .octa 0x00000001000000000000000000000001 - -.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 -.align 16 -SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F - -.section .rodata.cst16.ONE, "aM", @progbits, 16 -.align 16 -ONE: .octa 0x00000000000000000000000000000001 - -.section .rodata.cst16.ONEf, "aM", @progbits, 16 -.align 16 -ONEf: .octa 0x01000000000000000000000000000000 - -# order of these constants should not change. -# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F -.section .rodata, "a", @progbits -.align 16 -SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 -ALL_F: .octa 0xffffffffffffffffffffffffffffffff - .octa 0x00000000000000000000000000000000 - -.text - - -#define AadHash 16*0 -#define AadLen 16*1 -#define InLen (16*1)+8 -#define PBlockEncKey 16*2 -#define OrigIV 16*3 -#define CurCount 16*4 -#define PBlockLen 16*5 - -HashKey = 16*6 # store HashKey <<1 mod poly here -HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here -HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here -HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here -HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here -HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here -HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here -HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here -HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) -HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) -HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) -HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) -HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) -HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) -HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) -HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) - -#define arg1 %rdi -#define arg2 %rsi -#define arg3 %rdx -#define arg4 %rcx -#define arg5 %r8 -#define arg6 %r9 -#define keysize 2*15*16(arg1) - -i = 0 -j = 0 - -out_order = 0 -in_order = 1 -DEC = 0 -ENC = 1 - -.macro define_reg r n -reg_\r = %xmm\n -.endm - -.macro setreg -.altmacro -define_reg i %i -define_reg j %j -.noaltmacro -.endm - -TMP1 = 16*0 # Temporary storage for AAD -TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) -TMP3 = 16*2 # Temporary storage for AES State 3 -TMP4 = 16*3 # Temporary storage for AES State 4 -TMP5 = 16*4 # Temporary storage for AES State 5 -TMP6 = 16*5 # Temporary storage for AES State 6 -TMP7 = 16*6 # Temporary storage for AES State 7 -TMP8 = 16*7 # Temporary storage for AES State 8 - -VARIABLE_OFFSET = 16*8 - -################################ -# Utility Macros -################################ - -.macro FUNC_SAVE - push %r12 - push %r13 - push %r15 - - push %rbp - mov %rsp, %rbp - - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes -.endm - -.macro FUNC_RESTORE - mov %rbp, %rsp - pop %rbp - - pop %r15 - pop %r13 - pop %r12 
-.endm - -# Encryption of a single block -.macro ENCRYPT_SINGLE_BLOCK REP XMM0 - vpxor (arg1), \XMM0, \XMM0 - i = 1 - setreg -.rep \REP - vaesenc 16*i(arg1), \XMM0, \XMM0 - i = (i+1) - setreg -.endr - vaesenclast 16*i(arg1), \XMM0, \XMM0 -.endm - -# combined for GCM encrypt and decrypt functions -# clobbering all xmm registers -# clobbering r10, r11, r12, r13, r15, rax -.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP - vmovdqu AadHash(arg2), %xmm8 - vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey - add arg5, InLen(arg2) - - # initialize the data pointer offset as zero - xor %r11d, %r11d - - PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC - sub %r11, arg5 - - mov arg5, %r13 # save the number of bytes of plaintext/ciphertext - and $-16, %r13 # r13 = r13 - (r13 mod 16) - - mov %r13, %r12 - shr $4, %r12 - and $7, %r12 - jz .L_initial_num_blocks_is_0\@ - - cmp $7, %r12 - je .L_initial_num_blocks_is_7\@ - cmp $6, %r12 - je .L_initial_num_blocks_is_6\@ - cmp $5, %r12 - je .L_initial_num_blocks_is_5\@ - cmp $4, %r12 - je .L_initial_num_blocks_is_4\@ - cmp $3, %r12 - je .L_initial_num_blocks_is_3\@ - cmp $2, %r12 - je .L_initial_num_blocks_is_2\@ - - jmp .L_initial_num_blocks_is_1\@ - -.L_initial_num_blocks_is_7\@: - \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*7, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_6\@: - \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*6, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_5\@: - \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*5, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_4\@: - \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*4, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_3\@: - \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*3, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_2\@: - \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*2, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_1\@: - \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*1, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_0\@: - \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - - -.L_initial_blocks_encrypted\@: - test %r13, %r13 - je .L_zero_cipher_left\@ - - sub $128, %r13 - je .L_eight_cipher_left\@ - - - - - vmovd %xmm9, %r15d - and $255, %r15d - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - - -.L_encrypt_by_8_new\@: - cmp $(255-8), %r15d - jg .L_encrypt_by_8\@ - - - - add $8, %r15b - \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, 
%xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC - add $128, %r11 - sub $128, %r13 - jne .L_encrypt_by_8_new\@ - - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - jmp .L_eight_cipher_left\@ - -.L_encrypt_by_8\@: - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $8, %r15b - \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $128, %r11 - sub $128, %r13 - jne .L_encrypt_by_8_new\@ - - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - - - - -.L_eight_cipher_left\@: - \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 - - -.L_zero_cipher_left\@: - vmovdqu %xmm14, AadHash(arg2) - vmovdqu %xmm9, CurCount(arg2) - - # check for 0 length - mov arg5, %r13 - and $15, %r13 # r13 = (arg5 mod 16) - - je .L_multiple_of_16_bytes\@ - - # handle the last <16 Byte block separately - - mov %r13, PBlockLen(arg2) - - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vmovdqu %xmm9, CurCount(arg2) - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - - ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) - vmovdqu %xmm9, PBlockEncKey(arg2) - - cmp $16, arg5 - jge .L_large_enough_update\@ - - lea (arg4,%r11,1), %r10 - mov %r13, %r12 - - READ_PARTIAL_BLOCK %r10 %r12 %xmm1 - - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer to be - # able to shift 16-r13 bytes (r13 is the - # number of bytes in plaintext mod 16) - - jmp .L_final_ghash_mul\@ - -.L_large_enough_update\@: - sub $16, %r11 - add %r13, %r11 - - # receive the last <16 Byte block - vmovdqu (arg4, %r11, 1), %xmm1 - - sub %r13, %r11 - add $16, %r11 - - lea SHIFT_MASK+16(%rip), %r12 - # adjust the shuffle mask pointer to be able to shift 16-r13 bytes - # (r13 is the number of bytes in plaintext mod 16) - sub %r13, %r12 - # get the appropriate shuffle mask - vmovdqu (%r12), %xmm2 - # shift right 16-r13 bytes - vpshufb %xmm2, %xmm1, %xmm1 - -.L_final_ghash_mul\@: - .if \ENC_DEC == DEC - vmovdqa %xmm1, %xmm2 - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to - # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm2, %xmm2 - vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm14, %xmm14 - - vmovdqu %xmm14, AadHash(arg2) - .else - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to - # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - vpxor %xmm9, %xmm14, %xmm14 - - vmovdqu %xmm14, AadHash(arg2) - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext - .endif - - - ############################# - # output r13 Bytes - vmovq %xmm9, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left\@ - - mov %rax, (arg3 , %r11) - add $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - vmovq %xmm9, %rax - sub $8, %r13 - -.L_less_than_8_bytes_left\@: - movb %al, (arg3 , %r11) - add $1, %r11 - shr $8, %rax - sub $1, %r13 - jne .L_less_than_8_bytes_left\@ - ############################# - -.L_multiple_of_16_bytes\@: -.endm - - -# GCM_COMPLETE Finishes update of tag of last partial block -# Output: Authorization Tag (AUTH_TAG) -# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 -.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN - 
vmovdqu AadHash(arg2), %xmm14 - vmovdqu HashKey(arg2), %xmm13 - - mov PBlockLen(arg2), %r12 - test %r12, %r12 - je .L_partial_done\@ - - #GHASH computation for the last <16 Byte block - \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - -.L_partial_done\@: - mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) - shl $3, %r12 # convert into number of bits - vmovd %r12d, %xmm15 # len(A) in xmm15 - - mov InLen(arg2), %r12 - shl $3, %r12 # len(C) in bits (*128) - vmovq %r12, %xmm1 - vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 - vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) - - vpxor %xmm15, %xmm14, %xmm14 - \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation - vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap - - vmovdqu OrigIV(arg2), %xmm9 - - ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0) - - vpxor %xmm14, %xmm9, %xmm9 - - - -.L_return_T\@: - mov \AUTH_TAG, %r10 # r10 = authTag - mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len - - cmp $16, %r11 - je .L_T_16\@ - - cmp $8, %r11 - jl .L_T_4\@ - -.L_T_8\@: - vmovq %xmm9, %rax - mov %rax, (%r10) - add $8, %r10 - sub $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - test %r11, %r11 - je .L_return_T_done\@ -.L_T_4\@: - vmovd %xmm9, %eax - mov %eax, (%r10) - add $4, %r10 - sub $4, %r11 - vpsrldq $4, %xmm9, %xmm9 - test %r11, %r11 - je .L_return_T_done\@ -.L_T_123\@: - vmovd %xmm9, %eax - cmp $2, %r11 - jl .L_T_1\@ - mov %ax, (%r10) - cmp $2, %r11 - je .L_return_T_done\@ - add $2, %r10 - sar $16, %eax -.L_T_1\@: - mov %al, (%r10) - jmp .L_return_T_done\@ - -.L_T_16\@: - vmovdqu %xmm9, (%r10) - -.L_return_T_done\@: -.endm - -.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8 - - mov \AAD, %r10 # r10 = AAD - mov \AADLEN, %r12 # r12 = aadLen - - - mov %r12, %r11 - - vpxor \T8, \T8, \T8 - vpxor \T7, \T7, \T7 - cmp $16, %r11 - jl .L_get_AAD_rest8\@ -.L_get_AAD_blocks\@: - vmovdqu (%r10), \T7 - vpshufb SHUF_MASK(%rip), \T7, \T7 - vpxor \T7, \T8, \T8 - \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6 - add $16, %r10 - sub $16, %r12 - sub $16, %r11 - cmp $16, %r11 - jge .L_get_AAD_blocks\@ - vmovdqu \T8, \T7 - test %r11, %r11 - je .L_get_AAD_done\@ - - vpxor \T7, \T7, \T7 - - /* read the last <16B of AAD. since we have at least 4B of - data right after the AAD (the ICV, and maybe some CT), we can - read 4B/8B blocks safely, and then get rid of the extra stuff */ -.L_get_AAD_rest8\@: - cmp $4, %r11 - jle .L_get_AAD_rest4\@ - movq (%r10), \T1 - add $8, %r10 - sub $8, %r11 - vpslldq $8, \T1, \T1 - vpsrldq $8, \T7, \T7 - vpxor \T1, \T7, \T7 - jmp .L_get_AAD_rest8\@ -.L_get_AAD_rest4\@: - test %r11, %r11 - jle .L_get_AAD_rest0\@ - mov (%r10), %eax - movq %rax, \T1 - add $4, %r10 - sub $4, %r11 - vpslldq $12, \T1, \T1 - vpsrldq $4, \T7, \T7 - vpxor \T1, \T7, \T7 -.L_get_AAD_rest0\@: - /* finalize: shift out the extra bytes we read, and align - left. 
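GCM_COMPLETE above closes the hash with the lengths block — len(A) || len(C), each a 64-bit count of bits — runs one final GHASH_MUL, and XORs in E(K, Y0) before truncating the tag to auth_tag_len (16, 12, or 8 bytes, with shorter lengths handled byte by byte). Building that lengths block in C, as a sketch; the block is big-endian, matching the byte swap the code applies:

#include <stdint.h>

/* Final GHASH lengths block: len(A) || len(C), each a 64-bit
 * big-endian bit count, as in the GCM spec. */
static void gcm_len_block(uint64_t aad_bytes, uint64_t ct_bytes,
                          uint8_t block[16])
{
    uint64_t abits = aad_bytes * 8, cbits = ct_bytes * 8;
    int i;

    for (i = 0; i < 8; i++) {
        block[7 - i]  = (uint8_t)(abits >> (8 * i));
        block[15 - i] = (uint8_t)(cbits >> (8 * i));
    }
}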
since pslldq can only shift by an immediate, we use - vpshufb and a pair of shuffle masks */ - leaq ALL_F(%rip), %r11 - subq %r12, %r11 - vmovdqu 16(%r11), \T1 - andq $~3, %r11 - vpshufb (%r11), \T7, \T7 - vpand \T1, \T7, \T7 -.L_get_AAD_rest_final\@: - vpshufb SHUF_MASK(%rip), \T7, \T7 - vpxor \T8, \T7, \T7 - \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6 - -.L_get_AAD_done\@: - vmovdqu \T7, AadHash(arg2) -.endm - -.macro INIT GHASH_MUL PRECOMPUTE - mov arg6, %r11 - mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length - xor %r11d, %r11d - mov %r11, InLen(arg2) # ctx_data.in_length = 0 - - mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0 - mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0 - mov arg3, %rax - movdqu (%rax), %xmm0 - movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv - - vpshufb SHUF_MASK(%rip), %xmm0, %xmm0 - movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv - - vmovdqu (arg4), %xmm6 # xmm6 = HashKey - - vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 - ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey - vmovdqa %xmm6, %xmm2 - vpsllq $1, %xmm6, %xmm6 - vpsrlq $63, %xmm2, %xmm2 - vmovdqa %xmm2, %xmm1 - vpslldq $8, %xmm2, %xmm2 - vpsrldq $8, %xmm1, %xmm1 - vpor %xmm2, %xmm6, %xmm6 - #reduction - vpshufd $0b00100100, %xmm1, %xmm2 - vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 - vpand POLY(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly - ####################################################################### - vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly - - CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 - - \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 -.endm - - -# Reads DLEN bytes starting at DPTR and stores in XMMDst -# where 0 < DLEN < 16 -# Clobbers %rax, DLEN -.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst - vpxor \XMMDst, \XMMDst, \XMMDst - - cmp $8, \DLEN - jl .L_read_lt8_\@ - mov (\DPTR), %rax - vpinsrq $0, %rax, \XMMDst, \XMMDst - sub $8, \DLEN - jz .L_done_read_partial_block_\@ - xor %eax, %eax -.L_read_next_byte_\@: - shl $8, %rax - mov 7(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_\@ - vpinsrq $1, %rax, \XMMDst, \XMMDst - jmp .L_done_read_partial_block_\@ -.L_read_lt8_\@: - xor %eax, %eax -.L_read_next_byte_lt8_\@: - shl $8, %rax - mov -1(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_lt8_\@ - vpinsrq $0, %rax, \XMMDst, \XMMDst -.L_done_read_partial_block_\@: -.endm - -# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks -# between update calls. 
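READ_PARTIAL_BLOCK above gathers the 1..15 trailing bytes into an XMM register using one 8-byte load plus a byte loop, so it never reads past the end of the input. The value it produces is equivalent to this C sketch:

#include <stdint.h>
#include <string.h>

/* Gather the last dlen bytes (0 < dlen < 16) into a zeroed 16-byte
 * block. The assembly reaches the same result without a plain copy
 * so that no load ever touches memory beyond dptr + dlen. */
static void read_partial_block(const uint8_t *dptr, size_t dlen,
                               uint8_t block[16])
{
    memset(block, 0, 16);
    memcpy(block, dptr, dlen);
}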
-# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK -# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context -# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 -.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ - AAD_HASH ENC_DEC - mov PBlockLen(arg2), %r13 - test %r13, %r13 - je .L_partial_block_done_\@ # Leave Macro if no partial blocks - # Read in input data without over reading - cmp $16, \PLAIN_CYPH_LEN - jl .L_fewer_than_16_bytes_\@ - vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm - jmp .L_data_read_\@ - -.L_fewer_than_16_bytes_\@: - lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 - mov \PLAIN_CYPH_LEN, %r12 - READ_PARTIAL_BLOCK %r10 %r12 %xmm1 - - mov PBlockLen(arg2), %r13 - -.L_data_read_\@: # Finished reading in data - - vmovdqu PBlockEncKey(arg2), %xmm9 - vmovdqu HashKey(arg2), %xmm13 - - lea SHIFT_MASK(%rip), %r12 - - # adjust the shuffle mask pointer to be able to shift r13 bytes - # r16-r13 is the number of bytes in plaintext mod 16) - add %r13, %r12 - vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask - vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes - -.if \ENC_DEC == DEC - vmovdqa %xmm1, %xmm3 - pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_1_\@ - sub %r10, %r12 -.L_no_extra_mask_1_\@: - - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9 - - vpand %xmm1, %xmm3, %xmm3 - vmovdqa SHUF_MASK(%rip), %xmm10 - vpshufb %xmm10, %xmm3, %xmm3 - vpshufb %xmm2, %xmm3, %xmm3 - vpxor %xmm3, \AAD_HASH, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_1_\@ - - # GHASH computation for the last <16 Byte block - \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %eax,%eax - - mov %rax, PBlockLen(arg2) - jmp .L_dec_done_\@ -.L_partial_incomplete_1_\@: - add \PLAIN_CYPH_LEN, PBlockLen(arg2) -.L_dec_done_\@: - vmovdqu \AAD_HASH, AadHash(arg2) -.else - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_2_\@ - sub %r10, %r12 -.L_no_extra_mask_2_\@: - - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 - - vmovdqa SHUF_MASK(%rip), %xmm1 - vpshufb %xmm1, %xmm9, %xmm9 - vpshufb %xmm2, %xmm9, %xmm9 - vpxor %xmm9, \AAD_HASH, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_2_\@ - - # GHASH computation for the last <16 Byte block - \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %eax,%eax - - mov %rax, PBlockLen(arg2) - jmp .L_encode_done_\@ -.L_partial_incomplete_2_\@: - add \PLAIN_CYPH_LEN, PBlockLen(arg2) -.L_encode_done_\@: - vmovdqu \AAD_HASH, AadHash(arg2) - - vmovdqa SHUF_MASK(%rip), %xmm10 - # shuffle xmm9 back to output as ciphertext - vpshufb %xmm10, %xmm9, %xmm9 - vpshufb %xmm2, %xmm9, %xmm9 -.endif - # output encrypted Bytes - test %r10, %r10 - jl .L_partial_fill_\@ - mov %r13, %r12 - mov $16, %r13 - # Set r13 to be the number of bytes to write out - 
sub %r12, %r13 - jmp .L_count_set_\@ -.L_partial_fill_\@: - mov \PLAIN_CYPH_LEN, %r13 -.L_count_set_\@: - vmovdqa %xmm9, %xmm0 - vmovq %xmm0, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left_\@ - - mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $8, \DATA_OFFSET - psrldq $8, %xmm0 - vmovq %xmm0, %rax - sub $8, %r13 -.L_less_than_8_bytes_left_\@: - movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $1, \DATA_OFFSET - shr $8, %rax - sub $1, %r13 - jne .L_less_than_8_bytes_left_\@ -.L_partial_block_done_\@: -.endm # PARTIAL_BLOCK - -############################################################################### -# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -# Input: A and B (128-bits each, bit-reflected) -# Output: C = A*B*x mod poly, (i.e. >>1 ) -# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input -# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. -############################################################################### -.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 - - vpshufd $0b01001110, \GH, \T2 - vpshufd $0b01001110, \HK, \T3 - vpxor \GH , \T2, \T2 # T2 = (a1+a0) - vpxor \HK , \T3, \T3 # T3 = (b1+b0) - - vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 - vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 - vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) - vpxor \GH, \T2,\T2 - vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 - - vpslldq $8, \T2,\T3 # shift-L T3 2 DWs - vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs - vpxor \T3, \GH, \GH - vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK - - #first phase of the reduction - vpslld $31, \GH, \T2 # packed right shifting << 31 - vpslld $30, \GH, \T3 # packed right shifting shift << 30 - vpslld $25, \GH, \T4 # packed right shifting shift << 25 - - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpsrldq $4, \T2, \T5 # shift-R T5 1 DW - - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \GH, \GH # first phase of the reduction complete - - #second phase of the reduction - - vpsrld $1,\GH, \T2 # packed left shifting >> 1 - vpsrld $2,\GH, \T3 # packed left shifting >> 2 - vpsrld $7,\GH, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpxor \T5, \T2, \T2 - vpxor \T2, \GH, \GH - vpxor \T1, \GH, \GH # the result is in GH - - -.endm - -.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 - - # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vmovdqa \HK, \T5 - - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly - vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_2_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly - vmovdqu \T5, HashKey_3(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_3_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly - vmovdqu \T5, HashKey_4(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_4_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly - vmovdqu \T5, HashKey_5(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_5_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly - vmovdqu \T5, HashKey_6(arg2) - vpshufd $0b01001110, \T5, 
\T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_6_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly - vmovdqu \T5, HashKey_7(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_7_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly - vmovdqu \T5, HashKey_8(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_8_k(arg2) - -.endm - -## if a = number of total plaintext bytes -## b = floor(a/16) -## num_initial_blocks = b mod 4# -## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext -## r10, r11, r12, rax are clobbered -## arg1, arg2, arg3, arg4 are used as pointers only, not modified - -.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC - i = (8-\num_initial_blocks) - setreg - vmovdqu AadHash(arg2), reg_i - - # start AES for num_initial_blocks blocks - vmovdqu CurCount(arg2), \CTR - - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap - i = (i+1) - setreg -.endr - - vmovdqa (arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpxor \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - j = 1 - setreg -.rep \REP - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenc \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - j = (j+1) - setreg -.endr - - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenclast \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vmovdqu (arg4, %r11), \T1 - vpxor \T1, reg_i, reg_i - vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks - add $16, %r11 -.if \ENC_DEC == DEC - vmovdqa \T1, reg_i -.endif - vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations - i = (i+1) - setreg -.endr - - - i = (8-\num_initial_blocks) - j = (9-\num_initial_blocks) - setreg - -.rep \num_initial_blocks - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks - i = (i+1) - j = (j+1) - setreg -.endr - # XMM8 has the combined result here - - vmovdqa \XMM8, TMP1(%rsp) - vmovdqa \XMM8, \T3 - - cmp $128, %r13 - jl .L_initial_blocks_done\@ # no need for precomputed constants - -############################################################################### -# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM1 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM2 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM3 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM4 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM5 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM6 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, 
\CTR # INCR Y0 - vmovdqa \CTR, \XMM7 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM8 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - - vmovdqa (arg1), \T_key - vpxor \T_key, \XMM1, \XMM1 - vpxor \T_key, \XMM2, \XMM2 - vpxor \T_key, \XMM3, \XMM3 - vpxor \T_key, \XMM4, \XMM4 - vpxor \T_key, \XMM5, \XMM5 - vpxor \T_key, \XMM6, \XMM6 - vpxor \T_key, \XMM7, \XMM7 - vpxor \T_key, \XMM8, \XMM8 - - i = 1 - setreg -.rep \REP # do REP rounds - vmovdqa 16*i(arg1), \T_key - vaesenc \T_key, \XMM1, \XMM1 - vaesenc \T_key, \XMM2, \XMM2 - vaesenc \T_key, \XMM3, \XMM3 - vaesenc \T_key, \XMM4, \XMM4 - vaesenc \T_key, \XMM5, \XMM5 - vaesenc \T_key, \XMM6, \XMM6 - vaesenc \T_key, \XMM7, \XMM7 - vaesenc \T_key, \XMM8, \XMM8 - i = (i+1) - setreg -.endr - - vmovdqa 16*i(arg1), \T_key - vaesenclast \T_key, \XMM1, \XMM1 - vaesenclast \T_key, \XMM2, \XMM2 - vaesenclast \T_key, \XMM3, \XMM3 - vaesenclast \T_key, \XMM4, \XMM4 - vaesenclast \T_key, \XMM5, \XMM5 - vaesenclast \T_key, \XMM6, \XMM6 - vaesenclast \T_key, \XMM7, \XMM7 - vaesenclast \T_key, \XMM8, \XMM8 - - vmovdqu (arg4, %r11), \T1 - vpxor \T1, \XMM1, \XMM1 - vmovdqu \XMM1, (arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM1 - .endif - - vmovdqu 16*1(arg4, %r11), \T1 - vpxor \T1, \XMM2, \XMM2 - vmovdqu \XMM2, 16*1(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM2 - .endif - - vmovdqu 16*2(arg4, %r11), \T1 - vpxor \T1, \XMM3, \XMM3 - vmovdqu \XMM3, 16*2(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM3 - .endif - - vmovdqu 16*3(arg4, %r11), \T1 - vpxor \T1, \XMM4, \XMM4 - vmovdqu \XMM4, 16*3(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM4 - .endif - - vmovdqu 16*4(arg4, %r11), \T1 - vpxor \T1, \XMM5, \XMM5 - vmovdqu \XMM5, 16*4(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM5 - .endif - - vmovdqu 16*5(arg4, %r11), \T1 - vpxor \T1, \XMM6, \XMM6 - vmovdqu \XMM6, 16*5(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM6 - .endif - - vmovdqu 16*6(arg4, %r11), \T1 - vpxor \T1, \XMM7, \XMM7 - vmovdqu \XMM7, 16*6(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM7 - .endif - - vmovdqu 16*7(arg4, %r11), \T1 - vpxor \T1, \XMM8, \XMM8 - vmovdqu \XMM8, 16*7(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM8 - .endif - - add $128, %r11 - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - -############################################################################### - -.L_initial_blocks_done\@: - -.endm - -# encrypt 8 blocks at a time -# ghash the 8 previously encrypted ciphertext blocks -# arg1, arg2, arg3, arg4 are used as pointers only, not modified -# r11 is the data offset value -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC - - vmovdqa \XMM1, \T2 - vmovdqa \XMM2, TMP2(%rsp) - vmovdqa \XMM3, TMP3(%rsp) - vmovdqa \XMM4, TMP4(%rsp) - vmovdqa \XMM5, TMP5(%rsp) - vmovdqa \XMM6, TMP6(%rsp) - vmovdqa \XMM7, 
TMP7(%rsp) - vmovdqa \XMM8, TMP8(%rsp) - -.if \loop_idx == in_order - vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONE(%rip), \XMM1, \XMM2 - vpaddd ONE(%rip), \XMM2, \XMM3 - vpaddd ONE(%rip), \XMM3, \XMM4 - vpaddd ONE(%rip), \XMM4, \XMM5 - vpaddd ONE(%rip), \XMM5, \XMM6 - vpaddd ONE(%rip), \XMM6, \XMM7 - vpaddd ONE(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap -.else - vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONEf(%rip), \XMM1, \XMM2 - vpaddd ONEf(%rip), \XMM2, \XMM3 - vpaddd ONEf(%rip), \XMM3, \XMM4 - vpaddd ONEf(%rip), \XMM4, \XMM5 - vpaddd ONEf(%rip), \XMM5, \XMM6 - vpaddd ONEf(%rip), \XMM6, \XMM7 - vpaddd ONEf(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR -.endif - - - ####################################################################### - - vmovdqu (arg1), \T1 - vpxor \T1, \XMM1, \XMM1 - vpxor \T1, \XMM2, \XMM2 - vpxor \T1, \XMM3, \XMM3 - vpxor \T1, \XMM4, \XMM4 - vpxor \T1, \XMM5, \XMM5 - vpxor \T1, \XMM6, \XMM6 - vpxor \T1, \XMM7, \XMM7 - vpxor \T1, \XMM8, \XMM8 - - ####################################################################### - - - - - - vmovdqu 16*1(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqu 16*2(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - ####################################################################### - - vmovdqu HashKey_8(arg2), \T5 - vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 - vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 - - vpshufd $0b01001110, \T2, \T6 - vpxor \T2, \T6, \T6 - - vmovdqu HashKey_8_k(arg2), \T5 - vpclmulqdq $0x00, \T5, \T6, \T6 - - vmovdqu 16*3(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP2(%rsp), \T1 - vmovdqu HashKey_7(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_7_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*4(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - ####################################################################### - - vmovdqa TMP3(%rsp), \T1 - vmovdqu HashKey_6(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - 
vmovdqu HashKey_6_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*5(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP4(%rsp), \T1 - vmovdqu HashKey_5(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_5_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*6(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - vmovdqa TMP5(%rsp), \T1 - vmovdqu HashKey_4(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_4_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*7(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP6(%rsp), \T1 - vmovdqu HashKey_3(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_3_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - - vmovdqu 16*8(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP7(%rsp), \T1 - vmovdqu HashKey_2(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_2_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - ####################################################################### - - vmovdqu 16*9(arg1), \T5 - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 - - vmovdqa TMP8(%rsp), \T1 - vmovdqu HashKey(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vpxor \T4, \T6, \T6 - vpxor \T7, \T6, \T6 - - vmovdqu 16*10(arg1), \T5 - - i = 11 - setreg -.rep (\REP-9) - - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 - - vmovdqu 16*i(arg1), \T5 - i = i + 1 - setreg -.endr - - i = 0 - j = 1 - setreg -.rep 8 - vpxor 16*i(arg4, %r11), \T5, \T2 - .if \ENC_DEC == ENC - vaesenclast \T2, reg_j, reg_j - .else - vaesenclast \T2, reg_j, 
\T3 - vmovdqu 16*i(arg4, %r11), reg_j - vmovdqu \T3, 16*i(arg3, %r11) - .endif - i = (i+1) - j = (j+1) - setreg -.endr - ####################################################################### - - - vpslldq $8, \T6, \T3 # shift-L T3 2 DWs - vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs - vpxor \T3, \T7, \T7 - vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 - - - - ####################################################################### - #first phase of the reduction - ####################################################################### - vpslld $31, \T7, \T2 # packed right shifting << 31 - vpslld $30, \T7, \T3 # packed right shifting shift << 30 - vpslld $25, \T7, \T4 # packed right shifting shift << 25 - - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpsrldq $4, \T2, \T1 # shift-R T1 1 DW - - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - .if \ENC_DEC == ENC - vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer - .endif - - ####################################################################### - #second phase of the reduction - vpsrld $1, \T7, \T2 # packed left shifting >> 1 - vpsrld $2, \T7, \T3 # packed left shifting >> 2 - vpsrld $7, \T7, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpxor \T1, \T2, \T2 - vpxor \T2, \T7, \T7 - vpxor \T7, \T6, \T6 # the result is in T6 - ####################################################################### - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - - - vpxor \T6, \XMM1, \XMM1 - - - -.endm - - -# GHASH the last 4 ciphertext blocks. 
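Both the parallel macro above and the GHASH_LAST_8 macro below compute products in GF(2^128) under GHASH's bit-reflected convention. A plain-C sketch of that multiplication, following NIST SP 800-38D (illustrative only: not constant-time, and it ignores the HashKey<<1-mod-poly pre-multiplication this file uses):

	#include <stdint.h>

	struct gf128 { uint64_t hi, lo; };	/* big-endian halves of a block */

	static void gf128_mul(struct gf128 *x, const struct gf128 *y)
	{
		struct gf128 z = { 0, 0 };
		struct gf128 v = *y;
		int i;

		for (i = 0; i < 128; i++) {
			uint64_t w = (i < 64) ? x->hi : x->lo;

			/* if bit i of x is set (MSB first), accumulate v */
			if ((w >> (63 - (i & 63))) & 1) {
				z.hi ^= v.hi;
				z.lo ^= v.lo;
			}
			/* v = v*x mod x^128 + x^7 + x^2 + x + 1, reflected */
			if (v.lo & 1) {
				v.lo = (v.lo >> 1) | (v.hi << 63);
				v.hi = (v.hi >> 1) ^ 0xe100000000000000ULL;
			} else {
				v.lo = (v.lo >> 1) | (v.hi << 63);
				v.hi >>= 1;
			}
		}
		*x = z;
	}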
-.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 - - ## Karatsuba Method - - - vpshufd $0b01001110, \XMM1, \T2 - vpxor \XMM1, \T2, \T2 - vmovdqu HashKey_8(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM1, \T6 - vpclmulqdq $0x00, \T5, \XMM1, \T7 - - vmovdqu HashKey_8_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM2, \T2 - vpxor \XMM2, \T2, \T2 - vmovdqu HashKey_7(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM2, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM2, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_7_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM3, \T2 - vpxor \XMM3, \T2, \T2 - vmovdqu HashKey_6(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM3, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM3, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_6_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM4, \T2 - vpxor \XMM4, \T2, \T2 - vmovdqu HashKey_5(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM4, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM4, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_5_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM5, \T2 - vpxor \XMM5, \T2, \T2 - vmovdqu HashKey_4(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM5, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM5, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_4_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM6, \T2 - vpxor \XMM6, \T2, \T2 - vmovdqu HashKey_3(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM6, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM6, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_3_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM7, \T2 - vpxor \XMM7, \T2, \T2 - vmovdqu HashKey_2(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM7, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM7, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_2_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM8, \T2 - vpxor \XMM8, \T2, \T2 - vmovdqu HashKey(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM8, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM8, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - vpxor \T6, \XMM1, \XMM1 - vpxor \T7, \XMM1, \T2 - - - - - vpslldq $8, \T2, \T4 - vpsrldq $8, \T2, \T2 - - vpxor \T4, \T7, \T7 - vpxor \T2, \T6, \T6 # <T6:T7> holds the result of - # the accumulated carry-less multiplications - - ####################################################################### - #first phase of the reduction - vpslld $31, \T7, \T2 # packed right shifting << 31 - vpslld $30, \T7, \T3 # packed right shifting shift << 30 - vpslld $25, \T7, \T4 # packed right shifting shift << 25 - - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpsrldq $4, \T2, \T1 # shift-R T1 1 DW - - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - - - #second phase of the reduction - vpsrld $1, \T7, \T2 # packed left shifting >> 1 - 
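Each block in this macro costs three carry-less multiplies instead of four because HashKey_i_k caches the XOR of the key power's halves (Karatsuba); the assembly also defers folding a1*b1 and a0*b0 into the accumulated middle terms until all eight blocks are processed. A scalar sketch of one such step, with clmul64 standing in for a PCLMULQDQ lane (names are illustrative):

	#include <stdint.h>

	/* 64x64 -> 128-bit carry-less multiply, one PCLMULQDQ lane in C. */
	static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
	{
		uint64_t h = 0, l = 0;
		int i;

		for (i = 0; i < 64; i++) {
			if ((b >> i) & 1) {
				l ^= a << i;
				if (i)
					h ^= a >> (64 - i);
			}
		}
		*hi = h;
		*lo = l;
	}

	/* Karatsuba: out = a * b in three multiplies, given k = b1 ^ b0. */
	static void clmul128_karatsuba(const uint64_t a[2], const uint64_t b[2],
				       uint64_t k, uint64_t out[4])
	{
		uint64_t hh[2], ll[2], mm[2];

		clmul64(a[1], b[1], &hh[1], &hh[0]);		/* a1*b1 */
		clmul64(a[0], b[0], &ll[1], &ll[0]);		/* a0*b0 */
		clmul64(a[1] ^ a[0], k, &mm[1], &mm[0]);	/* (a1^a0)*(b1^b0) */
		mm[1] ^= hh[1] ^ ll[1];				/* = a1*b0 ^ a0*b1 */
		mm[0] ^= hh[0] ^ ll[0];

		out[0] = ll[0];
		out[1] = ll[1] ^ mm[0];
		out[2] = hh[0] ^ mm[1];
		out[3] = hh[1];
	}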
vpsrld $2, \T7, \T3 # packed left shifting >> 2 - vpsrld $7, \T7, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpxor \T1, \T2, \T2 - vpxor \T2, \T7, \T7 - vpxor \T7, \T6, \T6 # the result is in T6 - -.endm - -############################################################# -#void aesni_gcm_precomp_avx_gen2 -# (gcm_data *my_ctx_data, -# gcm_context_data *data, -# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ -############################################################# -SYM_FUNC_START(aesni_gcm_init_avx_gen2) - FUNC_SAVE - INIT GHASH_MUL_AVX, PRECOMPUTE_AVX - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_init_avx_gen2) - -############################################################################### -#void aesni_gcm_enc_update_avx_gen2( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ -# const u8 *in, /* Plaintext input */ -# u64 plaintext_len) /* Length of data in Bytes for encryption. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2) - FUNC_SAVE - mov keysize, %eax - cmp $32, %eax - je key_256_enc_update - cmp $16, %eax - je key_128_enc_update - # must be 192 - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11 - FUNC_RESTORE - RET -key_128_enc_update: - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9 - FUNC_RESTORE - RET -key_256_enc_update: - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2) - -############################################################################### -#void aesni_gcm_dec_update_avx_gen2( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ -# const u8 *in, /* Ciphertext input */ -# u64 plaintext_len) /* Length of data in Bytes for encryption. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_dec_update - cmp $16, %eax - je key_128_dec_update - # must be 192 - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11 - FUNC_RESTORE - RET -key_128_dec_update: - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9 - FUNC_RESTORE - RET -key_256_dec_update: - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2) - -############################################################################### -#void aesni_gcm_finalize_avx_gen2( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *auth_tag, /* Authenticated Tag output. 
*/ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_finalize_avx_gen2) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_finalize - cmp $16, %eax - je key_128_finalize - # must be 192 - GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4 - FUNC_RESTORE - RET -key_128_finalize: - GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4 - FUNC_RESTORE - RET -key_256_finalize: - GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) - -############################################################################### -# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -# Input: A and B (128-bits each, bit-reflected) -# Output: C = A*B*x mod poly, (i.e. >>1 ) -# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input -# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. -############################################################################### -.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5 - - vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1 - vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0 - vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0 - vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1 - vpxor \T3, \GH, \GH - - - vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs - vpslldq $8 , \GH, \GH # shift-L GH 2 DWs - - vpxor \T3, \T1, \T1 - vpxor \T2, \GH, \GH - - ####################################################################### - #first phase of the reduction - vmovdqa POLY2(%rip), \T3 - - vpclmulqdq $0x01, \GH, \T3, \T2 - vpslldq $8, \T2, \T2 # shift-L T2 2 DWs - - vpxor \T2, \GH, \GH # first phase of the reduction complete - ####################################################################### - #second phase of the reduction - vpclmulqdq $0x00, \GH, \T3, \T2 - vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) - - vpclmulqdq $0x10, \GH, \T3, \GH - vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts) - - vpxor \T2, \GH, \GH # second phase of the reduction complete - ####################################################################### - vpxor \T1, \GH, \GH # the result is in GH - - -.endm - -.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6 - - # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vmovdqa \HK, \T5 - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly - vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly - vmovdqu \T5, HashKey_3(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly - vmovdqu \T5, HashKey_4(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly - vmovdqu \T5, HashKey_5(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly - vmovdqu \T5, HashKey_6(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly - vmovdqu \T5, HashKey_7(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly - vmovdqu \T5, HashKey_8(arg2) - -.endm - -## if a = number of total plaintext bytes -## b = floor(a/16) -## num_initial_blocks = b mod 4# -## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext -## r10, r11, r12, rax are clobbered -## arg1, arg2, arg3, arg4 are used 
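PRECOMPUTE_AVX2 above stores consecutive powers of the hash key so that eight ciphertext blocks fold in one aggregated GHASH pass, C1*H^8 ^ C2*H^7 ^ ... ^ C8*H. The same precomputation sketched in C, reusing struct gf128 and gf128_mul() from the earlier sketch (the real code additionally keeps every power in the <<1-mod-poly form):

	/* hkey[1..8] = H^1..H^8; index 0 is unused. */
	static void precompute_hash_powers(const struct gf128 *h,
					   struct gf128 hkey[9])
	{
		int i;

		hkey[1] = *h;
		for (i = 2; i <= 8; i++) {
			hkey[i] = hkey[i - 1];
			gf128_mul(&hkey[i], h);	/* H^i = H^(i-1) * H */
		}
	}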
as pointers only, not modified - -.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER - i = (8-\num_initial_blocks) - setreg - vmovdqu AadHash(arg2), reg_i - - # start AES for num_initial_blocks blocks - vmovdqu CurCount(arg2), \CTR - - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap - i = (i+1) - setreg -.endr - - vmovdqa (arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpxor \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - j = 1 - setreg -.rep \REP - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenc \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - j = (j+1) - setreg -.endr - - - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenclast \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vmovdqu (arg4, %r11), \T1 - vpxor \T1, reg_i, reg_i - vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for - # num_initial_blocks blocks - add $16, %r11 -.if \ENC_DEC == DEC - vmovdqa \T1, reg_i -.endif - vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations - i = (i+1) - setreg -.endr - - - i = (8-\num_initial_blocks) - j = (9-\num_initial_blocks) - setreg - -.rep \num_initial_blocks - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks - i = (i+1) - j = (j+1) - setreg -.endr - # XMM8 has the combined result here - - vmovdqa \XMM8, TMP1(%rsp) - vmovdqa \XMM8, \T3 - - cmp $128, %r13 - jl .L_initial_blocks_done\@ # no need for precomputed constants - -############################################################################### -# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM1 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM2 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM3 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM4 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM5 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM6 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM7 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM8 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - - vmovdqa (arg1), \T_key - vpxor \T_key, \XMM1, \XMM1 - vpxor \T_key, \XMM2, \XMM2 - vpxor \T_key, \XMM3, \XMM3 - vpxor \T_key, \XMM4, \XMM4 - vpxor \T_key, \XMM5, \XMM5 - vpxor \T_key, \XMM6, \XMM6 - vpxor \T_key, \XMM7, \XMM7 - vpxor \T_key, \XMM8, \XMM8 - - i = 1 - setreg -.rep \REP # do REP rounds - vmovdqa 16*i(arg1), \T_key - vaesenc \T_key, \XMM1, \XMM1 - vaesenc \T_key, \XMM2, \XMM2 - vaesenc \T_key, \XMM3, \XMM3 - vaesenc \T_key, \XMM4, \XMM4 - vaesenc \T_key, \XMM5, 
\XMM5 - vaesenc \T_key, \XMM6, \XMM6 - vaesenc \T_key, \XMM7, \XMM7 - vaesenc \T_key, \XMM8, \XMM8 - i = (i+1) - setreg -.endr - - - vmovdqa 16*i(arg1), \T_key - vaesenclast \T_key, \XMM1, \XMM1 - vaesenclast \T_key, \XMM2, \XMM2 - vaesenclast \T_key, \XMM3, \XMM3 - vaesenclast \T_key, \XMM4, \XMM4 - vaesenclast \T_key, \XMM5, \XMM5 - vaesenclast \T_key, \XMM6, \XMM6 - vaesenclast \T_key, \XMM7, \XMM7 - vaesenclast \T_key, \XMM8, \XMM8 - - vmovdqu (arg4, %r11), \T1 - vpxor \T1, \XMM1, \XMM1 - vmovdqu \XMM1, (arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM1 - .endif - - vmovdqu 16*1(arg4, %r11), \T1 - vpxor \T1, \XMM2, \XMM2 - vmovdqu \XMM2, 16*1(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM2 - .endif - - vmovdqu 16*2(arg4, %r11), \T1 - vpxor \T1, \XMM3, \XMM3 - vmovdqu \XMM3, 16*2(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM3 - .endif - - vmovdqu 16*3(arg4, %r11), \T1 - vpxor \T1, \XMM4, \XMM4 - vmovdqu \XMM4, 16*3(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM4 - .endif - - vmovdqu 16*4(arg4, %r11), \T1 - vpxor \T1, \XMM5, \XMM5 - vmovdqu \XMM5, 16*4(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM5 - .endif - - vmovdqu 16*5(arg4, %r11), \T1 - vpxor \T1, \XMM6, \XMM6 - vmovdqu \XMM6, 16*5(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM6 - .endif - - vmovdqu 16*6(arg4, %r11), \T1 - vpxor \T1, \XMM7, \XMM7 - vmovdqu \XMM7, 16*6(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM7 - .endif - - vmovdqu 16*7(arg4, %r11), \T1 - vpxor \T1, \XMM8, \XMM8 - vmovdqu \XMM8, 16*7(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM8 - .endif - - add $128, %r11 - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with - # the corresponding ciphertext - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - -############################################################################### - -.L_initial_blocks_done\@: - - -.endm - - - -# encrypt 8 blocks at a time -# ghash the 8 previously encrypted ciphertext blocks -# arg1, arg2, arg3, arg4 are used as pointers only, not modified -# r11 is the data offset value -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC - - vmovdqa \XMM1, \T2 - vmovdqa \XMM2, TMP2(%rsp) - vmovdqa \XMM3, TMP3(%rsp) - vmovdqa \XMM4, TMP4(%rsp) - vmovdqa \XMM5, TMP5(%rsp) - vmovdqa \XMM6, TMP6(%rsp) - vmovdqa \XMM7, TMP7(%rsp) - vmovdqa \XMM8, TMP8(%rsp) - -.if \loop_idx == in_order - vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONE(%rip), \XMM1, \XMM2 - vpaddd ONE(%rip), \XMM2, \XMM3 - vpaddd ONE(%rip), \XMM3, \XMM4 - vpaddd ONE(%rip), \XMM4, \XMM5 - vpaddd ONE(%rip), \XMM5, \XMM6 - vpaddd ONE(%rip), \XMM6, \XMM7 - vpaddd ONE(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb 
SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap -.else - vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONEf(%rip), \XMM1, \XMM2 - vpaddd ONEf(%rip), \XMM2, \XMM3 - vpaddd ONEf(%rip), \XMM3, \XMM4 - vpaddd ONEf(%rip), \XMM4, \XMM5 - vpaddd ONEf(%rip), \XMM5, \XMM6 - vpaddd ONEf(%rip), \XMM6, \XMM7 - vpaddd ONEf(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR -.endif - - - ####################################################################### - - vmovdqu (arg1), \T1 - vpxor \T1, \XMM1, \XMM1 - vpxor \T1, \XMM2, \XMM2 - vpxor \T1, \XMM3, \XMM3 - vpxor \T1, \XMM4, \XMM4 - vpxor \T1, \XMM5, \XMM5 - vpxor \T1, \XMM6, \XMM6 - vpxor \T1, \XMM7, \XMM7 - vpxor \T1, \XMM8, \XMM8 - - ####################################################################### - - - - - - vmovdqu 16*1(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqu 16*2(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - ####################################################################### - - vmovdqu HashKey_8(arg2), \T5 - vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 - vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 - vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 - vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 - vpxor \T5, \T6, \T6 - - vmovdqu 16*3(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP2(%rsp), \T1 - vmovdqu HashKey_7(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*4(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - ####################################################################### - - vmovdqa TMP3(%rsp), \T1 - vmovdqu HashKey_6(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*5(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP4(%rsp), \T1 - vmovdqu HashKey_5(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*6(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, 
\XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - vmovdqa TMP5(%rsp), \T1 - vmovdqu HashKey_4(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*7(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP6(%rsp), \T1 - vmovdqu HashKey_3(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*8(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP7(%rsp), \T1 - vmovdqu HashKey_2(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - - ####################################################################### - - vmovdqu 16*9(arg1), \T5 - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 - - vmovdqa TMP8(%rsp), \T1 - vmovdqu HashKey(arg2), \T5 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T1 - - - vmovdqu 16*10(arg1), \T5 - - i = 11 - setreg -.rep (\REP-9) - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 - - vmovdqu 16*i(arg1), \T5 - i = i + 1 - setreg -.endr - - i = 0 - j = 1 - setreg -.rep 8 - vpxor 16*i(arg4, %r11), \T5, \T2 - .if \ENC_DEC == ENC - vaesenclast \T2, reg_j, reg_j - .else - vaesenclast \T2, reg_j, \T3 - vmovdqu 16*i(arg4, %r11), reg_j - vmovdqu \T3, 16*i(arg3, %r11) - .endif - i = (i+1) - j = (j+1) - setreg -.endr - ####################################################################### - - - vpslldq $8, \T6, \T3 # shift-L T3 2 DWs - vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs - vpxor \T3, \T7, \T7 - vpxor \T6, \T1, \T1 # accumulate the results in T1:T7 - - - - ####################################################################### - #first phase of the reduction - vmovdqa POLY2(%rip), \T3 - - vpclmulqdq $0x01, \T7, \T3, \T2 - vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs - - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - .if \ENC_DEC == ENC - vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer - 
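The .if \ENC_DEC == ENC / .else split around vaesenclast exists because GHASH always runs over the ciphertext: on decrypt, the loaded input block has to survive as the next GHASH operand after the plaintext is stored. The same logic as a scalar model (hypothetical helper; buffers are assumed distinct, which the assembly guarantees by saving the input first):

	#include <stdbool.h>
	#include <stdint.h>
	#include <string.h>

	static void ctr_block(bool enc, const uint8_t in[16], uint8_t out[16],
			      const uint8_t keystream[16], uint8_t ghash_in[16])
	{
		int i;

		for (i = 0; i < 16; i++)
			out[i] = in[i] ^ keystream[i];
		memcpy(ghash_in, enc ? out : in, 16);	/* ciphertext feeds GHASH */
	}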
vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer - .endif - - ####################################################################### - #second phase of the reduction - vpclmulqdq $0x00, \T7, \T3, \T2 - vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) - - vpclmulqdq $0x10, \T7, \T3, \T4 - vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) - - vpxor \T2, \T4, \T4 # second phase of the reduction complete - ####################################################################### - vpxor \T4, \T1, \T1 # the result is in T1 - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - - - vpxor \T1, \XMM1, \XMM1 - - - -.endm - - -# GHASH the last 4 ciphertext blocks. -.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 - - ## Karatsuba Method - - vmovdqu HashKey_8(arg2), \T5 - - vpshufd $0b01001110, \XMM1, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM1, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM1, \T6 - vpclmulqdq $0x00, \T5, \XMM1, \T7 - - vpclmulqdq $0x00, \T3, \T2, \XMM1 - - ###################### - - vmovdqu HashKey_7(arg2), \T5 - vpshufd $0b01001110, \XMM2, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM2, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM2, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM2, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_6(arg2), \T5 - vpshufd $0b01001110, \XMM3, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM3, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM3, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM3, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_5(arg2), \T5 - vpshufd $0b01001110, \XMM4, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM4, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM4, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM4, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_4(arg2), \T5 - vpshufd $0b01001110, \XMM5, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM5, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM5, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM5, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_3(arg2), \T5 - vpshufd $0b01001110, \XMM6, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM6, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM6, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM6, \T4 - vpxor \T4, 
\T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_2(arg2), \T5 - vpshufd $0b01001110, \XMM7, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM7, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM7, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM7, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey(arg2), \T5 - vpshufd $0b01001110, \XMM8, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM8, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM8, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM8, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - vpxor \T6, \XMM1, \XMM1 - vpxor \T7, \XMM1, \T2 - - - - - vpslldq $8, \T2, \T4 - vpsrldq $8, \T2, \T2 - - vpxor \T4, \T7, \T7 - vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the - # accumulated carry-less multiplications - - ####################################################################### - #first phase of the reduction - vmovdqa POLY2(%rip), \T3 - - vpclmulqdq $0x01, \T7, \T3, \T2 - vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs - - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - - - #second phase of the reduction - vpclmulqdq $0x00, \T7, \T3, \T2 - vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) - - vpclmulqdq $0x10, \T7, \T3, \T4 - vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) - - vpxor \T2, \T4, \T4 # second phase of the reduction complete - ####################################################################### - vpxor \T4, \T6, \T6 # the result is in T6 -.endm - - - -############################################################# -#void aesni_gcm_init_avx_gen4 -# (gcm_data *my_ctx_data, -# gcm_context_data *data, -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ -############################################################# -SYM_FUNC_START(aesni_gcm_init_avx_gen4) - FUNC_SAVE - INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_init_avx_gen4) - -############################################################################### -#void aesni_gcm_enc_avx_gen4( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ -# const u8 *in, /* Plaintext input */ -# u64 plaintext_len) /* Length of data in Bytes for encryption. 
*/ -############################################################################### -SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_enc_update4 - cmp $16, %eax - je key_128_enc_update4 - # must be 192 - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 - FUNC_RESTORE - RET -key_128_enc_update4: - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9 - FUNC_RESTORE - RET -key_256_enc_update4: - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4) - -############################################################################### -#void aesni_gcm_dec_update_avx_gen4( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ -# const u8 *in, /* Ciphertext input */ -# u64 plaintext_len) /* Length of data in Bytes for encryption. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_dec_update4 - cmp $16, %eax - je key_128_dec_update4 - # must be 192 - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 - FUNC_RESTORE - RET -key_128_dec_update4: - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9 - FUNC_RESTORE - RET -key_256_dec_update4: - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4) - -############################################################################### -#void aesni_gcm_finalize_avx_gen4( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *auth_tag, /* Authenticated Tag output. */ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_finalize_avx_gen4) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_finalize4 - cmp $16, %eax - je key_128_finalize4 - # must be 192 - GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4 - FUNC_RESTORE - RET -key_128_finalize4: - GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4 - FUNC_RESTORE - RET -key_256_finalize4: - GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_finalize_avx_gen4) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index b1d90c25975a..061b1ced93c5 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * Support for Intel AES-NI instructions. This file contains glue - * code, the real AES implementation is in intel-aes_asm.S. + * Support for AES-NI and VAES instructions. This file contains glue code. + * The real AES implementations are in aesni-intel_asm.S and other .S files. * * Copyright (C) 2008, Intel Corp. * Author: Huang Ying <ying.huang@intel.com> @@ -13,6 +13,8 @@ * Tadeusz Struk (tadeusz.struk@intel.com) * Aidan O'Mahony (aidan.o.mahony@intel.com) * Copyright (c) 2010, Intel Corporation. 
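The keysize dispatch in the entry points above (cmp $32, cmp $16, else 192) picks the REP argument of GCM_ENC_DEC/GCM_COMPLETE: the number of middle AES rounds, one less than the round total for each key size. In C terms (hypothetical helper):

	static int aes_middle_rounds(unsigned int key_len)
	{
		switch (key_len) {
		case 16: return 9;	/* AES-128: 10 rounds total */
		case 24: return 11;	/* AES-192: 12 rounds total */
		case 32: return 13;	/* AES-256: 14 rounds total */
		}
		return -1;
	}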
+ * + * Copyright 2024 Google LLC */ #include <linux/hardirq.h> @@ -21,7 +23,6 @@ #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/aes.h> -#include <crypto/ctr.h> #include <crypto/b128ops.h> #include <crypto/gcm.h> #include <crypto/xts.h> @@ -40,46 +41,15 @@ #define AESNI_ALIGN 16 #define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN))) #define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1)) -#define RFC4106_HASH_SUBKEY_SIZE 16 #define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1)) #define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA) #define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA) -/* This data is stored at the end of the crypto_tfm struct. - * It's a type of per "session" data storage location. - * This needs to be 16 byte aligned. - */ -struct aesni_rfc4106_gcm_ctx { - u8 hash_subkey[16] AESNI_ALIGN_ATTR; - struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; - u8 nonce[4]; -}; - -struct generic_gcmaes_ctx { - u8 hash_subkey[16] AESNI_ALIGN_ATTR; - struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; -}; - struct aesni_xts_ctx { struct crypto_aes_ctx tweak_ctx AESNI_ALIGN_ATTR; struct crypto_aes_ctx crypt_ctx AESNI_ALIGN_ATTR; }; -#define GCM_BLOCK_LEN 16 - -struct gcm_context_data { - /* init, update and finalize context data */ - u8 aad_hash[GCM_BLOCK_LEN]; - u64 aad_length; - u64 in_length; - u8 partial_block_enc_key[GCM_BLOCK_LEN]; - u8 orig_IV[GCM_BLOCK_LEN]; - u8 current_counter[GCM_BLOCK_LEN]; - u64 partial_block_len; - u64 unused; - u8 hash_keys[GCM_BLOCK_LEN * 16]; -}; - static inline void *aes_align_addr(void *addr) { if (crypto_tfm_ctx_alignment() >= AESNI_ALIGN) @@ -87,8 +57,8 @@ static inline void *aes_align_addr(void *addr) return PTR_ALIGN(addr, AESNI_ALIGN); } -asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, - unsigned int key_len); +asmlinkage void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, + unsigned int key_len); asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in); asmlinkage void aesni_dec(const void *ctx, u8 *out, const u8 *in); asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out, @@ -104,118 +74,15 @@ asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); -#define AVX_GEN2_OPTSIZE 640 -#define AVX_GEN4_OPTSIZE 4096 - -asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, unsigned int len, u8 *iv); +asmlinkage void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); -asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, unsigned int len, u8 *iv); +asmlinkage void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); #ifdef CONFIG_X86_64 - asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); -DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc); - -/* Scatter / Gather routines, with args similar to above */ -asmlinkage void aesni_gcm_init(void *ctx, - struct gcm_context_data *gdata, - u8 *iv, - u8 *hash_subkey, const u8 *aad, - unsigned long aad_len); -asmlinkage void aesni_gcm_enc_update(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len); -asmlinkage void aesni_gcm_dec_update(void *ctx, - struct 
gcm_context_data *gdata, u8 *out, - const u8 *in, - unsigned long ciphertext_len); -asmlinkage void aesni_gcm_finalize(void *ctx, - struct gcm_context_data *gdata, - u8 *auth_tag, unsigned long auth_tag_len); - -asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, - void *keys, u8 *out, unsigned int num_bytes); -asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, - void *keys, u8 *out, unsigned int num_bytes); -asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, - void *keys, u8 *out, unsigned int num_bytes); - - -asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, - const void *keys, u8 *out, unsigned int num_bytes, - unsigned int byte_ctr); - -asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, - const void *keys, u8 *out, unsigned int num_bytes, - unsigned int byte_ctr); - -asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, - const void *keys, u8 *out, unsigned int num_bytes, - unsigned int byte_ctr); - -/* - * asmlinkage void aesni_gcm_init_avx_gen2() - * gcm_data *my_ctx_data, context data - * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. - */ -asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data, - struct gcm_context_data *gdata, - u8 *iv, - u8 *hash_subkey, - const u8 *aad, - unsigned long aad_len); - -asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len); -asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, - unsigned long ciphertext_len); -asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx, - struct gcm_context_data *gdata, - u8 *auth_tag, unsigned long auth_tag_len); - -/* - * asmlinkage void aesni_gcm_init_avx_gen4() - * gcm_data *my_ctx_data, context data - * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. 
- */ -asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data, - struct gcm_context_data *gdata, - u8 *iv, - u8 *hash_subkey, - const u8 *aad, - unsigned long aad_len); - -asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len); -asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, - unsigned long ciphertext_len); -asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx, - struct gcm_context_data *gdata, - u8 *auth_tag, unsigned long auth_tag_len); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx2); - -static inline struct -aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) -{ - return aes_align_addr(crypto_aead_ctx(tfm)); -} - -static inline struct -generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm) -{ - return aes_align_addr(crypto_aead_ctx(tfm)); -} #endif static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) @@ -233,19 +100,17 @@ static int aes_set_key_common(struct crypto_aes_ctx *ctx, { int err; - if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256) - return -EINVAL; - if (!crypto_simd_usable()) - err = aes_expandkey(ctx, in_key, key_len); - else { - kernel_fpu_begin(); - err = aesni_set_key(ctx, in_key, key_len); - kernel_fpu_end(); - } + return aes_expandkey(ctx, in_key, key_len); - return err; + err = aes_check_keylen(key_len); + if (err) + return err; + + kernel_fpu_begin(); + aesni_set_key(ctx, in_key, key_len); + kernel_fpu_end(); + return 0; } static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, @@ -488,24 +353,8 @@ static int cts_cbc_decrypt(struct skcipher_request *req) } #ifdef CONFIG_X86_64 -static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, unsigned int len, u8 *iv) -{ - /* - * based on key length, override with the by8 version - * of ctr mode encryption/decryption for improved performance - * aes_set_key_common() ensures that key length is one of - * {128,192,256} - */ - if (ctx->key_length == AES_KEYSIZE_128) - aes_ctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len); - else if (ctx->key_length == AES_KEYSIZE_192) - aes_ctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len); - else - aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len); -} - -static int ctr_crypt(struct skcipher_request *req) +/* This is the non-AVX version. 
*/ +static int ctr_crypt_aesni(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); @@ -519,10 +368,9 @@ static int ctr_crypt(struct skcipher_request *req) while ((nbytes = walk.nbytes) > 0) { kernel_fpu_begin(); if (nbytes & AES_BLOCK_MASK) - static_call(aesni_ctr_enc_tfm)(ctx, walk.dst.virt.addr, - walk.src.virt.addr, - nbytes & AES_BLOCK_MASK, - walk.iv); + aesni_ctr_enc(ctx, walk.dst.virt.addr, + walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, walk.iv); nbytes &= ~AES_BLOCK_MASK; if (walk.nbytes == walk.total && nbytes > 0) { @@ -538,346 +386,9 @@ static int ctr_crypt(struct skcipher_request *req) } return err; } - -static void aesni_xctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, unsigned int len, u8 *iv, - unsigned int byte_ctr) -{ - if (ctx->key_length == AES_KEYSIZE_128) - aes_xctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len, - byte_ctr); - else if (ctx->key_length == AES_KEYSIZE_192) - aes_xctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len, - byte_ctr); - else - aes_xctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len, - byte_ctr); -} - -static int xctr_crypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); - u8 keystream[AES_BLOCK_SIZE]; - struct skcipher_walk walk; - unsigned int nbytes; - unsigned int byte_ctr = 0; - int err; - __le32 block[AES_BLOCK_SIZE / sizeof(__le32)]; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes) > 0) { - kernel_fpu_begin(); - if (nbytes & AES_BLOCK_MASK) - aesni_xctr_enc_avx_tfm(ctx, walk.dst.virt.addr, - walk.src.virt.addr, nbytes & AES_BLOCK_MASK, - walk.iv, byte_ctr); - nbytes &= ~AES_BLOCK_MASK; - byte_ctr += walk.nbytes - nbytes; - - if (walk.nbytes == walk.total && nbytes > 0) { - memcpy(block, walk.iv, AES_BLOCK_SIZE); - block[0] ^= cpu_to_le32(1 + byte_ctr / AES_BLOCK_SIZE); - aesni_enc(ctx, keystream, (u8 *)block); - crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - - nbytes, walk.src.virt.addr + walk.nbytes - - nbytes, keystream, nbytes); - byte_ctr += nbytes; - nbytes = 0; - } - kernel_fpu_end(); - err = skcipher_walk_done(&walk, nbytes); - } - return err; -} - -static int -rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) -{ - struct crypto_aes_ctx ctx; - int ret; - - ret = aes_expandkey(&ctx, key, key_len); - if (ret) - return ret; - - /* Clear the data in the hash sub key container to zero.*/ - /* We want to cipher all zeros to create the hash sub key. */ - memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE); - - aes_encrypt(&ctx, hash_subkey, hash_subkey); - - memzero_explicit(&ctx, sizeof(ctx)); - return 0; -} - -static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, - unsigned int key_len) -{ - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead); - - if (key_len < 4) - return -EINVAL; - - /*Account for 4 byte nonce at the end.*/ - key_len -= 4; - - memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce)); - - return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?: - rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); -} - -/* This is the Integrity Check Value (aka the authentication tag) length and can - * be 8, 12 or 16 bytes long. 
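The removed xctr_crypt() above shows how XCTR differs from CTR: the block counter is one-based, little-endian, and XOR-ed into the IV rather than carried in it (the block[0] ^= cpu_to_le32(...) line). A sketch of that per-block input derivation (hypothetical helper name):

	#include <stdint.h>
	#include <string.h>

	static void xctr_block_input(uint8_t blk[16], const uint8_t iv[16],
				     uint64_t blk_idx)	/* zero-based */
	{
		uint32_t ctr = (uint32_t)(blk_idx + 1);	/* one-based counter */

		memcpy(blk, iv, 16);
		blk[0] ^= (uint8_t)ctr;			/* little-endian XOR */
		blk[1] ^= (uint8_t)(ctr >> 8);
		blk[2] ^= (uint8_t)(ctr >> 16);
		blk[3] ^= (uint8_t)(ctr >> 24);
	}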
*/ -static int common_rfc4106_set_authsize(struct crypto_aead *aead, - unsigned int authsize) -{ - switch (authsize) { - case 8: - case 12: - case 16: - break; - default: - return -EINVAL; - } - - return 0; -} - -static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - switch (authsize) { - case 4: - case 8: - case 12: - case 13: - case 14: - case 15: - case 16: - break; - default: - return -EINVAL; - } - - return 0; -} - -static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, - unsigned int assoclen, u8 *hash_subkey, - u8 *iv, void *aes_ctx, u8 *auth_tag, - unsigned long auth_tag_len) -{ - u8 databuf[sizeof(struct gcm_context_data) + (AESNI_ALIGN - 8)] __aligned(8); - struct gcm_context_data *data = PTR_ALIGN((void *)databuf, AESNI_ALIGN); - unsigned long left = req->cryptlen; - struct scatter_walk assoc_sg_walk; - struct skcipher_walk walk; - bool do_avx, do_avx2; - u8 *assocmem = NULL; - u8 *assoc; - int err; - - if (!enc) - left -= auth_tag_len; - - do_avx = (left >= AVX_GEN2_OPTSIZE); - do_avx2 = (left >= AVX_GEN4_OPTSIZE); - - /* Linearize assoc, if not already linear */ - if (req->src->length >= assoclen && req->src->length) { - scatterwalk_start(&assoc_sg_walk, req->src); - assoc = scatterwalk_map(&assoc_sg_walk); - } else { - gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? - GFP_KERNEL : GFP_ATOMIC; - - /* assoc can be any length, so must be on heap */ - assocmem = kmalloc(assoclen, flags); - if (unlikely(!assocmem)) - return -ENOMEM; - assoc = assocmem; - - scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0); - } - - kernel_fpu_begin(); - if (static_branch_likely(&gcm_use_avx2) && do_avx2) - aesni_gcm_init_avx_gen4(aes_ctx, data, iv, hash_subkey, assoc, - assoclen); - else if (static_branch_likely(&gcm_use_avx) && do_avx) - aesni_gcm_init_avx_gen2(aes_ctx, data, iv, hash_subkey, assoc, - assoclen); - else - aesni_gcm_init(aes_ctx, data, iv, hash_subkey, assoc, assoclen); - kernel_fpu_end(); - - if (!assocmem) - scatterwalk_unmap(assoc); - else - kfree(assocmem); - - err = enc ? 
skcipher_walk_aead_encrypt(&walk, req, false) - : skcipher_walk_aead_decrypt(&walk, req, false); - - while (walk.nbytes > 0) { - kernel_fpu_begin(); - if (static_branch_likely(&gcm_use_avx2) && do_avx2) { - if (enc) - aesni_gcm_enc_update_avx_gen4(aes_ctx, data, - walk.dst.virt.addr, - walk.src.virt.addr, - walk.nbytes); - else - aesni_gcm_dec_update_avx_gen4(aes_ctx, data, - walk.dst.virt.addr, - walk.src.virt.addr, - walk.nbytes); - } else if (static_branch_likely(&gcm_use_avx) && do_avx) { - if (enc) - aesni_gcm_enc_update_avx_gen2(aes_ctx, data, - walk.dst.virt.addr, - walk.src.virt.addr, - walk.nbytes); - else - aesni_gcm_dec_update_avx_gen2(aes_ctx, data, - walk.dst.virt.addr, - walk.src.virt.addr, - walk.nbytes); - } else if (enc) { - aesni_gcm_enc_update(aes_ctx, data, walk.dst.virt.addr, - walk.src.virt.addr, walk.nbytes); - } else { - aesni_gcm_dec_update(aes_ctx, data, walk.dst.virt.addr, - walk.src.virt.addr, walk.nbytes); - } - kernel_fpu_end(); - - err = skcipher_walk_done(&walk, 0); - } - - if (err) - return err; - - kernel_fpu_begin(); - if (static_branch_likely(&gcm_use_avx2) && do_avx2) - aesni_gcm_finalize_avx_gen4(aes_ctx, data, auth_tag, - auth_tag_len); - else if (static_branch_likely(&gcm_use_avx) && do_avx) - aesni_gcm_finalize_avx_gen2(aes_ctx, data, auth_tag, - auth_tag_len); - else - aesni_gcm_finalize(aes_ctx, data, auth_tag, auth_tag_len); - kernel_fpu_end(); - - return 0; -} - -static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, - u8 *hash_subkey, u8 *iv, void *aes_ctx) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 auth_tag[16]; - int err; - - err = gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, aes_ctx, - auth_tag, auth_tag_len); - if (err) - return err; - - scatterwalk_map_and_copy(auth_tag, req->dst, - req->assoclen + req->cryptlen, - auth_tag_len, 1); - return 0; -} - -static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, - u8 *hash_subkey, u8 *iv, void *aes_ctx) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 auth_tag_msg[16]; - u8 auth_tag[16]; - int err; - - err = gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, aes_ctx, - auth_tag, auth_tag_len); - if (err) - return err; - - /* Copy out original auth_tag */ - scatterwalk_map_and_copy(auth_tag_msg, req->src, - req->assoclen + req->cryptlen - auth_tag_len, - auth_tag_len, 0); - - /* Compare generated tag with passed in tag. 
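The decrypt path just below compares the computed tag against the transmitted one with crypto_memneq(), the kernel's constant-time inequality test, and wipes the computed tag on mismatch. The property that matters, sketched (ct_memneq is an illustrative stand-in, not the kernel implementation):

	#include <stddef.h>
	#include <stdint.h>

	static int ct_memneq(const uint8_t *a, const uint8_t *b, size_t n)
	{
		uint8_t diff = 0;

		while (n--)
			diff |= *a++ ^ *b++;	/* no data-dependent early exit */
		return diff != 0;
	}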
*/ - if (crypto_memneq(auth_tag_msg, auth_tag, auth_tag_len)) { - memzero_explicit(auth_tag, sizeof(auth_tag)); - return -EBADMSG; - } - return 0; -} - -static int helper_rfc4106_encrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); - unsigned int i; - __be32 counter = cpu_to_be32(1); - - /* Assuming we are supporting rfc4106 64-bit extended */ - /* sequence numbers We need to have the AAD length equal */ - /* to 16 or 20 bytes */ - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) - return -EINVAL; - - /* IV below built */ - for (i = 0; i < 4; i++) - *(iv+i) = ctx->nonce[i]; - for (i = 0; i < 8; i++) - *(iv+4+i) = req->iv[i]; - *((__be32 *)(iv+12)) = counter; - - return gcmaes_encrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, - aes_ctx); -} - -static int helper_rfc4106_decrypt(struct aead_request *req) -{ - __be32 counter = cpu_to_be32(1); - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); - unsigned int i; - - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) - return -EINVAL; - - /* Assuming we are supporting rfc4106 64-bit extended */ - /* sequence numbers We need to have the AAD length */ - /* equal to 16 or 20 bytes */ - - /* IV below built */ - for (i = 0; i < 4; i++) - *(iv+i) = ctx->nonce[i]; - for (i = 0; i < 8; i++) - *(iv+4+i) = req->iv[i]; - *((__be32 *)(iv+12)) = counter; - - return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, - aes_ctx); -} #endif -static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, +static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); @@ -898,108 +409,139 @@ static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, return aes_set_key_common(&ctx->tweak_ctx, key + keylen, keylen); } -static int xts_crypt(struct skcipher_request *req, bool encrypt) +typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key, + u8 iv[AES_BLOCK_SIZE]); +typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key, + const u8 *src, u8 *dst, int len, + u8 tweak[AES_BLOCK_SIZE]); + +/* This handles cases where the source and/or destination span pages. */ +static noinline int +xts_crypt_slowpath(struct skcipher_request *req, xts_crypt_func crypt_func) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); + const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); int tail = req->cryptlen % AES_BLOCK_SIZE; + struct scatterlist sg_src[2], sg_dst[2]; struct skcipher_request subreq; struct skcipher_walk walk; + struct scatterlist *src, *dst; int err; - if (req->cryptlen < AES_BLOCK_SIZE) - return -EINVAL; - - err = skcipher_walk_virt(&walk, req, false); - if (!walk.nbytes) - return err; - - if (unlikely(tail > 0 && walk.nbytes < walk.total)) { - int blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2; - - skcipher_walk_abort(&walk); - + /* + * If the message length isn't divisible by the AES block size, then + * separate off the last full block and the partial block. 
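The byte loops in the removed rfc4106 helpers above assemble the pre-counter block described in the function headers: a 4-byte salt from the key, the 8-byte per-request IV, then the 32-bit big-endian constant 1. Condensed into one hypothetical helper:

	#include <stdint.h>
	#include <string.h>

	static void rfc4106_build_iv(uint8_t iv[16], const uint8_t nonce[4],
				     const uint8_t req_iv[8])
	{
		memcpy(iv, nonce, 4);		/* salt stored with the key */
		memcpy(iv + 4, req_iv, 8);	/* explicit IV from the request */
		iv[12] = 0;			/* initial counter = be32(1) */
		iv[13] = 0;
		iv[14] = 0;
		iv[15] = 1;
	}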
This ensures + * that they are processed in the same call to the assembly function, + * which is required for ciphertext stealing. + */ + if (tail) { skcipher_request_set_tfm(&subreq, tfm); skcipher_request_set_callback(&subreq, skcipher_request_flags(req), NULL, NULL); skcipher_request_set_crypt(&subreq, req->src, req->dst, - blocks * AES_BLOCK_SIZE, req->iv); + req->cryptlen - tail - AES_BLOCK_SIZE, + req->iv); req = &subreq; - - err = skcipher_walk_virt(&walk, req, false); - if (!walk.nbytes) - return err; - } else { - tail = 0; } - kernel_fpu_begin(); + err = skcipher_walk_virt(&walk, req, false); - /* calculate first value of T */ - aesni_enc(&ctx->tweak_ctx, walk.iv, walk.iv); + while (walk.nbytes) { + kernel_fpu_begin(); + (*crypt_func)(&ctx->crypt_ctx, + walk.src.virt.addr, walk.dst.virt.addr, + walk.nbytes & ~(AES_BLOCK_SIZE - 1), req->iv); + kernel_fpu_end(); + err = skcipher_walk_done(&walk, + walk.nbytes & (AES_BLOCK_SIZE - 1)); + } - while (walk.nbytes > 0) { - int nbytes = walk.nbytes; + if (err || !tail) + return err; - if (nbytes < walk.total) - nbytes &= ~(AES_BLOCK_SIZE - 1); + /* Do ciphertext stealing with the last full block and partial block. */ - if (encrypt) - aesni_xts_encrypt(&ctx->crypt_ctx, - walk.dst.virt.addr, walk.src.virt.addr, - nbytes, walk.iv); - else - aesni_xts_decrypt(&ctx->crypt_ctx, - walk.dst.virt.addr, walk.src.virt.addr, - nbytes, walk.iv); - kernel_fpu_end(); + dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, + req->iv); - if (walk.nbytes > 0) - kernel_fpu_begin(); - } + err = skcipher_walk_virt(&walk, req, false); + if (err) + return err; - if (unlikely(tail > 0 && !err)) { - struct scatterlist sg_src[2], sg_dst[2]; - struct scatterlist *src, *dst; + kernel_fpu_begin(); + (*crypt_func)(&ctx->crypt_ctx, walk.src.virt.addr, walk.dst.virt.addr, + walk.nbytes, req->iv); + kernel_fpu_end(); - dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); - if (req->dst != req->src) - dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); + return skcipher_walk_done(&walk, 0); +} - skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, - req->iv); +/* __always_inline to avoid indirect call in fastpath */ +static __always_inline int +xts_crypt(struct skcipher_request *req, xts_encrypt_iv_func encrypt_iv, + xts_crypt_func crypt_func) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); - err = skcipher_walk_virt(&walk, &subreq, false); - if (err) - return err; + if (unlikely(req->cryptlen < AES_BLOCK_SIZE)) + return -EINVAL; - kernel_fpu_begin(); - if (encrypt) - aesni_xts_encrypt(&ctx->crypt_ctx, - walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes, walk.iv); - else - aesni_xts_decrypt(&ctx->crypt_ctx, - walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes, walk.iv); - kernel_fpu_end(); + kernel_fpu_begin(); + (*encrypt_iv)(&ctx->tweak_ctx, req->iv); - err = skcipher_walk_done(&walk, 0); + /* + * In practice, virtually all XTS plaintexts and ciphertexts are either + * 512 or 4096 bytes and do not use multiple scatterlist elements. To + * optimize the performance of these cases, the below fast-path handles + * single-scatterlist-element messages as efficiently as possible. The + * code is 64-bit specific, as it assumes no page mapping is needed. 
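The fast-path test that follows can be mirrored in a stand-alone sketch (simplified stand-in for struct scatterlist, illustration only):

    #include <stdbool.h>

    struct sg_stub {
            unsigned int length;    /* bytes covered by the first element */
    };

    /*
     * Sketch: if the first scatterlist element of both source and
     * destination already covers the whole message, the data is virtually
     * contiguous and one direct call into the assembly suffices; otherwise
     * the request drops to the page-walking slow path.
     */
    static bool xts_fastpath_ok(const struct sg_stub *src,
                                const struct sg_stub *dst,
                                unsigned int cryptlen)
    {
            return src->length >= cryptlen && dst->length >= cryptlen;
    }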
+ */ + if (IS_ENABLED(CONFIG_X86_64) && + likely(req->src->length >= req->cryptlen && + req->dst->length >= req->cryptlen)) { + (*crypt_func)(&ctx->crypt_ctx, sg_virt(req->src), + sg_virt(req->dst), req->cryptlen, req->iv); + kernel_fpu_end(); + return 0; } - return err; + kernel_fpu_end(); + return xts_crypt_slowpath(req, crypt_func); +} + +static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, + u8 iv[AES_BLOCK_SIZE]) +{ + aesni_enc(tweak_key, iv, iv); +} + +static void aesni_xts_encrypt(const struct crypto_aes_ctx *key, + const u8 *src, u8 *dst, int len, + u8 tweak[AES_BLOCK_SIZE]) +{ + aesni_xts_enc(key, dst, src, len, tweak); +} + +static void aesni_xts_decrypt(const struct crypto_aes_ctx *key, + const u8 *src, u8 *dst, int len, + u8 tweak[AES_BLOCK_SIZE]) +{ + aesni_xts_dec(key, dst, src, len, tweak); } -static int xts_encrypt(struct skcipher_request *req) +static int xts_encrypt_aesni(struct skcipher_request *req) { - return xts_crypt(req, true); + return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_encrypt); } -static int xts_decrypt(struct skcipher_request *req) +static int xts_decrypt_aesni(struct skcipher_request *req) { - return xts_crypt(req, false); + return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_decrypt); } static struct crypto_alg aesni_cipher_alg = { @@ -1024,10 +566,9 @@ static struct crypto_alg aesni_cipher_alg = { static struct skcipher_alg aesni_skciphers[] = { { .base = { - .cra_name = "__ecb(aes)", - .cra_driver_name = "__ecb-aes-aesni", + .cra_name = "ecb(aes)", + .cra_driver_name = "ecb-aes-aesni", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, @@ -1039,10 +580,9 @@ static struct skcipher_alg aesni_skciphers[] = { .decrypt = ecb_decrypt, }, { .base = { - .cra_name = "__cbc(aes)", - .cra_driver_name = "__cbc-aes-aesni", + .cra_name = "cbc(aes)", + .cra_driver_name = "cbc-aes-aesni", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, @@ -1055,10 +595,9 @@ static struct skcipher_alg aesni_skciphers[] = { .decrypt = cbc_decrypt, }, { .base = { - .cra_name = "__cts(cbc(aes))", - .cra_driver_name = "__cts-cbc-aes-aesni", + .cra_name = "cts(cbc(aes))", + .cra_driver_name = "cts-cbc-aes-aesni", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, @@ -1073,10 +612,9 @@ static struct skcipher_alg aesni_skciphers[] = { #ifdef CONFIG_X86_64 }, { .base = { - .cra_name = "__ctr(aes)", - .cra_driver_name = "__ctr-aes-aesni", + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-aesni", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, @@ -1086,15 +624,14 @@ static struct skcipher_alg aesni_skciphers[] = { .ivsize = AES_BLOCK_SIZE, .chunksize = AES_BLOCK_SIZE, .setkey = aesni_skcipher_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, + .encrypt = ctr_crypt_aesni, + .decrypt = ctr_crypt_aesni, #endif }, { .base = { - .cra_name = "__xts(aes)", - .cra_driver_name = "__xts-aes-aesni", + .cra_name = "xts(aes)", + .cra_driver_name = "xts-aes-aesni", .cra_priority = 401, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = XTS_AES_CTX_SIZE, .cra_module = THIS_MODULE, @@ -1103,124 +640,992 @@ static struct skcipher_alg 
aesni_skciphers[] = { .max_keysize = 2 * AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, .walksize = 2 * AES_BLOCK_SIZE, - .setkey = xts_aesni_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, + .setkey = xts_setkey_aesni, + .encrypt = xts_encrypt_aesni, + .decrypt = xts_decrypt_aesni, } }; -static -struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)]; - #ifdef CONFIG_X86_64 +asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, + u8 iv[AES_BLOCK_SIZE]); + +/* __always_inline to avoid indirect call */ +static __always_inline int +ctr_crypt(struct skcipher_request *req, + void (*ctr64_func)(const struct crypto_aes_ctx *key, + const u8 *src, u8 *dst, int len, + const u64 le_ctr[2])) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm)); + unsigned int nbytes, p1_nbytes, nblocks; + struct skcipher_walk walk; + u64 le_ctr[2]; + u64 ctr64; + int err; + + ctr64 = le_ctr[0] = get_unaligned_be64(&req->iv[8]); + le_ctr[1] = get_unaligned_be64(&req->iv[0]); + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) != 0) { + if (nbytes < walk.total) { + /* Not the end yet, so keep the length block-aligned. */ + nbytes = round_down(nbytes, AES_BLOCK_SIZE); + nblocks = nbytes / AES_BLOCK_SIZE; + } else { + /* It's the end, so include any final partial block. */ + nblocks = DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE); + } + ctr64 += nblocks; + + kernel_fpu_begin(); + if (likely(ctr64 >= nblocks)) { + /* The low 64 bits of the counter won't overflow. */ + (*ctr64_func)(key, walk.src.virt.addr, + walk.dst.virt.addr, nbytes, le_ctr); + } else { + /* + * The low 64 bits of the counter will overflow. The + * assembly doesn't handle this case, so split the + * operation into two at the point where the overflow + * will occur. After the first part, add the carry bit. 
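The split point can be sketched stand-alone (hypothetical helper; ctr64 here is the low counter half after nblocks has already been added, so ctr64 < nblocks signals the wrap):

    #include <assert.h>
    #include <stdint.h>

    #define AES_BLOCK_SIZE 16

    /*
     * Sketch, not kernel code: how many bytes fit before the low 64 counter
     * bits wrap to zero.  The rest is handled by a second call after the
     * carry has been added to the high half.
     */
    static unsigned int bytes_before_wrap(uint64_t ctr64, unsigned int nblocks,
                                          unsigned int nbytes)
    {
            uint64_t whole = ((uint64_t)nblocks - ctr64) * AES_BLOCK_SIZE;

            assert(ctr64 < nblocks);        /* caller detected the wrap */
            /* nbytes may end in a partial block, hence the min(). */
            return whole < nbytes ? (unsigned int)whole : nbytes;
    }

    /*
     * Example: if the counter starts at 2^64 - 1 and nblocks = 4, ctr64
     * wraps to 3, so (4 - 3) * 16 = 16 bytes are processed first and the
     * remainder follows with the high half incremented.
     */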
+ */ + p1_nbytes = min_t(unsigned int, nbytes, + (nblocks - ctr64) * AES_BLOCK_SIZE); + (*ctr64_func)(key, walk.src.virt.addr, + walk.dst.virt.addr, p1_nbytes, le_ctr); + le_ctr[0] = 0; + le_ctr[1]++; + (*ctr64_func)(key, walk.src.virt.addr + p1_nbytes, + walk.dst.virt.addr + p1_nbytes, + nbytes - p1_nbytes, le_ctr); + } + kernel_fpu_end(); + le_ctr[0] = ctr64; + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + put_unaligned_be64(ctr64, &req->iv[8]); + put_unaligned_be64(le_ctr[1], &req->iv[0]); + + return err; +} + +/* __always_inline to avoid indirect call */ +static __always_inline int +xctr_crypt(struct skcipher_request *req, + void (*xctr_func)(const struct crypto_aes_ctx *key, + const u8 *src, u8 *dst, int len, + const u8 iv[AES_BLOCK_SIZE], u64 ctr)) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; + u64 ctr = 1; + int err; + + err = skcipher_walk_virt(&walk, req, false); + while ((nbytes = walk.nbytes) != 0) { + if (nbytes < walk.total) + nbytes = round_down(nbytes, AES_BLOCK_SIZE); + + kernel_fpu_begin(); + (*xctr_func)(key, walk.src.virt.addr, walk.dst.virt.addr, + nbytes, req->iv, ctr); + kernel_fpu_end(); + + ctr += DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + return err; +} + +#define DEFINE_AVX_SKCIPHER_ALGS(suffix, driver_name_suffix, priority) \ + \ +asmlinkage void \ +aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ + u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \ +asmlinkage void \ +aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ + u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \ + \ +static int xts_encrypt_##suffix(struct skcipher_request *req) \ +{ \ + return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_encrypt_##suffix); \ +} \ + \ +static int xts_decrypt_##suffix(struct skcipher_request *req) \ +{ \ + return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_decrypt_##suffix); \ +} \ + \ +asmlinkage void \ +aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key, \ + const u8 *src, u8 *dst, int len, const u64 le_ctr[2]);\ + \ +static int ctr_crypt_##suffix(struct skcipher_request *req) \ +{ \ + return ctr_crypt(req, aes_ctr64_crypt_##suffix); \ +} \ + \ +asmlinkage void \ +aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key, \ + const u8 *src, u8 *dst, int len, \ + const u8 iv[AES_BLOCK_SIZE], u64 ctr); \ + \ +static int xctr_crypt_##suffix(struct skcipher_request *req) \ +{ \ + return xctr_crypt(req, aes_xctr_crypt_##suffix); \ +} \ + \ +static struct skcipher_alg skcipher_algs_##suffix[] = {{ \ + .base.cra_name = "xts(aes)", \ + .base.cra_driver_name = "xts-aes-" driver_name_suffix, \ + .base.cra_priority = priority, \ + .base.cra_blocksize = AES_BLOCK_SIZE, \ + .base.cra_ctxsize = XTS_AES_CTX_SIZE, \ + .base.cra_module = THIS_MODULE, \ + .min_keysize = 2 * AES_MIN_KEY_SIZE, \ + .max_keysize = 2 * AES_MAX_KEY_SIZE, \ + .ivsize = AES_BLOCK_SIZE, \ + .walksize = 2 * AES_BLOCK_SIZE, \ + .setkey = xts_setkey_aesni, \ + .encrypt = xts_encrypt_##suffix, \ + .decrypt = xts_decrypt_##suffix, \ +}, { \ + .base.cra_name = "ctr(aes)", \ + .base.cra_driver_name = "ctr-aes-" driver_name_suffix, \ + .base.cra_priority = priority, \ + .base.cra_blocksize = 1, \ + .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \ + .base.cra_module = THIS_MODULE, \ + .min_keysize = AES_MIN_KEY_SIZE, \ + .max_keysize = AES_MAX_KEY_SIZE, \ + .ivsize = 
AES_BLOCK_SIZE, \ + .chunksize = AES_BLOCK_SIZE, \ + .setkey = aesni_skcipher_setkey, \ + .encrypt = ctr_crypt_##suffix, \ + .decrypt = ctr_crypt_##suffix, \ +}, { \ + .base.cra_name = "xctr(aes)", \ + .base.cra_driver_name = "xctr-aes-" driver_name_suffix, \ + .base.cra_priority = priority, \ + .base.cra_blocksize = 1, \ + .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \ + .base.cra_module = THIS_MODULE, \ + .min_keysize = AES_MIN_KEY_SIZE, \ + .max_keysize = AES_MAX_KEY_SIZE, \ + .ivsize = AES_BLOCK_SIZE, \ + .chunksize = AES_BLOCK_SIZE, \ + .setkey = aesni_skcipher_setkey, \ + .encrypt = xctr_crypt_##suffix, \ + .decrypt = xctr_crypt_##suffix, \ +}} + +DEFINE_AVX_SKCIPHER_ALGS(aesni_avx, "aesni-avx", 500); +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +DEFINE_AVX_SKCIPHER_ALGS(vaes_avx2, "vaes-avx2", 600); +DEFINE_AVX_SKCIPHER_ALGS(vaes_avx512, "vaes-avx512", 800); +#endif + +/* The common part of the x86_64 AES-GCM key struct */ +struct aes_gcm_key { + /* Expanded AES key and the AES key length in bytes */ + struct crypto_aes_ctx aes_key; + + /* RFC4106 nonce (used only by the rfc4106 algorithms) */ + u32 rfc4106_nonce; +}; + +/* Key struct used by the AES-NI implementations of AES-GCM */ +struct aes_gcm_key_aesni { + /* + * Common part of the key. The assembly code requires 16-byte alignment + * for the round keys; we get this by them being located at the start of + * the struct and the whole struct being 16-byte aligned. + */ + struct aes_gcm_key base; + + /* + * Powers of the hash key H^8 through H^1. These are 128-bit values. + * They all have an extra factor of x^-1 and are byte-reversed. 16-byte + * alignment is required by the assembly code. + */ + u64 h_powers[8][2] __aligned(16); + + /* + * h_powers_xored[i] contains the two 64-bit halves of h_powers[i] XOR'd + * together. It's used for Karatsuba multiplication. 16-byte alignment + * is required by the assembly code. + */ + u64 h_powers_xored[8] __aligned(16); + + /* + * H^1 times x^64 (and also the usual extra factor of x^-1). 16-byte + * alignment is required by the assembly code. + */ + u64 h_times_x64[2] __aligned(16); +}; +#define AES_GCM_KEY_AESNI(key) \ + container_of((key), struct aes_gcm_key_aesni, base) +#define AES_GCM_KEY_AESNI_SIZE \ + (sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1))) + +/* Key struct used by the VAES + AVX10 implementations of AES-GCM */ +struct aes_gcm_key_avx10 { + /* + * Common part of the key. The assembly code prefers 16-byte alignment + * for the round keys; we get this by them being located at the start of + * the struct and the whole struct being 64-byte aligned. + */ + struct aes_gcm_key base; + + /* + * Powers of the hash key H^16 through H^1. These are 128-bit values. + * They all have an extra factor of x^-1 and are byte-reversed. This + * array is aligned to a 64-byte boundary to make it naturally aligned + * for 512-bit loads, which can improve performance. (The assembly code + * doesn't *need* the alignment; this is just an optimization.) + */ + u64 h_powers[16][2] __aligned(64); + + /* Three padding blocks required by the assembly code */ + u64 padding[3][2]; +}; +#define AES_GCM_KEY_AVX10(key) \ + container_of((key), struct aes_gcm_key_avx10, base) +#define AES_GCM_KEY_AVX10_SIZE \ + (sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1))) + /* - * XCTR does not have a non-AVX implementation, so it must be enabled - * conditionally. 
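The two *_SIZE macros just above over-allocate the tfm context so that a sufficiently aligned key can always be carved out of it at run time; aes_gcm_key_get() does the rounding. A stand-alone sketch of the idea (the MINALIGN value is assumed here purely for illustration):

    #include <stdint.h>

    #define MINALIGN 8      /* assumed stand-in for CRYPTO_MINALIGN */

    /* Round p up to the next multiple of align (a power of two). */
    static void *ptr_align(void *p, uintptr_t align)
    {
            return (void *)(((uintptr_t)p + align - 1) & ~(align - 1));
    }

    /*
     * Sketch: the context is padded by the worst-case slack between what
     * the allocator guarantees (MINALIGN) and what the key requires.  For
     * 16-byte alignment that is 15 & ~(MINALIGN - 1) = 8 extra bytes; for
     * 64-byte alignment, 63 & ~(MINALIGN - 1) = 56.  Rounding the context
     * pointer up by at most that much always stays inside the allocation.
     */
    static void *key_in_ctx(void *ctx, uintptr_t align)
    {
            return ptr_align(ctx, align);
    }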
+ * These flags are passed to the AES-GCM helper functions to specify the + * specific version of AES-GCM (RFC4106 or not), whether it's encryption or + * decryption, and which assembly functions should be called. Assembly + * functions are selected using flags instead of function pointers to avoid + * indirect calls (which are very expensive on x86) regardless of inlining. */ -static struct skcipher_alg aesni_xctr = { - .base = { - .cra_name = "__xctr(aes)", - .cra_driver_name = "__xctr-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = CRYPTO_AES_CTX_SIZE, - .cra_module = THIS_MODULE, - }, - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .chunksize = AES_BLOCK_SIZE, - .setkey = aesni_skcipher_setkey, - .encrypt = xctr_crypt, - .decrypt = xctr_crypt, -}; +#define FLAG_RFC4106 BIT(0) +#define FLAG_ENC BIT(1) +#define FLAG_AVX BIT(2) +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +# define FLAG_AVX10_256 BIT(3) +# define FLAG_AVX10_512 BIT(4) +#else + /* + * This should cause all calls to the AVX10 assembly functions to be + * optimized out, avoiding the need to ifdef each call individually. + */ +# define FLAG_AVX10_256 0 +# define FLAG_AVX10_512 0 +#endif -static struct simd_skcipher_alg *aesni_simd_xctr; -#endif /* CONFIG_X86_64 */ +static inline struct aes_gcm_key * +aes_gcm_key_get(struct crypto_aead *tfm, int flags) +{ + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + return PTR_ALIGN(crypto_aead_ctx(tfm), 64); + else + return PTR_ALIGN(crypto_aead_ctx(tfm), 16); +} -#ifdef CONFIG_X86_64 -static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, - unsigned int key_len) +asmlinkage void +aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key); +asmlinkage void +aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key); +asmlinkage void +aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key); +asmlinkage void +aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key); + +static void aes_gcm_precompute(struct aes_gcm_key *key, int flags) { - struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead); + /* + * To make things a bit easier on the assembly side, the AVX10 + * implementations use the same key format. Therefore, a single + * function using 256-bit vectors would suffice here. However, it's + * straightforward to provide a 512-bit one because of how the assembly + * code is structured, and it works nicely because the total size of the + * key powers is a multiple of 512 bits. So we take advantage of that. + * + * A similar situation applies to the AES-NI implementations. 
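The payoff of this scheme is easiest to see in a small stand-alone sketch (hypothetical names): each outer entry point passes a compile-time-constant flags value into an always-inlined helper, so the compiler folds the tests away and emits direct calls only.

    #include <stdio.h>

    #define FLG_FAST 0x1    /* illustrative flag, not from the patch */

    static void impl_generic(void) { puts("generic"); }
    static void impl_fast(void)    { puts("fast"); }

    /* Always inlined, so "flags" is a constant in every expansion. */
    static inline __attribute__((always_inline)) void do_op(int flags)
    {
            if (flags & FLG_FAST)
                    impl_fast();    /* direct call after constant folding */
            else
                    impl_generic(); /* dead code in the fast wrapper */
    }

    static void op_generic(void) { do_op(0); }
    static void op_fast(void)    { do_op(FLG_FAST); }

    int main(void)
    {
            op_generic();   /* prints "generic" */
            op_fast();      /* prints "fast" */
            return 0;
    }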
+ */ + if (flags & FLAG_AVX10_512) + aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key)); + else if (flags & FLAG_AVX10_256) + aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key)); + else if (flags & FLAG_AVX) + aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key)); + else + aes_gcm_precompute_aesni(AES_GCM_KEY_AESNI(key)); +} - return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?: - rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); +asmlinkage void +aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); +asmlinkage void +aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); +asmlinkage void +aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); + +static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16], + const u8 *aad, int aadlen, int flags) +{ + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc, + aad, aadlen); + else if (flags & FLAG_AVX) + aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc, + aad, aadlen); + else + aes_gcm_aad_update_aesni(AES_GCM_KEY_AESNI(key), ghash_acc, + aad, aadlen); } -static int generic_gcmaes_encrypt(struct aead_request *req) +asmlinkage void +aes_gcm_enc_update_aesni(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); + +asmlinkage void +aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); + +/* __always_inline to optimize out the branches based on @flags */ +static __always_inline void +aes_gcm_update(const struct aes_gcm_key *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen, int flags) { - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); - __be32 counter = cpu_to_be32(1); + if (flags & FLAG_ENC) { + if (flags & FLAG_AVX10_512) + aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + src, dst, datalen); + else if (flags & FLAG_AVX10_256) + aes_gcm_enc_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + src, dst, datalen); + else if (flags 
& FLAG_AVX) + aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + src, dst, datalen); + else + aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr, + ghash_acc, src, dst, datalen); + } else { + if (flags & FLAG_AVX10_512) + aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + src, dst, datalen); + else if (flags & FLAG_AVX10_256) + aes_gcm_dec_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + src, dst, datalen); + else if (flags & FLAG_AVX) + aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + src, dst, datalen); + else + aes_gcm_dec_update_aesni(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + src, dst, datalen); + } +} - memcpy(iv, req->iv, 12); - *((__be32 *)(iv+12)) = counter; +asmlinkage void +aes_gcm_enc_final_aesni(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); +asmlinkage void +aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); +asmlinkage void +aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); + +/* __always_inline to optimize out the branches based on @flags */ +static __always_inline void +aes_gcm_enc_final(const struct aes_gcm_key *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, int flags) +{ + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen); + else if (flags & FLAG_AVX) + aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen); + else + aes_gcm_enc_final_aesni(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen); +} - return gcmaes_encrypt(req, req->assoclen, ctx->hash_subkey, iv, - aes_ctx); +asmlinkage bool __must_check +aes_gcm_dec_final_aesni(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], const u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); +asmlinkage bool __must_check +aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], const u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); +asmlinkage bool __must_check +aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], const u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); + +/* __always_inline to optimize out the branches based on @flags */ +static __always_inline bool __must_check +aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4], + u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, + u8 tag[16], int taglen, int flags) +{ + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen, + tag, taglen); + else if (flags & FLAG_AVX) + return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen, + tag, taglen); + else + return aes_gcm_dec_final_aesni(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen, + tag, taglen); } -static int generic_gcmaes_decrypt(struct aead_request *req) +/* + * This is the Integrity Check Value (aka the authentication tag) length and can + * be 8, 
12 or 16 bytes long. + */ +static int common_rfc4106_set_authsize(struct crypto_aead *aead, + unsigned int authsize) +{ + switch (authsize) { + case 8: + case 12: + case 16: + break; + default: + return -EINVAL; + } + + return 0; +} + +static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + switch (authsize) { + case 4: + case 8: + case 12: + case 13: + case 14: + case 15: + case 16: + break; + default: + return -EINVAL; + } + + return 0; +} + +/* + * This is the setkey function for the x86_64 implementations of AES-GCM. It + * saves the RFC4106 nonce if applicable, expands the AES key, and precomputes + * powers of the hash key. + * + * To comply with the crypto_aead API, this has to be usable in no-SIMD context. + * For that reason, this function includes a portable C implementation of the + * needed logic. However, the portable C implementation is very slow, taking + * about the same time as encrypting 37 KB of data. To be ready for users that + * may set a key even somewhat frequently, we therefore also include a SIMD + * assembly implementation, expanding the AES key using AES-NI and precomputing + * the hash key powers using PCLMULQDQ or VPCLMULQDQ. + */ +static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, + unsigned int keylen, int flags) +{ + struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags); + int err; + + if (flags & FLAG_RFC4106) { + if (keylen < 4) + return -EINVAL; + keylen -= 4; + key->rfc4106_nonce = get_unaligned_be32(raw_key + keylen); + } + + /* The assembly code assumes the following offsets. */ + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_enc) != 0); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_length) != 480); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768); + + if (likely(crypto_simd_usable())) { + err = aes_check_keylen(keylen); + if (err) + return err; + kernel_fpu_begin(); + aesni_set_key(&key->aes_key, raw_key, keylen); + aes_gcm_precompute(key, flags); + kernel_fpu_end(); + } else { + static const u8 x_to_the_minus1[16] __aligned(__alignof__(be128)) = { + [0] = 0xc2, [15] = 1 + }; + static const u8 x_to_the_63[16] __aligned(__alignof__(be128)) = { + [7] = 1, + }; + be128 h1 = {}; + be128 h; + int i; + + err = aes_expandkey(&key->aes_key, raw_key, keylen); + if (err) + return err; + + /* Encrypt the all-zeroes block to get the hash key H^1 */ + aes_encrypt(&key->aes_key, (u8 *)&h1, (u8 *)&h1); + + /* Compute H^1 * x^-1 */ + h = h1; + gf128mul_lle(&h, (const be128 *)x_to_the_minus1); + + /* Compute the needed key powers */ + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) { + struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key); + + for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { + k->h_powers[i][0] = be64_to_cpu(h.b); + k->h_powers[i][1] = be64_to_cpu(h.a); + gf128mul_lle(&h, &h1); + } + memset(k->padding, 0, sizeof(k->padding)); + } else { + struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key); + + for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { + k->h_powers[i][0] = be64_to_cpu(h.b); + 
k->h_powers[i][1] = be64_to_cpu(h.a); + k->h_powers_xored[i] = k->h_powers[i][0] ^ + k->h_powers[i][1]; + gf128mul_lle(&h, &h1); + } + gf128mul_lle(&h1, (const be128 *)x_to_the_63); + k->h_times_x64[0] = be64_to_cpu(h1.b); + k->h_times_x64[1] = be64_to_cpu(h1.a); + } + } + return 0; +} + +/* + * Initialize @ghash_acc, then pass all @assoclen bytes of associated data + * (a.k.a. additional authenticated data) from @sg_src through the GHASH update + * assembly function. kernel_fpu_begin() must have already been called. + */ +static void gcm_process_assoc(const struct aes_gcm_key *key, u8 ghash_acc[16], + struct scatterlist *sg_src, unsigned int assoclen, + int flags) +{ + struct scatter_walk walk; + /* + * The assembly function requires that the length of any non-last + * segment of associated data be a multiple of 16 bytes, so this + * function does the buffering needed to achieve that. + */ + unsigned int pos = 0; + u8 buf[16]; + + memset(ghash_acc, 0, 16); + scatterwalk_start(&walk, sg_src); + + while (assoclen) { + unsigned int orig_len_this_step = scatterwalk_next( + &walk, assoclen); + unsigned int len_this_step = orig_len_this_step; + unsigned int len; + const u8 *src = walk.addr; + + if (unlikely(pos)) { + len = min(len_this_step, 16 - pos); + memcpy(&buf[pos], src, len); + pos += len; + src += len; + len_this_step -= len; + if (pos < 16) + goto next; + aes_gcm_aad_update(key, ghash_acc, buf, 16, flags); + pos = 0; + } + len = len_this_step; + if (unlikely(assoclen)) /* Not the last segment yet? */ + len = round_down(len, 16); + aes_gcm_aad_update(key, ghash_acc, src, len, flags); + src += len; + len_this_step -= len; + if (unlikely(len_this_step)) { + memcpy(buf, src, len_this_step); + pos = len_this_step; + } +next: + scatterwalk_done_src(&walk, orig_len_this_step); + if (need_resched()) { + kernel_fpu_end(); + kernel_fpu_begin(); + } + assoclen -= orig_len_this_step; + } + if (unlikely(pos)) + aes_gcm_aad_update(key, ghash_acc, buf, pos, flags); +} + + +/* __always_inline to optimize out the branches based on @flags */ +static __always_inline int +gcm_crypt(struct aead_request *req, int flags) { - __be32 counter = cpu_to_be32(1); struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); + const struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags); + unsigned int assoclen = req->assoclen; + struct skcipher_walk walk; + unsigned int nbytes; + u8 ghash_acc[16]; /* GHASH accumulator */ + u32 le_ctr[4]; /* Counter in little-endian format */ + int taglen; + int err; - memcpy(iv, req->iv, 12); - *((__be32 *)(iv+12)) = counter; + /* Initialize the counter and determine the associated data length. */ + le_ctr[0] = 2; + if (flags & FLAG_RFC4106) { + if (unlikely(assoclen != 16 && assoclen != 20)) + return -EINVAL; + assoclen -= 8; + le_ctr[1] = get_unaligned_be32(req->iv + 4); + le_ctr[2] = get_unaligned_be32(req->iv + 0); + le_ctr[3] = key->rfc4106_nonce; /* already byte-swapped */ + } else { + le_ctr[1] = get_unaligned_be32(req->iv + 8); + le_ctr[2] = get_unaligned_be32(req->iv + 4); + le_ctr[3] = get_unaligned_be32(req->iv + 0); + } - return gcmaes_decrypt(req, req->assoclen, ctx->hash_subkey, iv, - aes_ctx); + /* Begin walking through the plaintext or ciphertext. 
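The counter setup just above can be mirrored stand-alone (hypothetical helper; only the non-RFC4106 layout is shown). GCM reserves counter value 1 for masking the authentication tag, so bulk processing starts at 2, and the words are stored least-significant first for the assembly:

    #include <stdint.h>

    static uint32_t get_be32(const uint8_t *p)
    {
            return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
                   ((uint32_t)p[2] << 8) | (uint32_t)p[3];
    }

    /*
     * Sketch of the non-RFC4106 case: the 12-byte IV fills the upper three
     * words of the counter block and the block counter is the low word.
     * Counter value 1 is reserved for the tag mask, hence the initial 2.
     */
    static void gcm_init_le_ctr(uint32_t le_ctr[4], const uint8_t iv[12])
    {
            le_ctr[0] = 2;                  /* first data counter value */
            le_ctr[1] = get_be32(iv + 8);
            le_ctr[2] = get_be32(iv + 4);
            le_ctr[3] = get_be32(iv + 0);   /* most-significant IV word */
    }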
*/ + if (flags & FLAG_ENC) + err = skcipher_walk_aead_encrypt(&walk, req, false); + else + err = skcipher_walk_aead_decrypt(&walk, req, false); + if (err) + return err; + + /* + * Since the AES-GCM assembly code requires that at least three assembly + * functions be called to process any message (this is needed to support + * incremental updates cleanly), to reduce overhead we try to do all + * three calls in the same kernel FPU section if possible. We close the + * section and start a new one if there are multiple data segments or if + * rescheduling is needed while processing the associated data. + */ + kernel_fpu_begin(); + + /* Pass the associated data through GHASH. */ + gcm_process_assoc(key, ghash_acc, req->src, assoclen, flags); + + /* En/decrypt the data and pass the ciphertext through GHASH. */ + while (unlikely((nbytes = walk.nbytes) < walk.total)) { + /* + * Non-last segment. In this case, the assembly function + * requires that the length be a multiple of 16 (AES_BLOCK_SIZE) + * bytes. The needed buffering of up to 16 bytes is handled by + * the skcipher_walk. Here we just need to round down to a + * multiple of 16. + */ + nbytes = round_down(nbytes, AES_BLOCK_SIZE); + aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr, + walk.dst.virt.addr, nbytes, flags); + le_ctr[0] += nbytes / AES_BLOCK_SIZE; + kernel_fpu_end(); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + if (err) + return err; + kernel_fpu_begin(); + } + /* Last segment: process all remaining data. */ + aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr, + walk.dst.virt.addr, nbytes, flags); + /* + * The low word of the counter isn't used by the finalize, so there's no + * need to increment it here. + */ + + /* Finalize */ + taglen = crypto_aead_authsize(tfm); + if (flags & FLAG_ENC) { + /* Finish computing the auth tag. */ + aes_gcm_enc_final(key, le_ctr, ghash_acc, assoclen, + req->cryptlen, flags); + + /* Store the computed auth tag in the dst scatterlist. */ + scatterwalk_map_and_copy(ghash_acc, req->dst, req->assoclen + + req->cryptlen, taglen, 1); + } else { + unsigned int datalen = req->cryptlen - taglen; + u8 tag[16]; + + /* Get the transmitted auth tag from the src scatterlist. */ + scatterwalk_map_and_copy(tag, req->src, req->assoclen + datalen, + taglen, 0); + /* + * Finish computing the auth tag and compare it to the + * transmitted one. The assembly function does the actual tag + * comparison. Here, just check the boolean result. 
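gcm_process_assoc(), called at the top of the FPU section above, has to handle associated data that is split across scatterlist segments even though the assembly accepts a non-final chunk only in 16-byte multiples. A stand-alone sketch of that buffering (hypothetical names; ghash_update16() stands in for aes_gcm_aad_update()):

    #include <string.h>

    /* Stand-in for the assembly GHASH update; illustration only. */
    static void ghash_update16(const unsigned char *data, unsigned int len)
    {
            (void)data; (void)len;
    }

    struct aad_buf {
            unsigned char buf[16];
            unsigned int pos;       /* buffered bytes, always < 16 */
    };

    /*
     * Sketch: top up a previously buffered partial chunk, pass whole
     * 16-byte multiples straight through, and buffer any remainder.  Only
     * the final segment may send a short chunk downstream.
     */
    static void aad_feed(struct aad_buf *b, const unsigned char *src,
                         unsigned int len, int is_last)
    {
            unsigned int n;

            if (b->pos) {
                    n = len < 16 - b->pos ? len : 16 - b->pos;
                    memcpy(&b->buf[b->pos], src, n);
                    b->pos += n;
                    src += n;
                    len -= n;
                    if (b->pos < 16 && !is_last)
                            return;         /* wait for more data */
                    ghash_update16(b->buf, b->pos);
                    b->pos = 0;
            }

            n = is_last ? len : len & ~15u;
            if (n)
                    ghash_update16(src, n);
            memcpy(b->buf, src + n, len - n);
            b->pos = len - n;
    }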
+ */ + if (!aes_gcm_dec_final(key, le_ctr, ghash_acc, assoclen, + datalen, tag, taglen, flags)) + err = -EBADMSG; + } + kernel_fpu_end(); + if (nbytes) + skcipher_walk_done(&walk, 0); + return err; } -static struct aead_alg aesni_aeads[] = { { - .setkey = common_rfc4106_set_key, - .setauthsize = common_rfc4106_set_authsize, - .encrypt = helper_rfc4106_encrypt, - .decrypt = helper_rfc4106_decrypt, - .ivsize = GCM_RFC4106_IV_SIZE, - .maxauthsize = 16, - .base = { - .cra_name = "__rfc4106(gcm(aes))", - .cra_driver_name = "__rfc4106-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx), - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - }, -}, { - .setkey = generic_gcmaes_set_key, - .setauthsize = generic_gcmaes_set_authsize, - .encrypt = generic_gcmaes_encrypt, - .decrypt = generic_gcmaes_decrypt, - .ivsize = GCM_AES_IV_SIZE, - .maxauthsize = 16, - .base = { - .cra_name = "__gcm(aes)", - .cra_driver_name = "__generic-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct generic_gcmaes_ctx), - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - }, -} }; -#else -static struct aead_alg aesni_aeads[0]; +#define DEFINE_GCM_ALGS(suffix, flags, generic_driver_name, rfc_driver_name, \ + ctxsize, priority) \ + \ +static int gcm_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ + unsigned int keylen) \ +{ \ + return gcm_setkey(tfm, raw_key, keylen, (flags)); \ +} \ + \ +static int gcm_encrypt_##suffix(struct aead_request *req) \ +{ \ + return gcm_crypt(req, (flags) | FLAG_ENC); \ +} \ + \ +static int gcm_decrypt_##suffix(struct aead_request *req) \ +{ \ + return gcm_crypt(req, (flags)); \ +} \ + \ +static int rfc4106_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ + unsigned int keylen) \ +{ \ + return gcm_setkey(tfm, raw_key, keylen, (flags) | FLAG_RFC4106); \ +} \ + \ +static int rfc4106_encrypt_##suffix(struct aead_request *req) \ +{ \ + return gcm_crypt(req, (flags) | FLAG_RFC4106 | FLAG_ENC); \ +} \ + \ +static int rfc4106_decrypt_##suffix(struct aead_request *req) \ +{ \ + return gcm_crypt(req, (flags) | FLAG_RFC4106); \ +} \ + \ +static struct aead_alg aes_gcm_algs_##suffix[] = { { \ + .setkey = gcm_setkey_##suffix, \ + .setauthsize = generic_gcmaes_set_authsize, \ + .encrypt = gcm_encrypt_##suffix, \ + .decrypt = gcm_decrypt_##suffix, \ + .ivsize = GCM_AES_IV_SIZE, \ + .chunksize = AES_BLOCK_SIZE, \ + .maxauthsize = 16, \ + .base = { \ + .cra_name = "gcm(aes)", \ + .cra_driver_name = generic_driver_name, \ + .cra_priority = (priority), \ + .cra_blocksize = 1, \ + .cra_ctxsize = (ctxsize), \ + .cra_module = THIS_MODULE, \ + }, \ +}, { \ + .setkey = rfc4106_setkey_##suffix, \ + .setauthsize = common_rfc4106_set_authsize, \ + .encrypt = rfc4106_encrypt_##suffix, \ + .decrypt = rfc4106_decrypt_##suffix, \ + .ivsize = GCM_RFC4106_IV_SIZE, \ + .chunksize = AES_BLOCK_SIZE, \ + .maxauthsize = 16, \ + .base = { \ + .cra_name = "rfc4106(gcm(aes))", \ + .cra_driver_name = rfc_driver_name, \ + .cra_priority = (priority), \ + .cra_blocksize = 1, \ + .cra_ctxsize = (ctxsize), \ + .cra_module = THIS_MODULE, \ + }, \ +} } + +/* aes_gcm_algs_aesni */ +DEFINE_GCM_ALGS(aesni, /* no flags */ 0, + "generic-gcm-aesni", "rfc4106-gcm-aesni", + AES_GCM_KEY_AESNI_SIZE, 400); + +/* aes_gcm_algs_aesni_avx */ +DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX, + "generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx", + AES_GCM_KEY_AESNI_SIZE, 500); + +#if 
defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +/* aes_gcm_algs_vaes_avx10_256 */ +DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256, + "generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256", + AES_GCM_KEY_AVX10_SIZE, 700); + +/* aes_gcm_algs_vaes_avx10_512 */ +DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512, + "generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512", + AES_GCM_KEY_AVX10_SIZE, 800); +#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ + +static int __init register_avx_algs(void) +{ + int err; + + if (!boot_cpu_has(X86_FEATURE_AVX)) + return 0; + err = crypto_register_skciphers(skcipher_algs_aesni_avx, + ARRAY_SIZE(skcipher_algs_aesni_avx)); + if (err) + return err; + err = crypto_register_aeads(aes_gcm_algs_aesni_avx, + ARRAY_SIZE(aes_gcm_algs_aesni_avx)); + if (err) + return err; + /* + * Note: not all the algorithms registered below actually require + * VPCLMULQDQ. But in practice every CPU with VAES also has VPCLMULQDQ. + * Similarly, the assembler support was added at about the same time. + * For simplicity, just always check for VAES and VPCLMULQDQ together. + */ +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) + if (!boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_VAES) || + !boot_cpu_has(X86_FEATURE_VPCLMULQDQ) || + !boot_cpu_has(X86_FEATURE_PCLMULQDQ) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + return 0; + err = crypto_register_skciphers(skcipher_algs_vaes_avx2, + ARRAY_SIZE(skcipher_algs_vaes_avx2)); + if (err) + return err; + + if (!boot_cpu_has(X86_FEATURE_AVX512BW) || + !boot_cpu_has(X86_FEATURE_AVX512VL) || + !boot_cpu_has(X86_FEATURE_BMI2) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | + XFEATURE_MASK_AVX512, NULL)) + return 0; + + err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_256, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256)); + if (err) + return err; + + if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) { + int i; + + for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++) + skcipher_algs_vaes_avx512[i].base.cra_priority = 1; + for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++) + aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1; + } + + err = crypto_register_skciphers(skcipher_algs_vaes_avx512, + ARRAY_SIZE(skcipher_algs_vaes_avx512)); + if (err) + return err; + err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_512, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512)); + if (err) + return err; +#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ + return 0; +} + +#define unregister_skciphers(A) \ + if (refcount_read(&(A)[0].base.cra_refcnt) != 0) \ + crypto_unregister_skciphers((A), ARRAY_SIZE(A)) +#define unregister_aeads(A) \ + if (refcount_read(&(A)[0].base.cra_refcnt) != 0) \ + crypto_unregister_aeads((A), ARRAY_SIZE(A)) + +static void unregister_avx_algs(void) +{ + unregister_skciphers(skcipher_algs_aesni_avx); + unregister_aeads(aes_gcm_algs_aesni_avx); +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) + unregister_skciphers(skcipher_algs_vaes_avx2); + unregister_skciphers(skcipher_algs_vaes_avx512); + unregister_aeads(aes_gcm_algs_vaes_avx10_256); + unregister_aeads(aes_gcm_algs_vaes_avx10_512); #endif +} +#else /* CONFIG_X86_64 */ +static struct aead_alg aes_gcm_algs_aesni[0]; + +static int __init register_avx_algs(void) +{ + return 0; +} -static struct simd_aead_alg *aesni_simd_aeads[ARRAY_SIZE(aesni_aeads)]; +static void unregister_avx_algs(void) +{ +} +#endif /* !CONFIG_X86_64 */ static const struct x86_cpu_id aesni_cpu_id[] = { 
X86_MATCH_FEATURE(X86_FEATURE_AES, NULL), @@ -1234,59 +1639,34 @@ static int __init aesni_init(void) if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; -#ifdef CONFIG_X86_64 - if (boot_cpu_has(X86_FEATURE_AVX2)) { - pr_info("AVX2 version of gcm_enc/dec engaged.\n"); - static_branch_enable(&gcm_use_avx); - static_branch_enable(&gcm_use_avx2); - } else - if (boot_cpu_has(X86_FEATURE_AVX)) { - pr_info("AVX version of gcm_enc/dec engaged.\n"); - static_branch_enable(&gcm_use_avx); - } else { - pr_info("SSE version of gcm_enc/dec engaged.\n"); - } - if (boot_cpu_has(X86_FEATURE_AVX)) { - /* optimize performance of ctr mode encryption transform */ - static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm); - pr_info("AES CTR mode by8 optimization enabled\n"); - } -#endif /* CONFIG_X86_64 */ err = crypto_register_alg(&aesni_cipher_alg); if (err) return err; - err = simd_register_skciphers_compat(aesni_skciphers, - ARRAY_SIZE(aesni_skciphers), - aesni_simd_skciphers); + err = crypto_register_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); if (err) goto unregister_cipher; - err = simd_register_aeads_compat(aesni_aeads, ARRAY_SIZE(aesni_aeads), - aesni_simd_aeads); + err = crypto_register_aeads(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni)); if (err) goto unregister_skciphers; -#ifdef CONFIG_X86_64 - if (boot_cpu_has(X86_FEATURE_AVX)) - err = simd_register_skciphers_compat(&aesni_xctr, 1, - &aesni_simd_xctr); + err = register_avx_algs(); if (err) - goto unregister_aeads; -#endif /* CONFIG_X86_64 */ + goto unregister_avx; return 0; -#ifdef CONFIG_X86_64 -unregister_aeads: - simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), - aesni_simd_aeads); -#endif /* CONFIG_X86_64 */ - +unregister_avx: + unregister_avx_algs(); + crypto_unregister_aeads(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni)); unregister_skciphers: - simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), - aesni_simd_skciphers); + crypto_unregister_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); unregister_cipher: crypto_unregister_alg(&aesni_cipher_alg); return err; @@ -1294,20 +1674,17 @@ unregister_cipher: static void __exit aesni_exit(void) { - simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), - aesni_simd_aeads); - simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), - aesni_simd_skciphers); + crypto_unregister_aeads(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni)); + crypto_unregister_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); crypto_unregister_alg(&aesni_cipher_alg); -#ifdef CONFIG_X86_64 - if (boot_cpu_has(X86_FEATURE_AVX)) - simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); -#endif /* CONFIG_X86_64 */ + unregister_avx_algs(); } -late_initcall(aesni_init); +module_init(aesni_init); module_exit(aesni_exit); -MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized"); +MODULE_DESCRIPTION("AES cipher and modes, optimized with AES-NI or VAES instructions"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("aes"); diff --git a/arch/x86/crypto/aria_aesni_avx2_glue.c b/arch/x86/crypto/aria_aesni_avx2_glue.c index 87a11804fc77..b4bddcd58457 100644 --- a/arch/x86/crypto/aria_aesni_avx2_glue.c +++ b/arch/x86/crypto/aria_aesni_avx2_glue.c @@ -6,7 +6,6 @@ */ #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <crypto/aria.h> #include <linux/crypto.h> #include <linux/err.h> @@ -165,10 +164,9 @@ static int aria_avx2_init_tfm(struct crypto_skcipher *tfm) static struct 
skcipher_alg aria_algs[] = { { - .base.cra_name = "__ecb(aria)", - .base.cra_driver_name = "__ecb-aria-avx2", + .base.cra_name = "ecb(aria)", + .base.cra_driver_name = "ecb-aria-avx2", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = ARIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -178,11 +176,10 @@ static struct skcipher_alg aria_algs[] = { .encrypt = aria_avx2_ecb_encrypt, .decrypt = aria_avx2_ecb_decrypt, }, { - .base.cra_name = "__ctr(aria)", - .base.cra_driver_name = "__ctr-aria-avx2", + .base.cra_name = "ctr(aria)", + .base.cra_driver_name = "ctr-aria-avx2", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL | - CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE, + .base.cra_flags = CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -197,8 +194,6 @@ static struct skcipher_alg aria_algs[] = { } }; -static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)]; - static int __init aria_avx2_init(void) { const char *feature_name; @@ -233,15 +228,12 @@ static int __init aria_avx2_init(void) aria_ops.aria_ctr_crypt_32way = aria_aesni_avx2_ctr_crypt_32way; } - return simd_register_skciphers_compat(aria_algs, - ARRAY_SIZE(aria_algs), - aria_simd_algs); + return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } static void __exit aria_avx2_exit(void) { - simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs), - aria_simd_algs); + crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } module_init(aria_avx2_init); diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c index 4e1516b76669..ab9b38d05332 100644 --- a/arch/x86/crypto/aria_aesni_avx_glue.c +++ b/arch/x86/crypto/aria_aesni_avx_glue.c @@ -6,7 +6,6 @@ */ #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <crypto/aria.h> #include <linux/crypto.h> #include <linux/err.h> @@ -152,10 +151,9 @@ static int aria_avx_init_tfm(struct crypto_skcipher *tfm) static struct skcipher_alg aria_algs[] = { { - .base.cra_name = "__ecb(aria)", - .base.cra_driver_name = "__ecb-aria-avx", + .base.cra_name = "ecb(aria)", + .base.cra_driver_name = "ecb-aria-avx", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = ARIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -165,10 +163,9 @@ static struct skcipher_alg aria_algs[] = { .encrypt = aria_avx_ecb_encrypt, .decrypt = aria_avx_ecb_decrypt, }, { - .base.cra_name = "__ctr(aria)", - .base.cra_driver_name = "__ctr-aria-avx", + .base.cra_name = "ctr(aria)", + .base.cra_driver_name = "ctr-aria-avx", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -184,8 +181,6 @@ static struct skcipher_alg aria_algs[] = { } }; -static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)]; - static int __init aria_avx_init(void) { const char *feature_name; @@ -213,15 +208,12 @@ static int __init aria_avx_init(void) aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_ctr_crypt_16way; } - return simd_register_skciphers_compat(aria_algs, - ARRAY_SIZE(aria_algs), - aria_simd_algs); + return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } static void __exit aria_avx_exit(void) { - simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs), - 
aria_simd_algs); + crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } module_init(aria_avx_init); diff --git a/arch/x86/crypto/aria_gfni_avx512_glue.c b/arch/x86/crypto/aria_gfni_avx512_glue.c index f4a2208d2638..363cbf4399cc 100644 --- a/arch/x86/crypto/aria_gfni_avx512_glue.c +++ b/arch/x86/crypto/aria_gfni_avx512_glue.c @@ -6,7 +6,6 @@ */ #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <crypto/aria.h> #include <linux/crypto.h> #include <linux/err.h> @@ -165,10 +164,9 @@ static int aria_avx512_init_tfm(struct crypto_skcipher *tfm) static struct skcipher_alg aria_algs[] = { { - .base.cra_name = "__ecb(aria)", - .base.cra_driver_name = "__ecb-aria-avx512", + .base.cra_name = "ecb(aria)", + .base.cra_driver_name = "ecb-aria-avx512", .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = ARIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -178,11 +176,10 @@ static struct skcipher_alg aria_algs[] = { .encrypt = aria_avx512_ecb_encrypt, .decrypt = aria_avx512_ecb_decrypt, }, { - .base.cra_name = "__ctr(aria)", - .base.cra_driver_name = "__ctr-aria-avx512", + .base.cra_name = "ctr(aria)", + .base.cra_driver_name = "ctr-aria-avx512", .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL | - CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE, + .base.cra_flags = CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -197,8 +194,6 @@ static struct skcipher_alg aria_algs[] = { } }; -static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)]; - static int __init aria_avx512_init(void) { const char *feature_name; @@ -229,15 +224,12 @@ static int __init aria_avx512_init(void) aria_ops.aria_decrypt_64way = aria_gfni_avx512_decrypt_64way; aria_ops.aria_ctr_crypt_64way = aria_gfni_avx512_ctr_crypt_64way; - return simd_register_skciphers_compat(aria_algs, - ARRAY_SIZE(aria_algs), - aria_simd_algs); + return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } static void __exit aria_avx512_exit(void) { - simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs), - aria_simd_algs); + crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } module_init(aria_avx512_init); diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S deleted file mode 100644 index b50b35ff1fdb..000000000000 --- a/arch/x86/crypto/blake2s-core.S +++ /dev/null @@ -1,256 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. 
- */ - -#include <linux/linkage.h> - -.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32 -.align 32 -IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 - .octa 0x5BE0CD191F83D9AB9B05688C510E527F -.section .rodata.cst16.ROT16, "aM", @progbits, 16 -.align 16 -ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 -.section .rodata.cst16.ROR328, "aM", @progbits, 16 -.align 16 -ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 -.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160 -.align 64 -SIGMA: -.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 -.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7 -.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1 -.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0 -.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8 -.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14 -.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2 -.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 -.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 -.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 -#ifdef CONFIG_AS_AVX512 -.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640 -.align 64 -SIGMA2: -.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 -.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 -.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 -.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 -.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 -.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 -.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 -.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 -.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 -.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 -#endif /* CONFIG_AS_AVX512 */ - -.text -SYM_FUNC_START(blake2s_compress_ssse3) - testq %rdx,%rdx - je .Lendofloop - movdqu (%rdi),%xmm0 - movdqu 0x10(%rdi),%xmm1 - movdqa ROT16(%rip),%xmm12 - movdqa ROR328(%rip),%xmm13 - movdqu 0x20(%rdi),%xmm14 - movq %rcx,%xmm15 - leaq SIGMA+0xa0(%rip),%r8 - jmp .Lbeginofloop - .align 32 -.Lbeginofloop: - movdqa %xmm0,%xmm10 - movdqa %xmm1,%xmm11 - paddq %xmm15,%xmm14 - movdqa IV(%rip),%xmm2 - movdqa %xmm14,%xmm3 - pxor IV+0x10(%rip),%xmm3 - leaq SIGMA(%rip),%rcx -.Lroundloop: - movzbl (%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0x1(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0x2(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x3(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - punpckldq %xmm5,%xmm4 - punpckldq %xmm7,%xmm6 - punpcklqdq %xmm6,%xmm4 - paddd %xmm4,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm12,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0xc,%xmm1 - pslld $0x14,%xmm8 - por %xmm8,%xmm1 - movzbl 0x4(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0x5(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x6(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0x7(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - punpckldq %xmm6,%xmm5 - punpckldq %xmm4,%xmm7 - punpcklqdq %xmm7,%xmm5 - paddd %xmm5,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm13,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0x7,%xmm1 - pslld $0x19,%xmm8 - por %xmm8,%xmm1 - pshufd $0x93,%xmm0,%xmm0 - pshufd $0x4e,%xmm3,%xmm3 - pshufd $0x39,%xmm2,%xmm2 - movzbl 0x8(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x9(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0xa(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0xb(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - 
punpckldq %xmm7,%xmm6 - punpckldq %xmm5,%xmm4 - punpcklqdq %xmm4,%xmm6 - paddd %xmm6,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm12,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0xc,%xmm1 - pslld $0x14,%xmm8 - por %xmm8,%xmm1 - movzbl 0xc(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0xd(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0xe(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0xf(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - punpckldq %xmm4,%xmm7 - punpckldq %xmm6,%xmm5 - punpcklqdq %xmm5,%xmm7 - paddd %xmm7,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm13,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0x7,%xmm1 - pslld $0x19,%xmm8 - por %xmm8,%xmm1 - pshufd $0x39,%xmm0,%xmm0 - pshufd $0x4e,%xmm3,%xmm3 - pshufd $0x93,%xmm2,%xmm2 - addq $0x10,%rcx - cmpq %r8,%rcx - jnz .Lroundloop - pxor %xmm2,%xmm0 - pxor %xmm3,%xmm1 - pxor %xmm10,%xmm0 - pxor %xmm11,%xmm1 - addq $0x40,%rsi - decq %rdx - jnz .Lbeginofloop - movdqu %xmm0,(%rdi) - movdqu %xmm1,0x10(%rdi) - movdqu %xmm14,0x20(%rdi) -.Lendofloop: - RET -SYM_FUNC_END(blake2s_compress_ssse3) - -#ifdef CONFIG_AS_AVX512 -SYM_FUNC_START(blake2s_compress_avx512) - vmovdqu (%rdi),%xmm0 - vmovdqu 0x10(%rdi),%xmm1 - vmovdqu 0x20(%rdi),%xmm4 - vmovq %rcx,%xmm5 - vmovdqa IV(%rip),%xmm14 - vmovdqa IV+16(%rip),%xmm15 - jmp .Lblake2s_compress_avx512_mainloop -.align 32 -.Lblake2s_compress_avx512_mainloop: - vmovdqa %xmm0,%xmm10 - vmovdqa %xmm1,%xmm11 - vpaddq %xmm5,%xmm4,%xmm4 - vmovdqa %xmm14,%xmm2 - vpxor %xmm15,%xmm4,%xmm3 - vmovdqu (%rsi),%ymm6 - vmovdqu 0x20(%rsi),%ymm7 - addq $0x40,%rsi - leaq SIGMA2(%rip),%rax - movb $0xa,%cl -.Lblake2s_compress_avx512_roundloop: - addq $0x40,%rax - vmovdqa -0x40(%rax),%ymm8 - vmovdqa -0x20(%rax),%ymm9 - vpermi2d %ymm7,%ymm6,%ymm8 - vpermi2d %ymm7,%ymm6,%ymm9 - vmovdqa %ymm8,%ymm6 - vmovdqa %ymm9,%ymm7 - vpaddd %xmm8,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x10,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0xc,%xmm1,%xmm1 - vextracti128 $0x1,%ymm8,%xmm8 - vpaddd %xmm8,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x8,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0x7,%xmm1,%xmm1 - vpshufd $0x93,%xmm0,%xmm0 - vpshufd $0x4e,%xmm3,%xmm3 - vpshufd $0x39,%xmm2,%xmm2 - vpaddd %xmm9,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x10,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0xc,%xmm1,%xmm1 - vextracti128 $0x1,%ymm9,%xmm9 - vpaddd %xmm9,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x8,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0x7,%xmm1,%xmm1 - vpshufd $0x39,%xmm0,%xmm0 - vpshufd $0x4e,%xmm3,%xmm3 - vpshufd $0x93,%xmm2,%xmm2 - decb %cl - jne .Lblake2s_compress_avx512_roundloop - vpxor %xmm10,%xmm0,%xmm0 - vpxor %xmm11,%xmm1,%xmm1 - vpxor %xmm2,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - decq %rdx - jne .Lblake2s_compress_avx512_mainloop - vmovdqu %xmm0,(%rdi) - vmovdqu %xmm1,0x10(%rdi) - vmovdqu %xmm4,0x20(%rdi) - vzeroupper - RET -SYM_FUNC_END(blake2s_compress_avx512) -#endif /* CONFIG_AS_AVX512 */ diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c deleted file mode 100644 index 0313f9673f56..000000000000 --- a/arch/x86/crypto/blake2s-glue.c +++ /dev/null @@ -1,74 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. 
Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - */ - -#include <crypto/internal/blake2s.h> - -#include <linux/types.h> -#include <linux/jump_label.h> -#include <linux/kernel.h> -#include <linux/sizes.h> - -#include <asm/cpufeature.h> -#include <asm/fpu/api.h> -#include <asm/processor.h> -#include <asm/simd.h> - -asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, - const u8 *block, const size_t nblocks, - const u32 inc); -asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, - const u8 *block, const size_t nblocks, - const u32 inc); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); - -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) -{ - /* SIMD disables preemption, so relax after processing each page. */ - BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); - - if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { - blake2s_compress_generic(state, block, nblocks, inc); - return; - } - - do { - const size_t blocks = min_t(size_t, nblocks, - SZ_4K / BLAKE2S_BLOCK_SIZE); - - kernel_fpu_begin(); - if (IS_ENABLED(CONFIG_AS_AVX512) && - static_branch_likely(&blake2s_use_avx512)) - blake2s_compress_avx512(state, block, blocks, inc); - else - blake2s_compress_ssse3(state, block, blocks, inc); - kernel_fpu_end(); - - nblocks -= blocks; - block += blocks * BLAKE2S_BLOCK_SIZE; - } while (nblocks); -} -EXPORT_SYMBOL(blake2s_compress); - -static int __init blake2s_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - static_branch_enable(&blake2s_use_ssse3); - - if (IS_ENABLED(CONFIG_AS_AVX512) && - boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_AVX512F) && - boot_cpu_has(X86_FEATURE_AVX512VL) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | - XFEATURE_MASK_AVX512, NULL)) - static_branch_enable(&blake2s_use_avx512); - - return 0; -} - -subsys_initcall(blake2s_mod_init); diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 552f2df0643f..26c5f2ee5d10 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -94,7 +94,6 @@ static struct crypto_alg bf_cipher_alg = { .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = BF_BLOCK_SIZE, .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 0, .cra_module = THIS_MODULE, .cra_u = { .cipher = { diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index 646477a13e11..1dfef28c1266 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -16,6 +16,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/frame.h> #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -882,7 +883,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk16) jmp .Ldec_max24; SYM_FUNC_END(__camellia_dec_blk16) -SYM_FUNC_START(camellia_ecb_enc_16way) +SYM_TYPED_FUNC_START(camellia_ecb_enc_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -907,7 +908,7 @@ SYM_FUNC_START(camellia_ecb_enc_16way) RET; SYM_FUNC_END(camellia_ecb_enc_16way) -SYM_FUNC_START(camellia_ecb_dec_16way) +SYM_TYPED_FUNC_START(camellia_ecb_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -937,7 +938,7 @@ SYM_FUNC_START(camellia_ecb_dec_16way) RET; SYM_FUNC_END(camellia_ecb_dec_16way) -SYM_FUNC_START(camellia_cbc_dec_16way) +SYM_TYPED_FUNC_START(camellia_cbc_dec_16way) /* input: * %rdi: 
ctx, CTX * %rsi: dst (16 blocks) diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index a0eb94e53b1b..b1c9b9450555 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -6,6 +6,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/frame.h> #define CAMELLIA_TABLE_BYTE_LEN 272 diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S index 816b6bb8bded..824cb94de6c2 100644 --- a/arch/x86/crypto/camellia-x86_64-asm_64.S +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -6,6 +6,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> .file "camellia-x86_64-asm_64.S" .text @@ -177,7 +178,7 @@ bswapq RAB0; \ movq RAB0, 4*2(RIO); -SYM_FUNC_START(__camellia_enc_blk) +SYM_TYPED_FUNC_START(__camellia_enc_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -224,7 +225,7 @@ SYM_FUNC_START(__camellia_enc_blk) RET; SYM_FUNC_END(__camellia_enc_blk) -SYM_FUNC_START(camellia_dec_blk) +SYM_TYPED_FUNC_START(camellia_dec_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -411,7 +412,7 @@ SYM_FUNC_END(camellia_dec_blk) bswapq RAB1; \ movq RAB1, 12*2(RIO); -SYM_FUNC_START(__camellia_enc_blk_2way) +SYM_TYPED_FUNC_START(__camellia_enc_blk_2way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -460,7 +461,7 @@ SYM_FUNC_START(__camellia_enc_blk_2way) RET; SYM_FUNC_END(__camellia_enc_blk_2way) -SYM_FUNC_START(camellia_dec_blk_2way) +SYM_TYPED_FUNC_START(camellia_dec_blk_2way) /* input: * %rdi: ctx, CTX * %rsi: dst diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index e7e4d64e9577..2d2f4e16537c 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -6,7 +6,6 @@ */ #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> @@ -69,10 +68,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg camellia_algs[] = { { - .base.cra_name = "__ecb(camellia)", - .base.cra_driver_name = "__ecb-camellia-aesni-avx2", + .base.cra_name = "ecb(camellia)", + .base.cra_driver_name = "ecb-camellia-aesni-avx2", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct camellia_ctx), .base.cra_module = THIS_MODULE, @@ -82,10 +80,9 @@ static struct skcipher_alg camellia_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(camellia)", - .base.cra_driver_name = "__cbc-camellia-aesni-avx2", + .base.cra_name = "cbc(camellia)", + .base.cra_driver_name = "cbc-camellia-aesni-avx2", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct camellia_ctx), .base.cra_module = THIS_MODULE, @@ -98,8 +95,6 @@ static struct skcipher_alg camellia_algs[] = { }, }; -static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)]; - static int __init camellia_aesni_init(void) { const char *feature_name; @@ -118,15 +113,13 @@ static int __init camellia_aesni_init(void) return -ENODEV; } - return simd_register_skciphers_compat(camellia_algs, - ARRAY_SIZE(camellia_algs), - camellia_simd_algs); + return crypto_register_skciphers(camellia_algs, + ARRAY_SIZE(camellia_algs)); } static void __exit camellia_aesni_fini(void) { - simd_unregister_skciphers(camellia_algs, 
ARRAY_SIZE(camellia_algs), - camellia_simd_algs); + crypto_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs)); } module_init(camellia_aesni_init); diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index c7ccf63e741e..a7d162388142 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -6,7 +6,6 @@ */ #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> @@ -69,10 +68,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg camellia_algs[] = { { - .base.cra_name = "__ecb(camellia)", - .base.cra_driver_name = "__ecb-camellia-aesni", + .base.cra_name = "ecb(camellia)", + .base.cra_driver_name = "ecb-camellia-aesni", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct camellia_ctx), .base.cra_module = THIS_MODULE, @@ -82,10 +80,9 @@ static struct skcipher_alg camellia_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(camellia)", - .base.cra_driver_name = "__cbc-camellia-aesni", + .base.cra_name = "cbc(camellia)", + .base.cra_driver_name = "cbc-camellia-aesni", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct camellia_ctx), .base.cra_module = THIS_MODULE, @@ -98,8 +95,6 @@ static struct skcipher_alg camellia_algs[] = { } }; -static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)]; - static int __init camellia_aesni_init(void) { const char *feature_name; @@ -117,15 +112,13 @@ static int __init camellia_aesni_init(void) return -ENODEV; } - return simd_register_skciphers_compat(camellia_algs, - ARRAY_SIZE(camellia_algs), - camellia_simd_algs); + return crypto_register_skciphers(camellia_algs, + ARRAY_SIZE(camellia_algs)); } static void __exit camellia_aesni_fini(void) { - simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs), - camellia_simd_algs); + crypto_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs)); } module_init(camellia_aesni_init); diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index d45e9c0c42ac..3bd37d664121 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -8,7 +8,7 @@ * Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation) */ -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/crypto.h> #include <linux/init.h> #include <linux/module.h> @@ -1313,7 +1313,6 @@ static struct crypto_alg camellia_cipher_alg = { .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_ctx), - .cra_alignmask = 0, .cra_module = THIS_MODULE, .cra_u = { .cipher = { diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index b4e460a87f18..fb95a614249d 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -487,79 +487,3 @@ SYM_FUNC_START(cast5_cbc_dec_16way) FRAME_END RET; SYM_FUNC_END(cast5_cbc_dec_16way) - -SYM_FUNC_START(cast5_ctr_16way) - /* input: - * %rdi: ctx - * %rsi: dst - * %rdx: src - * %rcx: iv (big endian, 64bit) - */ - FRAME_BEGIN - pushq %r12; - pushq %r15; - - movq %rdi, CTX; - movq %rsi, %r11; - movq %rdx, %r12; - - vpcmpeqd RTMP, RTMP, RTMP; - vpsrldq 
$8, RTMP, RTMP; /* low: -1, high: 0 */ - - vpcmpeqd RKR, RKR, RKR; - vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */ - vmovdqa .Lbswap_iv_mask(%rip), R1ST; - vmovdqa .Lbswap128_mask(%rip), RKM; - - /* load IV and byteswap */ - vmovq (%rcx), RX; - vpshufb R1ST, RX, RX; - - /* construct IVs */ - vpsubq RTMP, RX, RX; /* le: IV1, IV0 */ - vpshufb RKM, RX, RL1; /* be: IV0, IV1 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RR1; /* be: IV2, IV3 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RL2; /* be: IV4, IV5 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RR2; /* be: IV6, IV7 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RL3; /* be: IV8, IV9 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RR3; /* be: IV10, IV11 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RL4; /* be: IV12, IV13 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RR4; /* be: IV14, IV15 */ - - /* store last IV */ - vpsubq RTMP, RX, RX; /* le: IV16, IV14 */ - vpshufb R1ST, RX, RX; /* be: IV16, IV16 */ - vmovq RX, (%rcx); - - call __cast5_enc_blk16; - - /* dst = src ^ iv */ - vpxor (0*16)(%r12), RR1, RR1; - vpxor (1*16)(%r12), RL1, RL1; - vpxor (2*16)(%r12), RR2, RR2; - vpxor (3*16)(%r12), RL2, RL2; - vpxor (4*16)(%r12), RR3, RR3; - vpxor (5*16)(%r12), RL3, RL3; - vpxor (6*16)(%r12), RR4, RR4; - vpxor (7*16)(%r12), RL4, RL4; - vmovdqu RR1, (0*16)(%r11); - vmovdqu RL1, (1*16)(%r11); - vmovdqu RR2, (2*16)(%r11); - vmovdqu RL2, (3*16)(%r11); - vmovdqu RR3, (4*16)(%r11); - vmovdqu RL3, (5*16)(%r11); - vmovdqu RR4, (6*16)(%r11); - vmovdqu RL4, (7*16)(%r11); - - popq %r15; - popq %r12; - FRAME_END - RET; -SYM_FUNC_END(cast5_ctr_16way) diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 3976a87f92ad..3aca04d43b34 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -8,7 +8,6 @@ #include <crypto/algapi.h> #include <crypto/cast5.h> -#include <crypto/internal/simd.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> @@ -64,10 +63,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg cast5_algs[] = { { - .base.cra_name = "__ecb(cast5)", - .base.cra_driver_name = "__ecb-cast5-avx", + .base.cra_name = "ecb(cast5)", + .base.cra_driver_name = "ecb-cast5-avx", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAST5_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast5_ctx), .base.cra_module = THIS_MODULE, @@ -77,10 +75,9 @@ static struct skcipher_alg cast5_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(cast5)", - .base.cra_driver_name = "__cbc-cast5-avx", + .base.cra_name = "cbc(cast5)", + .base.cra_driver_name = "cbc-cast5-avx", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAST5_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast5_ctx), .base.cra_module = THIS_MODULE, @@ -93,8 +90,6 @@ static struct skcipher_alg cast5_algs[] = { } }; -static struct simd_skcipher_alg *cast5_simd_algs[ARRAY_SIZE(cast5_algs)]; - static int __init cast5_init(void) { const char *feature_name; @@ -105,15 +100,13 @@ static int __init cast5_init(void) return -ENODEV; } - return simd_register_skciphers_compat(cast5_algs, - ARRAY_SIZE(cast5_algs), - cast5_simd_algs); + return crypto_register_skciphers(cast5_algs, + ARRAY_SIZE(cast5_algs)); } static void __exit cast5_exit(void) { - simd_unregister_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs), - cast5_simd_algs); + crypto_unregister_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs)); } module_init(cast5_init); 
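The conversion repeated across these glue files is mechanical: the "__"-prefixed internal-only algorithm variants, their CRYPTO_ALG_INTERNAL flags, and the simd_skcipher wrapper arrays all go away, and the skcipher_alg arrays are registered directly. A hypothetical minimal glue module following the same shape (all names and the feature test are illustrative, not taken from the patch):

	#include <crypto/internal/skcipher.h>
	#include <linux/module.h>
	#include <asm/cpufeature.h>

	static struct skcipher_alg example_algs[] = { {
		.base.cra_name		= "ecb(example)",
		.base.cra_driver_name	= "ecb-example-avx",
		.base.cra_priority	= 200,
		.base.cra_blocksize	= 16,
		.base.cra_module	= THIS_MODULE,
		/* .min_keysize/.max_keysize/.setkey/.encrypt/.decrypt elided */
	} };

	static int __init example_init(void)
	{
		/* bail out on CPUs without the required ISA extension */
		if (!boot_cpu_has(X86_FEATURE_AVX))
			return -ENODEV;
		return crypto_register_skciphers(example_algs,
						 ARRAY_SIZE(example_algs));
	}

	static void __exit example_exit(void)
	{
		crypto_unregister_skciphers(example_algs,
					    ARRAY_SIZE(example_algs));
	}

	module_init(example_init);
	module_exit(example_exit);
	MODULE_LICENSE("GPL");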
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 7e2aea372349..c4dd28c30303 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -14,7 +14,6 @@ #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/cast6.h> -#include <crypto/internal/simd.h> #include "ecb_cbc_helpers.h" @@ -64,10 +63,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg cast6_algs[] = { { - .base.cra_name = "__ecb(cast6)", - .base.cra_driver_name = "__ecb-cast6-avx", + .base.cra_name = "ecb(cast6)", + .base.cra_driver_name = "ecb-cast6-avx", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAST6_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast6_ctx), .base.cra_module = THIS_MODULE, @@ -77,10 +75,9 @@ static struct skcipher_alg cast6_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(cast6)", - .base.cra_driver_name = "__cbc-cast6-avx", + .base.cra_name = "cbc(cast6)", + .base.cra_driver_name = "cbc-cast6-avx", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAST6_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast6_ctx), .base.cra_module = THIS_MODULE, @@ -93,8 +90,6 @@ static struct skcipher_alg cast6_algs[] = { }, }; -static struct simd_skcipher_alg *cast6_simd_algs[ARRAY_SIZE(cast6_algs)]; - static int __init cast6_init(void) { const char *feature_name; @@ -105,15 +100,12 @@ static int __init cast6_init(void) return -ENODEV; } - return simd_register_skciphers_compat(cast6_algs, - ARRAY_SIZE(cast6_algs), - cast6_simd_algs); + return crypto_register_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs)); } static void __exit cast6_exit(void) { - simd_unregister_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs), - cast6_simd_algs); + crypto_unregister_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs)); } module_init(cast6_init); diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S deleted file mode 100644 index f3d8fc018249..000000000000 --- a/arch/x86/crypto/chacha-avx2-x86_64.S +++ /dev/null @@ -1,1021 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * ChaCha 256-bit cipher algorithm, x64 AVX2 functions - * - * Copyright (C) 2015 Martin Willi - */ - -#include <linux/linkage.h> - -.section .rodata.cst32.ROT8, "aM", @progbits, 32 -.align 32 -ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 - .octa 0x0e0d0c0f0a09080b0605040702010003 - -.section .rodata.cst32.ROT16, "aM", @progbits, 32 -.align 32 -ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 - .octa 0x0d0c0f0e09080b0a0504070601000302 - -.section .rodata.cst32.CTRINC, "aM", @progbits, 32 -.align 32 -CTRINC: .octa 0x00000003000000020000000100000000 - .octa 0x00000007000000060000000500000004 - -.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 -.align 32 -CTR2BL: .octa 0x00000000000000000000000000000000 - .octa 0x00000000000000000000000000000001 - -.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 -.align 32 -CTR4BL: .octa 0x00000000000000000000000000000002 - .octa 0x00000000000000000000000000000003 - -.text - -SYM_FUNC_START(chacha_2block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 2 data blocks output, o - # %rdx: up to 2 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts two ChaCha blocks by loading the state - # matrix twice across four AVX registers. 
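The rotl32() shorthand used in the round comments of all the ChaCha functions below is the standard quarter-round. As a scalar C reference (textbook code, not the kernel's):

	#include <stdint.h>

	static inline uint32_t rotl32(uint32_t x, int r)
	{
		return (x << r) | (x >> (32 - r));
	}

	/* One quarter-round; a double round applies this to the four
	 * columns of the 4x4 state and then to the four diagonals. */
	static void chacha_quarterround(uint32_t x[16], int a, int b,
					int c, int d)
	{
		x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
		x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
		x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
		x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
	}

The byte-aligned rotations by 16 and 8 map to pshufb with the ROT16/ROT8 masks; the rotations by 12 and 7 need the vpslld/vpsrld/vpor triples seen in each double round.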
It performs matrix operations - # on four words in each matrix in parallel, but requires shuffling to - # rearrange the words after each round. - - vzeroupper - - # x0..3[0-2] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - - vmovdqa %ymm0,%ymm8 - vmovdqa %ymm1,%ymm9 - vmovdqa %ymm2,%ymm10 - vmovdqa %ymm3,%ymm11 - - vmovdqa ROT8(%rip),%ymm4 - vmovdqa ROT16(%rip),%ymm5 - - mov %rcx,%rax - -.Ldoubleround: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm5,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm6 - vpslld $12,%ymm6,%ymm6 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm6,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm4,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm7 - vpslld $7,%ymm7,%ymm7 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm5,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm6 - vpslld $12,%ymm6,%ymm6 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm6,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm4,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm7 - vpslld $7,%ymm7,%ymm7 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - - sub $2,%r8d - jnz .Ldoubleround - - # o0 = i0 ^ (x0 + s0) - vpaddd %ymm8,%ymm0,%ymm7 - cmp $0x10,%rax - jl .Lxorpart2 - vpxor 0x00(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x00(%rsi) - vextracti128 $1,%ymm7,%xmm0 - # o1 = i1 ^ (x1 + s1) - vpaddd %ymm9,%ymm1,%ymm7 - cmp $0x20,%rax - jl .Lxorpart2 - vpxor 0x10(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x10(%rsi) - vextracti128 $1,%ymm7,%xmm1 - # o2 = i2 ^ (x2 + s2) - vpaddd %ymm10,%ymm2,%ymm7 - cmp $0x30,%rax - jl .Lxorpart2 - vpxor 0x20(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x20(%rsi) - vextracti128 $1,%ymm7,%xmm2 - # o3 = i3 ^ (x3 + s3) - vpaddd %ymm11,%ymm3,%ymm7 - cmp $0x40,%rax - jl .Lxorpart2 - vpxor 0x30(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x30(%rsi) - vextracti128 $1,%ymm7,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm7 - cmp $0x50,%rax - jl .Lxorpart2 - vpxor 0x40(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x40(%rsi) - - vmovdqa %xmm1,%xmm7 - cmp $0x60,%rax - jl .Lxorpart2 - vpxor 0x50(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x50(%rsi) - - vmovdqa %xmm2,%xmm7 - cmp $0x70,%rax - jl .Lxorpart2 - vpxor 0x60(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x60(%rsi) - - vmovdqa %xmm3,%xmm7 - cmp $0x80,%rax - jl .Lxorpart2 - vpxor 0x70(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x70(%rsi) - -.Ldone2: - vzeroupper - RET - -.Lxorpart2: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and 
$0x0f,%r9 - jz .Ldone2 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%xmm7,%xmm7 - vmovdqa %xmm7,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone2 - -SYM_FUNC_END(chacha_2block_xor_avx2) - -SYM_FUNC_START(chacha_4block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts four ChaCha blocks by loading the state - # matrix four times across eight AVX registers. It performs matrix - # operations on four words in two matrices in parallel, sequentially - # to the operations on the four words of the other two matrices. The - # required word shuffling has a rather high latency, we can do the - # arithmetic on two matrix-pairs without much slowdown. - - vzeroupper - - # x0..3[0-4] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vmovdqa %ymm0,%ymm4 - vmovdqa %ymm1,%ymm5 - vmovdqa %ymm2,%ymm6 - vmovdqa %ymm3,%ymm7 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - vpaddd CTR4BL(%rip),%ymm7,%ymm7 - - vmovdqa %ymm0,%ymm11 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm3,%ymm14 - vmovdqa %ymm7,%ymm15 - - vmovdqa ROT8(%rip),%ymm8 - vmovdqa ROT16(%rip),%ymm9 - - mov %rcx,%rax - -.Ldoubleround4: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm9,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm9,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - vpshufd $0x39,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - vpshufd $0x93,%ymm7,%ymm7 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm9,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm9,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd 
%ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - vpshufd $0x93,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - vpshufd $0x39,%ymm7,%ymm7 - - sub $2,%r8d - jnz .Ldoubleround4 - - # o0 = i0 ^ (x0 + s0), first block - vpaddd %ymm11,%ymm0,%ymm10 - cmp $0x10,%rax - jl .Lxorpart4 - vpxor 0x00(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x00(%rsi) - vextracti128 $1,%ymm10,%xmm0 - # o1 = i1 ^ (x1 + s1), first block - vpaddd %ymm12,%ymm1,%ymm10 - cmp $0x20,%rax - jl .Lxorpart4 - vpxor 0x10(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x10(%rsi) - vextracti128 $1,%ymm10,%xmm1 - # o2 = i2 ^ (x2 + s2), first block - vpaddd %ymm13,%ymm2,%ymm10 - cmp $0x30,%rax - jl .Lxorpart4 - vpxor 0x20(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x20(%rsi) - vextracti128 $1,%ymm10,%xmm2 - # o3 = i3 ^ (x3 + s3), first block - vpaddd %ymm14,%ymm3,%ymm10 - cmp $0x40,%rax - jl .Lxorpart4 - vpxor 0x30(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x30(%rsi) - vextracti128 $1,%ymm10,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm10 - cmp $0x50,%rax - jl .Lxorpart4 - vpxor 0x40(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x40(%rsi) - - vmovdqa %xmm1,%xmm10 - cmp $0x60,%rax - jl .Lxorpart4 - vpxor 0x50(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x50(%rsi) - - vmovdqa %xmm2,%xmm10 - cmp $0x70,%rax - jl .Lxorpart4 - vpxor 0x60(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x60(%rsi) - - vmovdqa %xmm3,%xmm10 - cmp $0x80,%rax - jl .Lxorpart4 - vpxor 0x70(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x70(%rsi) - - # o0 = i0 ^ (x0 + s0), third block - vpaddd %ymm11,%ymm4,%ymm10 - cmp $0x90,%rax - jl .Lxorpart4 - vpxor 0x80(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x80(%rsi) - vextracti128 $1,%ymm10,%xmm4 - # o1 = i1 ^ (x1 + s1), third block - vpaddd %ymm12,%ymm5,%ymm10 - cmp $0xa0,%rax - jl .Lxorpart4 - vpxor 0x90(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x90(%rsi) - vextracti128 $1,%ymm10,%xmm5 - # o2 = i2 ^ (x2 + s2), third block - vpaddd %ymm13,%ymm6,%ymm10 - cmp $0xb0,%rax - jl .Lxorpart4 - vpxor 0xa0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xa0(%rsi) - vextracti128 $1,%ymm10,%xmm6 - # o3 = i3 ^ (x3 + s3), third block - vpaddd %ymm15,%ymm7,%ymm10 - cmp $0xc0,%rax - jl .Lxorpart4 - vpxor 0xb0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xb0(%rsi) - vextracti128 $1,%ymm10,%xmm7 - - # xor and write fourth block - vmovdqa %xmm4,%xmm10 - cmp $0xd0,%rax - jl .Lxorpart4 - vpxor 0xc0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xc0(%rsi) - - vmovdqa %xmm5,%xmm10 - cmp $0xe0,%rax - jl .Lxorpart4 - vpxor 0xd0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xd0(%rsi) - - vmovdqa %xmm6,%xmm10 - cmp $0xf0,%rax - jl .Lxorpart4 - vpxor 0xe0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xe0(%rsi) - - vmovdqa %xmm7,%xmm10 - cmp $0x100,%rax - jl .Lxorpart4 - vpxor 0xf0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xf0(%rsi) - -.Ldone4: - vzeroupper - RET - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone4 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - 
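The .Lxorpart4 path that continues below (like .Lxorpart2 before it) handles a trailing partial 16-byte chunk: the leftover input bytes are bounced through an aligned stack buffer with rep movsb so a full-register vpxor can be used, then only the valid bytes are copied back out. A C sketch of the idea, assuming fewer than 16 bytes remain and ks holds the next 16 keystream bytes (names are illustrative):

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	static void chacha_xor_tail(uint8_t *dst, const uint8_t *src,
				    const uint8_t ks[16], size_t len)
	{
		uint8_t buf[16];

		memcpy(buf, src, len);		/* copy in: rep movsb */
		for (size_t i = 0; i < len; i++)
			buf[i] ^= ks[i];	/* vpxor on the whole register */
		memcpy(dst, buf, len);		/* copy out: rep movsb */
	}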
sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%xmm10,%xmm10 - vmovdqa %xmm10,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone4 - -SYM_FUNC_END(chacha_4block_xor_avx2) - -SYM_FUNC_START(chacha_8block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 8 data blocks output, o - # %rdx: up to 8 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts eight consecutive ChaCha blocks by loading - # the state matrix in AVX registers eight times. As we need some - # scratch registers, we save the first four registers on the stack. The - # algorithm performs each operation on the corresponding word of each - # state matrix, hence requires no word shuffling. For final XORing step - # we transpose the matrix by interleaving 32-, 64- and then 128-bit - # words, which allows us to do XOR in AVX registers. 8/16-bit word - # rotation is done with the slightly better performing byte shuffling, - # 7/12-bit word rotation uses traditional shift+OR. - - vzeroupper - # 4 * 32 byte stack, 32-byte aligned - lea 8(%rsp),%r10 - and $~31, %rsp - sub $0x80, %rsp - mov %rcx,%rax - - # x0..15[0-7] = s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpbroadcastd 0x04(%rdi),%ymm1 - vpbroadcastd 0x08(%rdi),%ymm2 - vpbroadcastd 0x0c(%rdi),%ymm3 - vpbroadcastd 0x10(%rdi),%ymm4 - vpbroadcastd 0x14(%rdi),%ymm5 - vpbroadcastd 0x18(%rdi),%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm7 - vpbroadcastd 0x20(%rdi),%ymm8 - vpbroadcastd 0x24(%rdi),%ymm9 - vpbroadcastd 0x28(%rdi),%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm11 - vpbroadcastd 0x30(%rdi),%ymm12 - vpbroadcastd 0x34(%rdi),%ymm13 - vpbroadcastd 0x38(%rdi),%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm15 - # x0..3 on stack - vmovdqa %ymm0,0x00(%rsp) - vmovdqa %ymm1,0x20(%rsp) - vmovdqa %ymm2,0x40(%rsp) - vmovdqa %ymm3,0x60(%rsp) - - vmovdqa CTRINC(%rip),%ymm1 - vmovdqa ROT8(%rip),%ymm2 - vmovdqa ROT16(%rip),%ymm3 - - # x12 += counter values 0-3 - vpaddd %ymm1,%ymm12,%ymm12 - -.Ldoubleround8: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - vpaddd 0x00(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm3,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - vpaddd 0x20(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm3,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - vpaddd 0x40(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm3,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vpaddd 0x60(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm3,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $12,%ymm4,%ymm0 - vpsrld $20,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $12,%ymm5,%ymm0 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $12,%ymm6,%ymm0 - vpsrld $20,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm11,%ymm7,%ymm7 - vpslld $12,%ymm7,%ymm0 - vpsrld $20,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - vpaddd 0x00(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm12,%ymm12 
- vpshufb %ymm2,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - vpaddd 0x20(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm2,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - vpaddd 0x40(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm2,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vpaddd 0x60(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm2,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm0 - vpsrld $25,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm0 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm0 - vpsrld $25,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm11,%ymm7,%ymm7 - vpslld $7,%ymm7,%ymm0 - vpsrld $25,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - vpaddd 0x00(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm3,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16) - vpaddd 0x20(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm3,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - vpaddd 0x40(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm3,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vpaddd 0x60(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm3,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - vpaddd %ymm15,%ymm10,%ymm10 - vpxor %ymm10,%ymm5,%ymm5 - vpslld $12,%ymm5,%ymm0 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - vpaddd %ymm12,%ymm11,%ymm11 - vpxor %ymm11,%ymm6,%ymm6 - vpslld $12,%ymm6,%ymm0 - vpsrld $20,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - vpaddd %ymm13,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpslld $12,%ymm7,%ymm0 - vpsrld $20,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm9,%ymm4,%ymm4 - vpslld $12,%ymm4,%ymm0 - vpsrld $20,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - vpaddd 0x00(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm2,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - vpaddd 0x20(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm2,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - vpaddd 0x40(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm2,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vpaddd 0x60(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm2,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - vpaddd %ymm15,%ymm10,%ymm10 - vpxor %ymm10,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm0 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - vpaddd %ymm12,%ymm11,%ymm11 - vpxor %ymm11,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm0 - vpsrld $25,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - vpaddd %ymm13,%ymm8,%ymm8 -
vpxor %ymm8,%ymm7,%ymm7 - vpslld $7,%ymm7,%ymm0 - vpsrld $25,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm9,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm0 - vpsrld $25,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - - sub $2,%r8d - jnz .Ldoubleround8 - - # x0..15[0-3] += s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpaddd 0x00(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpbroadcastd 0x04(%rdi),%ymm0 - vpaddd 0x20(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpbroadcastd 0x08(%rdi),%ymm0 - vpaddd 0x40(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpbroadcastd 0x0c(%rdi),%ymm0 - vpaddd 0x60(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpbroadcastd 0x10(%rdi),%ymm0 - vpaddd %ymm0,%ymm4,%ymm4 - vpbroadcastd 0x14(%rdi),%ymm0 - vpaddd %ymm0,%ymm5,%ymm5 - vpbroadcastd 0x18(%rdi),%ymm0 - vpaddd %ymm0,%ymm6,%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm0 - vpaddd %ymm0,%ymm7,%ymm7 - vpbroadcastd 0x20(%rdi),%ymm0 - vpaddd %ymm0,%ymm8,%ymm8 - vpbroadcastd 0x24(%rdi),%ymm0 - vpaddd %ymm0,%ymm9,%ymm9 - vpbroadcastd 0x28(%rdi),%ymm0 - vpaddd %ymm0,%ymm10,%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm0 - vpaddd %ymm0,%ymm11,%ymm11 - vpbroadcastd 0x30(%rdi),%ymm0 - vpaddd %ymm0,%ymm12,%ymm12 - vpbroadcastd 0x34(%rdi),%ymm0 - vpaddd %ymm0,%ymm13,%ymm13 - vpbroadcastd 0x38(%rdi),%ymm0 - vpaddd %ymm0,%ymm14,%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm0 - vpaddd %ymm0,%ymm15,%ymm15 - - # x12 += counter values 0-3 - vpaddd %ymm1,%ymm12,%ymm12 - - # interleave 32-bit words in state n, n+1 - vmovdqa 0x00(%rsp),%ymm0 - vmovdqa 0x20(%rsp),%ymm1 - vpunpckldq %ymm1,%ymm0,%ymm2 - vpunpckhdq %ymm1,%ymm0,%ymm1 - vmovdqa %ymm2,0x00(%rsp) - vmovdqa %ymm1,0x20(%rsp) - vmovdqa 0x40(%rsp),%ymm0 - vmovdqa 0x60(%rsp),%ymm1 - vpunpckldq %ymm1,%ymm0,%ymm2 - vpunpckhdq %ymm1,%ymm0,%ymm1 - vmovdqa %ymm2,0x40(%rsp) - vmovdqa %ymm1,0x60(%rsp) - vmovdqa %ymm4,%ymm0 - vpunpckldq %ymm5,%ymm0,%ymm4 - vpunpckhdq %ymm5,%ymm0,%ymm5 - vmovdqa %ymm6,%ymm0 - vpunpckldq %ymm7,%ymm0,%ymm6 - vpunpckhdq %ymm7,%ymm0,%ymm7 - vmovdqa %ymm8,%ymm0 - vpunpckldq %ymm9,%ymm0,%ymm8 - vpunpckhdq %ymm9,%ymm0,%ymm9 - vmovdqa %ymm10,%ymm0 - vpunpckldq %ymm11,%ymm0,%ymm10 - vpunpckhdq %ymm11,%ymm0,%ymm11 - vmovdqa %ymm12,%ymm0 - vpunpckldq %ymm13,%ymm0,%ymm12 - vpunpckhdq %ymm13,%ymm0,%ymm13 - vmovdqa %ymm14,%ymm0 - vpunpckldq %ymm15,%ymm0,%ymm14 - vpunpckhdq %ymm15,%ymm0,%ymm15 - - # interleave 64-bit words in state n, n+2 - vmovdqa 0x00(%rsp),%ymm0 - vmovdqa 0x40(%rsp),%ymm2 - vpunpcklqdq %ymm2,%ymm0,%ymm1 - vpunpckhqdq %ymm2,%ymm0,%ymm2 - vmovdqa %ymm1,0x00(%rsp) - vmovdqa %ymm2,0x40(%rsp) - vmovdqa 0x20(%rsp),%ymm0 - vmovdqa 0x60(%rsp),%ymm2 - vpunpcklqdq %ymm2,%ymm0,%ymm1 - vpunpckhqdq %ymm2,%ymm0,%ymm2 - vmovdqa %ymm1,0x20(%rsp) - vmovdqa %ymm2,0x60(%rsp) - vmovdqa %ymm4,%ymm0 - vpunpcklqdq %ymm6,%ymm0,%ymm4 - vpunpckhqdq %ymm6,%ymm0,%ymm6 - vmovdqa %ymm5,%ymm0 - vpunpcklqdq %ymm7,%ymm0,%ymm5 - vpunpckhqdq %ymm7,%ymm0,%ymm7 - vmovdqa %ymm8,%ymm0 - vpunpcklqdq %ymm10,%ymm0,%ymm8 - vpunpckhqdq %ymm10,%ymm0,%ymm10 - vmovdqa %ymm9,%ymm0 - vpunpcklqdq %ymm11,%ymm0,%ymm9 - vpunpckhqdq %ymm11,%ymm0,%ymm11 - vmovdqa %ymm12,%ymm0 - vpunpcklqdq %ymm14,%ymm0,%ymm12 - vpunpckhqdq %ymm14,%ymm0,%ymm14 - vmovdqa %ymm13,%ymm0 - vpunpcklqdq %ymm15,%ymm0,%ymm13 - vpunpckhqdq %ymm15,%ymm0,%ymm15 - - # interleave 128-bit words in state n, n+4 - # xor/write first four blocks - vmovdqa 0x00(%rsp),%ymm1 - vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 - cmp $0x0020,%rax - jl .Lxorpart8 - vpxor 0x0000(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0000(%rsi) - 
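The interleave sequence above is a transpose: after the rounds each ymm register holds one state word of all eight blocks, and the 32-, 64- and 128-bit unpack stages regroup the lanes into contiguous 64-byte blocks so they can be XORed against the input. One 32-bit stage, sketched with AVX2 intrinsics (illustrative, not the kernel's code):

	#include <immintrin.h>

	/* vpunpckldq/vpunpckhdq pair: interleave the 32-bit lanes of the
	 * registers holding state words n and n+1. */
	static void interleave32(__m256i *a, __m256i *b)
	{
		__m256i lo = _mm256_unpacklo_epi32(*a, *b);
		__m256i hi = _mm256_unpackhi_epi32(*a, *b);

		*a = lo;
		*b = hi;
	}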
vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 - - vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 - cmp $0x0040,%rax - jl .Lxorpart8 - vpxor 0x0020(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0020(%rsi) - vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - - vmovdqa 0x40(%rsp),%ymm1 - vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 - cmp $0x0060,%rax - jl .Lxorpart8 - vpxor 0x0040(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0040(%rsi) - vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 - - vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 - cmp $0x0080,%rax - jl .Lxorpart8 - vpxor 0x0060(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0060(%rsi) - vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 - - vmovdqa 0x20(%rsp),%ymm1 - vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 - cmp $0x00a0,%rax - jl .Lxorpart8 - vpxor 0x0080(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0080(%rsi) - vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 - - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - cmp $0x00c0,%rax - jl .Lxorpart8 - vpxor 0x00a0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00a0(%rsi) - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - - vmovdqa 0x60(%rsp),%ymm1 - vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 - cmp $0x00e0,%rax - jl .Lxorpart8 - vpxor 0x00c0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00c0(%rsi) - vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 - - vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 - cmp $0x0100,%rax - jl .Lxorpart8 - vpxor 0x00e0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00e0(%rsi) - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - - # xor remaining blocks, write to output - vmovdqa %ymm4,%ymm0 - cmp $0x0120,%rax - jl .Lxorpart8 - vpxor 0x0100(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0100(%rsi) - - vmovdqa %ymm12,%ymm0 - cmp $0x0140,%rax - jl .Lxorpart8 - vpxor 0x0120(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0120(%rsi) - - vmovdqa %ymm6,%ymm0 - cmp $0x0160,%rax - jl .Lxorpart8 - vpxor 0x0140(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0140(%rsi) - - vmovdqa %ymm14,%ymm0 - cmp $0x0180,%rax - jl .Lxorpart8 - vpxor 0x0160(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0160(%rsi) - - vmovdqa %ymm5,%ymm0 - cmp $0x01a0,%rax - jl .Lxorpart8 - vpxor 0x0180(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0180(%rsi) - - vmovdqa %ymm13,%ymm0 - cmp $0x01c0,%rax - jl .Lxorpart8 - vpxor 0x01a0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01a0(%rsi) - - vmovdqa %ymm7,%ymm0 - cmp $0x01e0,%rax - jl .Lxorpart8 - vpxor 0x01c0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01c0(%rsi) - - vmovdqa %ymm15,%ymm0 - cmp $0x0200,%rax - jl .Lxorpart8 - vpxor 0x01e0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01e0(%rsi) - -.Ldone8: - vzeroupper - lea -8(%r10),%rsp - RET - -.Lxorpart8: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x1f,%r9 - jz .Ldone8 - and $~0x1f,%rax - - mov %rsi,%r11 - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - jmp .Ldone8 - -SYM_FUNC_END(chacha_8block_xor_avx2) diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S deleted file mode 100644 index 259383e1ad44..000000000000 --- a/arch/x86/crypto/chacha-avx512vl-x86_64.S +++ /dev/null @@ -1,836 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions - * - * Copyright (C) 2018 Martin Willi - */ - -#include <linux/linkage.h> - -.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 -.align 32 -CTR2BL: .octa 0x00000000000000000000000000000000 - .octa 0x00000000000000000000000000000001 - -.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 -.align 32 -CTR4BL: .octa 0x00000000000000000000000000000002 - .octa 0x00000000000000000000000000000003 - 
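In the AVX-512VL functions that follow, every rotl32() of the reference code becomes a single vprold, replacing both the pshufb masks and the shift-and-OR pairs that the SSSE3/AVX2 versions need (which is also why this file carries no ROT8/ROT16 constants). With intrinsics, assuming AVX512VL support:

	#include <immintrin.h>

	/* Eight 32-bit lanes rotated left by 12 in one vprold. */
	static __m256i rotl32_by12(__m256i v)
	{
		return _mm256_rol_epi32(v, 12);
	}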
-.section .rodata.cst32.CTR8BL, "aM", @progbits, 32 -.align 32 -CTR8BL: .octa 0x00000003000000020000000100000000 - .octa 0x00000007000000060000000500000004 - -.text - -SYM_FUNC_START(chacha_2block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 2 data blocks output, o - # %rdx: up to 2 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts two ChaCha blocks by loading the state - # matrix twice across four AVX registers. It performs matrix operations - # on four words in each matrix in parallel, but requires shuffling to - # rearrange the words after each round. - - vzeroupper - - # x0..3[0-2] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - - vmovdqa %ymm0,%ymm8 - vmovdqa %ymm1,%ymm9 - vmovdqa %ymm2,%ymm10 - vmovdqa %ymm3,%ymm11 - -.Ldoubleround: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - - sub $2,%r8d - jnz .Ldoubleround - - # o0 = i0 ^ (x0 + s0) - vpaddd %ymm8,%ymm0,%ymm7 - cmp $0x10,%rcx - jl .Lxorpart2 - vpxord 0x00(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x00(%rsi) - vextracti128 $1,%ymm7,%xmm0 - # o1 = i1 ^ (x1 + s1) - vpaddd %ymm9,%ymm1,%ymm7 - cmp $0x20,%rcx - jl .Lxorpart2 - vpxord 0x10(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x10(%rsi) - vextracti128 $1,%ymm7,%xmm1 - # o2 = i2 ^ (x2 + s2) - vpaddd %ymm10,%ymm2,%ymm7 - cmp $0x30,%rcx - jl .Lxorpart2 - vpxord 0x20(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x20(%rsi) - vextracti128 $1,%ymm7,%xmm2 - # o3 = i3 ^ (x3 + s3) - vpaddd %ymm11,%ymm3,%ymm7 - cmp $0x40,%rcx - jl .Lxorpart2 - vpxord 0x30(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x30(%rsi) - vextracti128 $1,%ymm7,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm7 - cmp $0x50,%rcx - jl .Lxorpart2 - vpxord 0x40(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x40(%rsi) - - vmovdqa %xmm1,%xmm7 - cmp $0x60,%rcx - jl .Lxorpart2 - vpxord 0x50(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x50(%rsi) - - vmovdqa %xmm2,%xmm7 - cmp $0x70,%rcx - jl .Lxorpart2 - vpxord 0x60(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x60(%rsi) - - vmovdqa %xmm3,%xmm7 - cmp $0x80,%rcx - jl .Lxorpart2 - vpxord 0x70(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x70(%rsi) 
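The .Lxorpart2 tail below shows the AVX-512 replacement for the bounce-buffer trick: a byte mask of (1 << len) - 1 is built with shld, and vmovdqu8 with a zeroing mask loads, XORs and stores exactly the remaining bytes. Sketched with intrinsics (assumes AVX512BW+VL; names are illustrative):

	#include <immintrin.h>
	#include <stddef.h>
	#include <stdint.h>

	/* len < 16; ks holds the next 16 keystream bytes. */
	static void xor_tail_masked(uint8_t *dst, const uint8_t *src,
				    __m128i ks, size_t len)
	{
		__mmask16 m = (__mmask16)((1u << len) - 1);
		__m128i v = _mm_maskz_loadu_epi8(m, src);

		_mm_mask_storeu_epi8(dst, m, _mm_xor_si128(v, ks));
	}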
- -.Ldone2: - vzeroupper - RET - -.Lxorpart2: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0xf,%rcx - jz .Ldone2 - mov %rax,%r9 - and $~0xf,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} - vpxord %xmm7,%xmm1,%xmm1 - vmovdqu8 %xmm1,(%rsi,%r9){%k1} - - jmp .Ldone2 - -SYM_FUNC_END(chacha_2block_xor_avx512vl) - -SYM_FUNC_START(chacha_4block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts four ChaCha blocks by loading the state - # matrix four times across eight AVX registers. It performs matrix - # operations on four words in two matrices in parallel, sequentially - # to the operations on the four words of the other two matrices. The - # required word shuffling has a rather high latency, we can do the - # arithmetic on two matrix-pairs without much slowdown. - - vzeroupper - - # x0..3[0-4] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vmovdqa %ymm0,%ymm4 - vmovdqa %ymm1,%ymm5 - vmovdqa %ymm2,%ymm6 - vmovdqa %ymm3,%ymm7 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - vpaddd CTR4BL(%rip),%ymm7,%ymm7 - - vmovdqa %ymm0,%ymm11 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm3,%ymm14 - vmovdqa %ymm7,%ymm15 - -.Ldoubleround4: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $16,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - vpshufd $0x39,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - vpshufd $0x93,%ymm7,%ymm7 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $16,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - vpshufd $0x93,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd 
$0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - vpshufd $0x39,%ymm7,%ymm7 - - sub $2,%r8d - jnz .Ldoubleround4 - - # o0 = i0 ^ (x0 + s0), first block - vpaddd %ymm11,%ymm0,%ymm10 - cmp $0x10,%rcx - jl .Lxorpart4 - vpxord 0x00(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x00(%rsi) - vextracti128 $1,%ymm10,%xmm0 - # o1 = i1 ^ (x1 + s1), first block - vpaddd %ymm12,%ymm1,%ymm10 - cmp $0x20,%rcx - jl .Lxorpart4 - vpxord 0x10(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x10(%rsi) - vextracti128 $1,%ymm10,%xmm1 - # o2 = i2 ^ (x2 + s2), first block - vpaddd %ymm13,%ymm2,%ymm10 - cmp $0x30,%rcx - jl .Lxorpart4 - vpxord 0x20(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x20(%rsi) - vextracti128 $1,%ymm10,%xmm2 - # o3 = i3 ^ (x3 + s3), first block - vpaddd %ymm14,%ymm3,%ymm10 - cmp $0x40,%rcx - jl .Lxorpart4 - vpxord 0x30(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x30(%rsi) - vextracti128 $1,%ymm10,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm10 - cmp $0x50,%rcx - jl .Lxorpart4 - vpxord 0x40(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x40(%rsi) - - vmovdqa %xmm1,%xmm10 - cmp $0x60,%rcx - jl .Lxorpart4 - vpxord 0x50(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x50(%rsi) - - vmovdqa %xmm2,%xmm10 - cmp $0x70,%rcx - jl .Lxorpart4 - vpxord 0x60(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x60(%rsi) - - vmovdqa %xmm3,%xmm10 - cmp $0x80,%rcx - jl .Lxorpart4 - vpxord 0x70(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x70(%rsi) - - # o0 = i0 ^ (x0 + s0), third block - vpaddd %ymm11,%ymm4,%ymm10 - cmp $0x90,%rcx - jl .Lxorpart4 - vpxord 0x80(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x80(%rsi) - vextracti128 $1,%ymm10,%xmm4 - # o1 = i1 ^ (x1 + s1), third block - vpaddd %ymm12,%ymm5,%ymm10 - cmp $0xa0,%rcx - jl .Lxorpart4 - vpxord 0x90(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x90(%rsi) - vextracti128 $1,%ymm10,%xmm5 - # o2 = i2 ^ (x2 + s2), third block - vpaddd %ymm13,%ymm6,%ymm10 - cmp $0xb0,%rcx - jl .Lxorpart4 - vpxord 0xa0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xa0(%rsi) - vextracti128 $1,%ymm10,%xmm6 - # o3 = i3 ^ (x3 + s3), third block - vpaddd %ymm15,%ymm7,%ymm10 - cmp $0xc0,%rcx - jl .Lxorpart4 - vpxord 0xb0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xb0(%rsi) - vextracti128 $1,%ymm10,%xmm7 - - # xor and write fourth block - vmovdqa %xmm4,%xmm10 - cmp $0xd0,%rcx - jl .Lxorpart4 - vpxord 0xc0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xc0(%rsi) - - vmovdqa %xmm5,%xmm10 - cmp $0xe0,%rcx - jl .Lxorpart4 - vpxord 0xd0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xd0(%rsi) - - vmovdqa %xmm6,%xmm10 - cmp $0xf0,%rcx - jl .Lxorpart4 - vpxord 0xe0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xe0(%rsi) - - vmovdqa %xmm7,%xmm10 - cmp $0x100,%rcx - jl .Lxorpart4 - vpxord 0xf0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xf0(%rsi) - -.Ldone4: - vzeroupper - RET - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0xf,%rcx - jz .Ldone4 - mov %rax,%r9 - and $~0xf,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} - vpxord %xmm10,%xmm1,%xmm1 - vmovdqu8 %xmm1,(%rsi,%r9){%k1} - - jmp .Ldone4 - -SYM_FUNC_END(chacha_4block_xor_avx512vl) - -SYM_FUNC_START(chacha_8block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 8 data blocks output, o - # %rdx: up to 8 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts eight consecutive ChaCha blocks by loading - # the state matrix in AVX registers eight times. 
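The layout here is column-parallel: after the broadcasts, register i holds word i of all eight block states, so each vpaddd/vpxord/vprold advances eight blocks at once, and ymm16-ymm31 keep a pristine copy of the state in registers rather than spilling x0..3 to the stack as the AVX2 version must. Conceptually (a scalar sketch of the data layout only):

	#include <stdint.h>

	/* x[i][b] = word i of block b; each row x[i][0..7] lives in one
	 * ymm register in the vectorized code. */
	static void broadcast_state(uint32_t x[16][8], const uint32_t s[16])
	{
		for (int i = 0; i < 16; i++)
			for (int b = 0; b < 8; b++)
				x[i][b] = s[i];
		for (int b = 0; b < 8; b++)
			x[12][b] += b;	/* CTR8BL: per-block counters 0..7 */
	}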
Compared to AVX2, this - # mostly benefits from the new rotate instructions in VL and the - # additional registers. - - vzeroupper - - # x0..15[0-7] = s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpbroadcastd 0x04(%rdi),%ymm1 - vpbroadcastd 0x08(%rdi),%ymm2 - vpbroadcastd 0x0c(%rdi),%ymm3 - vpbroadcastd 0x10(%rdi),%ymm4 - vpbroadcastd 0x14(%rdi),%ymm5 - vpbroadcastd 0x18(%rdi),%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm7 - vpbroadcastd 0x20(%rdi),%ymm8 - vpbroadcastd 0x24(%rdi),%ymm9 - vpbroadcastd 0x28(%rdi),%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm11 - vpbroadcastd 0x30(%rdi),%ymm12 - vpbroadcastd 0x34(%rdi),%ymm13 - vpbroadcastd 0x38(%rdi),%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm15 - - # x12 += counter values 0-3 - vpaddd CTR8BL(%rip),%ymm12,%ymm12 - - vmovdqa64 %ymm0,%ymm16 - vmovdqa64 %ymm1,%ymm17 - vmovdqa64 %ymm2,%ymm18 - vmovdqa64 %ymm3,%ymm19 - vmovdqa64 %ymm4,%ymm20 - vmovdqa64 %ymm5,%ymm21 - vmovdqa64 %ymm6,%ymm22 - vmovdqa64 %ymm7,%ymm23 - vmovdqa64 %ymm8,%ymm24 - vmovdqa64 %ymm9,%ymm25 - vmovdqa64 %ymm10,%ymm26 - vmovdqa64 %ymm11,%ymm27 - vmovdqa64 %ymm12,%ymm28 - vmovdqa64 %ymm13,%ymm29 - vmovdqa64 %ymm14,%ymm30 - vmovdqa64 %ymm15,%ymm31 - -.Ldoubleround8: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - vpaddd %ymm0,%ymm4,%ymm0 - vpxord %ymm0,%ymm12,%ymm12 - vprold $16,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - vpaddd %ymm1,%ymm5,%ymm1 - vpxord %ymm1,%ymm13,%ymm13 - vprold $16,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - vpaddd %ymm2,%ymm6,%ymm2 - vpxord %ymm2,%ymm14,%ymm14 - vprold $16,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vpaddd %ymm3,%ymm7,%ymm3 - vpxord %ymm3,%ymm15,%ymm15 - vprold $16,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - vpaddd %ymm12,%ymm8,%ymm8 - vpxord %ymm8,%ymm4,%ymm4 - vprold $12,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - vpaddd %ymm13,%ymm9,%ymm9 - vpxord %ymm9,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - vpaddd %ymm14,%ymm10,%ymm10 - vpxord %ymm10,%ymm6,%ymm6 - vprold $12,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vpaddd %ymm15,%ymm11,%ymm11 - vpxord %ymm11,%ymm7,%ymm7 - vprold $12,%ymm7,%ymm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - vpaddd %ymm0,%ymm4,%ymm0 - vpxord %ymm0,%ymm12,%ymm12 - vprold $8,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - vpaddd %ymm1,%ymm5,%ymm1 - vpxord %ymm1,%ymm13,%ymm13 - vprold $8,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - vpaddd %ymm2,%ymm6,%ymm2 - vpxord %ymm2,%ymm14,%ymm14 - vprold $8,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vpaddd %ymm3,%ymm7,%ymm3 - vpxord %ymm3,%ymm15,%ymm15 - vprold $8,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - vpaddd %ymm12,%ymm8,%ymm8 - vpxord %ymm8,%ymm4,%ymm4 - vprold $7,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - vpaddd %ymm13,%ymm9,%ymm9 - vpxord %ymm9,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - vpaddd %ymm14,%ymm10,%ymm10 - vpxord %ymm10,%ymm6,%ymm6 - vprold $7,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vpaddd %ymm15,%ymm11,%ymm11 - vpxord %ymm11,%ymm7,%ymm7 - vprold $7,%ymm7,%ymm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - vpaddd %ymm0,%ymm5,%ymm0 - vpxord %ymm0,%ymm15,%ymm15 - vprold $16,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16) - vpaddd %ymm1,%ymm6,%ymm1 - vpxord %ymm1,%ymm12,%ymm12 - vprold $16,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - vpaddd %ymm2,%ymm7,%ymm2 - vpxord %ymm2,%ymm13,%ymm13 - vprold $16,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - 
vpaddd %ymm3,%ymm4,%ymm3 - vpxord %ymm3,%ymm14,%ymm14 - vprold $16,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - vpaddd %ymm15,%ymm10,%ymm10 - vpxord %ymm10,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - vpaddd %ymm12,%ymm11,%ymm11 - vpxord %ymm11,%ymm6,%ymm6 - vprold $12,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - vpaddd %ymm13,%ymm8,%ymm8 - vpxord %ymm8,%ymm7,%ymm7 - vprold $12,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vpaddd %ymm14,%ymm9,%ymm9 - vpxord %ymm9,%ymm4,%ymm4 - vprold $12,%ymm4,%ymm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - vpaddd %ymm0,%ymm5,%ymm0 - vpxord %ymm0,%ymm15,%ymm15 - vprold $8,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - vpaddd %ymm1,%ymm6,%ymm1 - vpxord %ymm1,%ymm12,%ymm12 - vprold $8,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - vpaddd %ymm2,%ymm7,%ymm2 - vpxord %ymm2,%ymm13,%ymm13 - vprold $8,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vpaddd %ymm3,%ymm4,%ymm3 - vpxord %ymm3,%ymm14,%ymm14 - vprold $8,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - vpaddd %ymm15,%ymm10,%ymm10 - vpxord %ymm10,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - vpaddd %ymm12,%ymm11,%ymm11 - vpxord %ymm11,%ymm6,%ymm6 - vprold $7,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - vpaddd %ymm13,%ymm8,%ymm8 - vpxord %ymm8,%ymm7,%ymm7 - vprold $7,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vpaddd %ymm14,%ymm9,%ymm9 - vpxord %ymm9,%ymm4,%ymm4 - vprold $7,%ymm4,%ymm4 - - sub $2,%r8d - jnz .Ldoubleround8 - - # x0..15[0-3] += s[0..15] - vpaddd %ymm16,%ymm0,%ymm0 - vpaddd %ymm17,%ymm1,%ymm1 - vpaddd %ymm18,%ymm2,%ymm2 - vpaddd %ymm19,%ymm3,%ymm3 - vpaddd %ymm20,%ymm4,%ymm4 - vpaddd %ymm21,%ymm5,%ymm5 - vpaddd %ymm22,%ymm6,%ymm6 - vpaddd %ymm23,%ymm7,%ymm7 - vpaddd %ymm24,%ymm8,%ymm8 - vpaddd %ymm25,%ymm9,%ymm9 - vpaddd %ymm26,%ymm10,%ymm10 - vpaddd %ymm27,%ymm11,%ymm11 - vpaddd %ymm28,%ymm12,%ymm12 - vpaddd %ymm29,%ymm13,%ymm13 - vpaddd %ymm30,%ymm14,%ymm14 - vpaddd %ymm31,%ymm15,%ymm15 - - # interleave 32-bit words in state n, n+1 - vpunpckldq %ymm1,%ymm0,%ymm16 - vpunpckhdq %ymm1,%ymm0,%ymm17 - vpunpckldq %ymm3,%ymm2,%ymm18 - vpunpckhdq %ymm3,%ymm2,%ymm19 - vpunpckldq %ymm5,%ymm4,%ymm20 - vpunpckhdq %ymm5,%ymm4,%ymm21 - vpunpckldq %ymm7,%ymm6,%ymm22 - vpunpckhdq %ymm7,%ymm6,%ymm23 - vpunpckldq %ymm9,%ymm8,%ymm24 - vpunpckhdq %ymm9,%ymm8,%ymm25 - vpunpckldq %ymm11,%ymm10,%ymm26 - vpunpckhdq %ymm11,%ymm10,%ymm27 - vpunpckldq %ymm13,%ymm12,%ymm28 - vpunpckhdq %ymm13,%ymm12,%ymm29 - vpunpckldq %ymm15,%ymm14,%ymm30 - vpunpckhdq %ymm15,%ymm14,%ymm31 - - # interleave 64-bit words in state n, n+2 - vpunpcklqdq %ymm18,%ymm16,%ymm0 - vpunpcklqdq %ymm19,%ymm17,%ymm1 - vpunpckhqdq %ymm18,%ymm16,%ymm2 - vpunpckhqdq %ymm19,%ymm17,%ymm3 - vpunpcklqdq %ymm22,%ymm20,%ymm4 - vpunpcklqdq %ymm23,%ymm21,%ymm5 - vpunpckhqdq %ymm22,%ymm20,%ymm6 - vpunpckhqdq %ymm23,%ymm21,%ymm7 - vpunpcklqdq %ymm26,%ymm24,%ymm8 - vpunpcklqdq %ymm27,%ymm25,%ymm9 - vpunpckhqdq %ymm26,%ymm24,%ymm10 - vpunpckhqdq %ymm27,%ymm25,%ymm11 - vpunpcklqdq %ymm30,%ymm28,%ymm12 - vpunpcklqdq %ymm31,%ymm29,%ymm13 - vpunpckhqdq %ymm30,%ymm28,%ymm14 - vpunpckhqdq %ymm31,%ymm29,%ymm15 - - # interleave 128-bit words in state n, n+4 - # xor/write first four blocks - vmovdqa64 %ymm0,%ymm16 - vperm2i128 $0x20,%ymm4,%ymm0,%ymm0 - cmp $0x0020,%rcx - jl .Lxorpart8 - vpxord 0x0000(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0000(%rsi) - vmovdqa64 %ymm16,%ymm0 - vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 - - 
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 - cmp $0x0040,%rcx - jl .Lxorpart8 - vpxord 0x0020(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0020(%rsi) - vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - - vperm2i128 $0x20,%ymm6,%ymm2,%ymm0 - cmp $0x0060,%rcx - jl .Lxorpart8 - vpxord 0x0040(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0040(%rsi) - vperm2i128 $0x31,%ymm6,%ymm2,%ymm6 - - vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 - cmp $0x0080,%rcx - jl .Lxorpart8 - vpxord 0x0060(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0060(%rsi) - vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 - - vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 - cmp $0x00a0,%rcx - jl .Lxorpart8 - vpxord 0x0080(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0080(%rsi) - vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 - - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - cmp $0x00c0,%rcx - jl .Lxorpart8 - vpxord 0x00a0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00a0(%rsi) - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - - vperm2i128 $0x20,%ymm7,%ymm3,%ymm0 - cmp $0x00e0,%rcx - jl .Lxorpart8 - vpxord 0x00c0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00c0(%rsi) - vperm2i128 $0x31,%ymm7,%ymm3,%ymm7 - - vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 - cmp $0x0100,%rcx - jl .Lxorpart8 - vpxord 0x00e0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00e0(%rsi) - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - - # xor remaining blocks, write to output - vmovdqa64 %ymm4,%ymm0 - cmp $0x0120,%rcx - jl .Lxorpart8 - vpxord 0x0100(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0100(%rsi) - - vmovdqa64 %ymm12,%ymm0 - cmp $0x0140,%rcx - jl .Lxorpart8 - vpxord 0x0120(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0120(%rsi) - - vmovdqa64 %ymm6,%ymm0 - cmp $0x0160,%rcx - jl .Lxorpart8 - vpxord 0x0140(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0140(%rsi) - - vmovdqa64 %ymm14,%ymm0 - cmp $0x0180,%rcx - jl .Lxorpart8 - vpxord 0x0160(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0160(%rsi) - - vmovdqa64 %ymm5,%ymm0 - cmp $0x01a0,%rcx - jl .Lxorpart8 - vpxord 0x0180(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0180(%rsi) - - vmovdqa64 %ymm13,%ymm0 - cmp $0x01c0,%rcx - jl .Lxorpart8 - vpxord 0x01a0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01a0(%rsi) - - vmovdqa64 %ymm7,%ymm0 - cmp $0x01e0,%rcx - jl .Lxorpart8 - vpxord 0x01c0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01c0(%rsi) - - vmovdqa64 %ymm15,%ymm0 - cmp $0x0200,%rcx - jl .Lxorpart8 - vpxord 0x01e0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01e0(%rsi) - -.Ldone8: - vzeroupper - RET - -.Lxorpart8: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0x1f,%rcx - jz .Ldone8 - mov %rax,%r9 - and $~0x1f,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z} - vpxord %ymm0,%ymm1,%ymm1 - vmovdqu8 %ymm1,(%rsi,%r9){%k1} - - jmp .Ldone8 - -SYM_FUNC_END(chacha_8block_xor_avx512vl) diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S deleted file mode 100644 index 7111949cd5b9..000000000000 --- a/arch/x86/crypto/chacha-ssse3-x86_64.S +++ /dev/null @@ -1,791 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions - * - * Copyright (C) 2015 Martin Willi - */ - -#include <linux/linkage.h> -#include <asm/frame.h> - -.section .rodata.cst16.ROT8, "aM", @progbits, 16 -.align 16 -ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 -.section .rodata.cst16.ROT16, "aM", @progbits, 16 -.align 16 -ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 -.section .rodata.cst16.CTRINC, "aM", @progbits, 16 -.align 16 -CTRINC: .octa 0x00000003000000020000000100000000 - -.text - -/* - * chacha_permute - permute one block - * - * Permute one 
64-byte block where the state matrix is in %xmm0-%xmm3. This - * function performs matrix operations on four words in parallel, but requires - * shuffling to rearrange the words after each round. 8/16-bit word rotation is - * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word - * rotation uses traditional shift+OR. - * - * The round count is given in %r8d. - * - * Clobbers: %r8d, %xmm4-%xmm7 - */ -SYM_FUNC_START_LOCAL(chacha_permute) - - movdqa ROT8(%rip),%xmm4 - movdqa ROT16(%rip),%xmm5 - -.Ldoubleround: - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm5,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm6 - pslld $12,%xmm6 - psrld $20,%xmm1 - por %xmm6,%xmm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm4,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm7 - pslld $7,%xmm7 - psrld $25,%xmm1 - por %xmm7,%xmm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - pshufd $0x39,%xmm1,%xmm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - pshufd $0x4e,%xmm2,%xmm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - pshufd $0x93,%xmm3,%xmm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm5,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm6 - pslld $12,%xmm6 - psrld $20,%xmm1 - por %xmm6,%xmm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm4,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm7 - pslld $7,%xmm7 - psrld $25,%xmm1 - por %xmm7,%xmm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - pshufd $0x93,%xmm1,%xmm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - pshufd $0x4e,%xmm2,%xmm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - pshufd $0x39,%xmm3,%xmm3 - - sub $2,%r8d - jnz .Ldoubleround - - RET -SYM_FUNC_END(chacha_permute) - -SYM_FUNC_START(chacha_block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 1 data block output, o - # %rdx: up to 1 data block input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - FRAME_BEGIN - - # x0..3 = s0..3 - movdqu 0x00(%rdi),%xmm0 - movdqu 0x10(%rdi),%xmm1 - movdqu 0x20(%rdi),%xmm2 - movdqu 0x30(%rdi),%xmm3 - movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - movdqa %xmm2,%xmm10 - movdqa %xmm3,%xmm11 - - mov %rcx,%rax - call chacha_permute - - # o0 = i0 ^ (x0 + s0) - paddd %xmm8,%xmm0 - cmp $0x10,%rax - jl .Lxorpart - movdqu 0x00(%rdx),%xmm4 - pxor %xmm4,%xmm0 - movdqu %xmm0,0x00(%rsi) - # o1 = i1 ^ (x1 + s1) - paddd %xmm9,%xmm1 - movdqa %xmm1,%xmm0 - cmp $0x20,%rax - jl .Lxorpart - movdqu 0x10(%rdx),%xmm0 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x10(%rsi) - # o2 = i2 ^ (x2 + s2) - paddd %xmm10,%xmm2 - movdqa %xmm2,%xmm0 - cmp $0x30,%rax - jl .Lxorpart - movdqu 0x20(%rdx),%xmm0 - pxor %xmm2,%xmm0 - movdqu %xmm0,0x20(%rsi) - # o3 = i3 ^ (x3 + s3) - paddd %xmm11,%xmm3 - movdqa %xmm3,%xmm0 - cmp $0x40,%rax - jl .Lxorpart - movdqu 0x30(%rdx),%xmm0 - pxor %xmm3,%xmm0 - movdqu %xmm0,0x30(%rsi) - -.Ldone: - FRAME_END - RET - -.Lxorpart: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - pxor 0x00(%rsp),%xmm0 - movdqa %xmm0,0x00(%rsp) - - mov %rsp,%rsi - lea 
(%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone - -SYM_FUNC_END(chacha_block_xor_ssse3) - -SYM_FUNC_START(hchacha_block_ssse3) - # %rdi: Input state matrix, s - # %rsi: output (8 32-bit words) - # %edx: nrounds - FRAME_BEGIN - - movdqu 0x00(%rdi),%xmm0 - movdqu 0x10(%rdi),%xmm1 - movdqu 0x20(%rdi),%xmm2 - movdqu 0x30(%rdi),%xmm3 - - mov %edx,%r8d - call chacha_permute - - movdqu %xmm0,0x00(%rsi) - movdqu %xmm3,0x10(%rsi) - - FRAME_END - RET -SYM_FUNC_END(hchacha_block_ssse3) - -SYM_FUNC_START(chacha_4block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts four consecutive ChaCha blocks by loading the - # the state matrix in SSE registers four times. As we need some scratch - # registers, we save the first four registers on the stack. The - # algorithm performs each operation on the corresponding word of each - # state matrix, hence requires no word shuffling. For final XORing step - # we transpose the matrix by interleaving 32- and then 64-bit words, - # which allows us to do XOR in SSE registers. 8/16-bit word rotation is - # done with the slightly better performing SSSE3 byte shuffling, - # 7/12-bit word rotation uses traditional shift+OR. - - lea 8(%rsp),%r10 - sub $0x80,%rsp - and $~63,%rsp - mov %rcx,%rax - - # x0..15[0-3] = s0..3[0..3] - movq 0x00(%rdi),%xmm1 - pshufd $0x00,%xmm1,%xmm0 - pshufd $0x55,%xmm1,%xmm1 - movq 0x08(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - movq 0x10(%rdi),%xmm5 - pshufd $0x00,%xmm5,%xmm4 - pshufd $0x55,%xmm5,%xmm5 - movq 0x18(%rdi),%xmm7 - pshufd $0x00,%xmm7,%xmm6 - pshufd $0x55,%xmm7,%xmm7 - movq 0x20(%rdi),%xmm9 - pshufd $0x00,%xmm9,%xmm8 - pshufd $0x55,%xmm9,%xmm9 - movq 0x28(%rdi),%xmm11 - pshufd $0x00,%xmm11,%xmm10 - pshufd $0x55,%xmm11,%xmm11 - movq 0x30(%rdi),%xmm13 - pshufd $0x00,%xmm13,%xmm12 - pshufd $0x55,%xmm13,%xmm13 - movq 0x38(%rdi),%xmm15 - pshufd $0x00,%xmm15,%xmm14 - pshufd $0x55,%xmm15,%xmm15 - # x0..3 on stack - movdqa %xmm0,0x00(%rsp) - movdqa %xmm1,0x10(%rsp) - movdqa %xmm2,0x20(%rsp) - movdqa %xmm3,0x30(%rsp) - - movdqa CTRINC(%rip),%xmm1 - movdqa ROT8(%rip),%xmm2 - movdqa ROT16(%rip),%xmm3 - - # x12 += counter values 0-3 - paddd %xmm1,%xmm12 - -.Ldoubleround4: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm3,%xmm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm3,%xmm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm3,%xmm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm3,%xmm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm4 - por %xmm0,%xmm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm5 - por %xmm0,%xmm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm6 - por %xmm0,%xmm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa 
%xmm7,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm7 - por %xmm0,%xmm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm2,%xmm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm2,%xmm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm2,%xmm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm2,%xmm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm4 - por %xmm0,%xmm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm5 - por %xmm0,%xmm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm6 - por %xmm0,%xmm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm7 - por %xmm0,%xmm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm3,%xmm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm3,%xmm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm3,%xmm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm3,%xmm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - paddd %xmm15,%xmm10 - pxor %xmm10,%xmm5 - movdqa %xmm5,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm5 - por %xmm0,%xmm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - paddd %xmm12,%xmm11 - pxor %xmm11,%xmm6 - movdqa %xmm6,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm6 - por %xmm0,%xmm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - paddd %xmm13,%xmm8 - pxor %xmm8,%xmm7 - movdqa %xmm7,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm7 - por %xmm0,%xmm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - paddd %xmm14,%xmm9 - pxor %xmm9,%xmm4 - movdqa %xmm4,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm4 - por %xmm0,%xmm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm2,%xmm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm2,%xmm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm2,%xmm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm2,%xmm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - paddd %xmm15,%xmm10 - pxor %xmm10,%xmm5 - movdqa %xmm5,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm5 - por %xmm0,%xmm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - paddd %xmm12,%xmm11 - pxor %xmm11,%xmm6 - movdqa %xmm6,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm6 - por %xmm0,%xmm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - paddd %xmm13,%xmm8 - 
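(The PSLLD/PSRLD/POR triples in this loop emulate the 7- and 12-bit rotates, which have no SSE instruction, while the 8- and 16-bit rotates use the cheaper PSHUFB byte shuffle through the ROT8/ROT16 masks at the top of this file. A hedged intrinsics sketch of the two techniques, assuming <immintrin.h> and SSSE3:

#include <immintrin.h>

/* rotl32 by 12 via shift+OR, the PSLLD/PSRLD/POR pattern above. */
static __m128i rotl32_12(__m128i v)
{
	return _mm_or_si128(_mm_slli_epi32(v, 12),
			    _mm_srli_epi32(v, 20));
}

/* rotl32 by 16 via a PSHUFB byte shuffle; this mask is the ROT16
 * constant from the top of this file, written out per byte. */
static __m128i rotl32_16(__m128i v)
{
	const __m128i rot16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10,
					   5, 4, 7, 6, 1, 0, 3, 2);
	return _mm_shuffle_epi8(v, rot16);
}
)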
pxor %xmm8,%xmm7 - movdqa %xmm7,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm7 - por %xmm0,%xmm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - paddd %xmm14,%xmm9 - pxor %xmm9,%xmm4 - movdqa %xmm4,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm4 - por %xmm0,%xmm4 - - sub $2,%r8d - jnz .Ldoubleround4 - - # x0[0-3] += s0[0] - # x1[0-3] += s0[1] - movq 0x00(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd 0x00(%rsp),%xmm2 - movdqa %xmm2,0x00(%rsp) - paddd 0x10(%rsp),%xmm3 - movdqa %xmm3,0x10(%rsp) - # x2[0-3] += s0[2] - # x3[0-3] += s0[3] - movq 0x08(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd 0x20(%rsp),%xmm2 - movdqa %xmm2,0x20(%rsp) - paddd 0x30(%rsp),%xmm3 - movdqa %xmm3,0x30(%rsp) - - # x4[0-3] += s1[0] - # x5[0-3] += s1[1] - movq 0x10(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm4 - paddd %xmm3,%xmm5 - # x6[0-3] += s1[2] - # x7[0-3] += s1[3] - movq 0x18(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm6 - paddd %xmm3,%xmm7 - - # x8[0-3] += s2[0] - # x9[0-3] += s2[1] - movq 0x20(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm8 - paddd %xmm3,%xmm9 - # x10[0-3] += s2[2] - # x11[0-3] += s2[3] - movq 0x28(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm10 - paddd %xmm3,%xmm11 - - # x12[0-3] += s3[0] - # x13[0-3] += s3[1] - movq 0x30(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm12 - paddd %xmm3,%xmm13 - # x14[0-3] += s3[2] - # x15[0-3] += s3[3] - movq 0x38(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm14 - paddd %xmm3,%xmm15 - - # x12 += counter values 0-3 - paddd %xmm1,%xmm12 - - # interleave 32-bit words in state n, n+1 - movdqa 0x00(%rsp),%xmm0 - movdqa 0x10(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpckldq %xmm1,%xmm2 - punpckhdq %xmm1,%xmm0 - movdqa %xmm2,0x00(%rsp) - movdqa %xmm0,0x10(%rsp) - movdqa 0x20(%rsp),%xmm0 - movdqa 0x30(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpckldq %xmm1,%xmm2 - punpckhdq %xmm1,%xmm0 - movdqa %xmm2,0x20(%rsp) - movdqa %xmm0,0x30(%rsp) - movdqa %xmm4,%xmm0 - punpckldq %xmm5,%xmm4 - punpckhdq %xmm5,%xmm0 - movdqa %xmm0,%xmm5 - movdqa %xmm6,%xmm0 - punpckldq %xmm7,%xmm6 - punpckhdq %xmm7,%xmm0 - movdqa %xmm0,%xmm7 - movdqa %xmm8,%xmm0 - punpckldq %xmm9,%xmm8 - punpckhdq %xmm9,%xmm0 - movdqa %xmm0,%xmm9 - movdqa %xmm10,%xmm0 - punpckldq %xmm11,%xmm10 - punpckhdq %xmm11,%xmm0 - movdqa %xmm0,%xmm11 - movdqa %xmm12,%xmm0 - punpckldq %xmm13,%xmm12 - punpckhdq %xmm13,%xmm0 - movdqa %xmm0,%xmm13 - movdqa %xmm14,%xmm0 - punpckldq %xmm15,%xmm14 - punpckhdq %xmm15,%xmm0 - movdqa %xmm0,%xmm15 - - # interleave 64-bit words in state n, n+2 - movdqa 0x00(%rsp),%xmm0 - movdqa 0x20(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpcklqdq %xmm1,%xmm2 - punpckhqdq %xmm1,%xmm0 - movdqa %xmm2,0x00(%rsp) - movdqa %xmm0,0x20(%rsp) - movdqa 0x10(%rsp),%xmm0 - movdqa 0x30(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpcklqdq %xmm1,%xmm2 - punpckhqdq %xmm1,%xmm0 - movdqa %xmm2,0x10(%rsp) - movdqa %xmm0,0x30(%rsp) - movdqa %xmm4,%xmm0 - punpcklqdq %xmm6,%xmm4 - punpckhqdq %xmm6,%xmm0 - movdqa %xmm0,%xmm6 - movdqa %xmm5,%xmm0 - punpcklqdq %xmm7,%xmm5 - punpckhqdq %xmm7,%xmm0 - movdqa %xmm0,%xmm7 - movdqa %xmm8,%xmm0 - punpcklqdq %xmm10,%xmm8 - punpckhqdq %xmm10,%xmm0 - movdqa %xmm0,%xmm10 - movdqa %xmm9,%xmm0 - punpcklqdq %xmm11,%xmm9 - punpckhqdq %xmm11,%xmm0 - movdqa %xmm0,%xmm11 - movdqa %xmm12,%xmm0 - punpcklqdq %xmm14,%xmm12 - punpckhqdq %xmm14,%xmm0 - 
movdqa %xmm0,%xmm14 - movdqa %xmm13,%xmm0 - punpcklqdq %xmm15,%xmm13 - punpckhqdq %xmm15,%xmm0 - movdqa %xmm0,%xmm15 - - # xor with corresponding input, write to output - movdqa 0x00(%rsp),%xmm0 - cmp $0x10,%rax - jl .Lxorpart4 - movdqu 0x00(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x00(%rsi) - - movdqu %xmm4,%xmm0 - cmp $0x20,%rax - jl .Lxorpart4 - movdqu 0x10(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x10(%rsi) - - movdqu %xmm8,%xmm0 - cmp $0x30,%rax - jl .Lxorpart4 - movdqu 0x20(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x20(%rsi) - - movdqu %xmm12,%xmm0 - cmp $0x40,%rax - jl .Lxorpart4 - movdqu 0x30(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x30(%rsi) - - movdqa 0x20(%rsp),%xmm0 - cmp $0x50,%rax - jl .Lxorpart4 - movdqu 0x40(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x40(%rsi) - - movdqu %xmm6,%xmm0 - cmp $0x60,%rax - jl .Lxorpart4 - movdqu 0x50(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x50(%rsi) - - movdqu %xmm10,%xmm0 - cmp $0x70,%rax - jl .Lxorpart4 - movdqu 0x60(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x60(%rsi) - - movdqu %xmm14,%xmm0 - cmp $0x80,%rax - jl .Lxorpart4 - movdqu 0x70(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x70(%rsi) - - movdqa 0x10(%rsp),%xmm0 - cmp $0x90,%rax - jl .Lxorpart4 - movdqu 0x80(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x80(%rsi) - - movdqu %xmm5,%xmm0 - cmp $0xa0,%rax - jl .Lxorpart4 - movdqu 0x90(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x90(%rsi) - - movdqu %xmm9,%xmm0 - cmp $0xb0,%rax - jl .Lxorpart4 - movdqu 0xa0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xa0(%rsi) - - movdqu %xmm13,%xmm0 - cmp $0xc0,%rax - jl .Lxorpart4 - movdqu 0xb0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xb0(%rsi) - - movdqa 0x30(%rsp),%xmm0 - cmp $0xd0,%rax - jl .Lxorpart4 - movdqu 0xc0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xc0(%rsi) - - movdqu %xmm7,%xmm0 - cmp $0xe0,%rax - jl .Lxorpart4 - movdqu 0xd0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xd0(%rsi) - - movdqu %xmm11,%xmm0 - cmp $0xf0,%rax - jl .Lxorpart4 - movdqu 0xe0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xe0(%rsi) - - movdqu %xmm15,%xmm0 - cmp $0x100,%rax - jl .Lxorpart4 - movdqu 0xf0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xf0(%rsi) - -.Ldone4: - lea -8(%r10),%rsp - RET - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone4 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - pxor 0x00(%rsp),%xmm0 - movdqa %xmm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - jmp .Ldone4 - -SYM_FUNC_END(chacha_4block_xor_ssse3) diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c deleted file mode 100644 index 7b3a1cf0984b..000000000000 --- a/arch/x86/crypto/chacha_glue.c +++ /dev/null @@ -1,317 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * x64 SIMD accelerated ChaCha and XChaCha stream ciphers, - * including ChaCha20 (RFC7539) - * - * Copyright (C) 2015 Martin Willi - */ - -#include <crypto/algapi.h> -#include <crypto/internal/chacha.h> -#include <crypto/internal/simd.h> -#include <crypto/internal/skcipher.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/sizes.h> -#include <asm/simd.h> - -asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void 
hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds); - -asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); - -asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); - -static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) -{ - len = min(len, maxblocks * CHACHA_BLOCK_SIZE); - return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; -} - -static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (IS_ENABLED(CONFIG_AS_AVX512) && - static_branch_likely(&chacha_use_avx512vl)) { - while (bytes >= CHACHA_BLOCK_SIZE * 8) { - chacha_8block_xor_avx512vl(state, dst, src, bytes, - nrounds); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha_8block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state[12] += chacha_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { - chacha_4block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state[12] += chacha_advance(bytes, 4); - return; - } - if (bytes) { - chacha_2block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state[12] += chacha_advance(bytes, 2); - return; - } - } - - if (static_branch_likely(&chacha_use_avx2)) { - while (bytes >= CHACHA_BLOCK_SIZE * 8) { - chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); - state[12] += chacha_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { - chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); - state[12] += chacha_advance(bytes, 4); - return; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); - state[12] += chacha_advance(bytes, 2); - return; - } - } - - while (bytes >= CHACHA_BLOCK_SIZE * 4) { - chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); - bytes -= CHACHA_BLOCK_SIZE * 4; - src += CHACHA_BLOCK_SIZE * 4; - dst += CHACHA_BLOCK_SIZE * 4; - state[12] += 4; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); - state[12] += chacha_advance(bytes, 4); - return; - } - if (bytes) { - chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); - state[12]++; - } -} - -void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) -{ - if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) { - hchacha_block_generic(state, stream, nrounds); - } else { - kernel_fpu_begin(); - hchacha_block_ssse3(state, stream, nrounds); - kernel_fpu_end(); - } -} 
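(A note on the tail handling in chacha_dosimd() above: each wide kernel accepts a partial byte count, and chacha_advance() then bumps the block counter by the number of whole 64-byte blocks actually produced, capped at the kernel's width. Worked example: a 200-byte tail on the 4-block SSSE3 path gives min(200, 4 * 64) = 200 bytes, rounded up to 256, i.e. 4 blocks, so state[12] advances by 4.)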
-EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) -{ - chacha_init_generic(state, key, iv); -} -EXPORT_SYMBOL(chacha_init_arch); - -void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, - int nrounds) -{ - if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() || - bytes <= CHACHA_BLOCK_SIZE) - return chacha_crypt_generic(state, dst, src, bytes, nrounds); - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - kernel_fpu_begin(); - chacha_dosimd(state, dst, src, todo, nrounds); - kernel_fpu_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -static int chacha_simd_stream_xor(struct skcipher_request *req, - const struct chacha_ctx *ctx, const u8 *iv) -{ - u32 state[CHACHA_STATE_WORDS] __aligned(8); - struct skcipher_walk walk; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - chacha_init_generic(state, ctx->key, iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - if (!static_branch_likely(&chacha_use_simd) || - !crypto_simd_usable()) { - chacha_crypt_generic(state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, - ctx->nrounds); - } else { - kernel_fpu_begin(); - chacha_dosimd(state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, - ctx->nrounds); - kernel_fpu_end(); - } - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - - return err; -} - -static int chacha_simd(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - - return chacha_simd_stream_xor(req, ctx, req->iv); -} - -static int xchacha_simd(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 state[CHACHA_STATE_WORDS] __aligned(8); - struct chacha_ctx subctx; - u8 real_iv[16]; - - chacha_init_generic(state, ctx->key, req->iv); - - if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) { - kernel_fpu_begin(); - hchacha_block_ssse3(state, subctx.key, ctx->nrounds); - kernel_fpu_end(); - } else { - hchacha_block_generic(state, subctx.key, ctx->nrounds); - } - subctx.nrounds = ctx->nrounds; - - memcpy(&real_iv[0], req->iv + 24, 8); - memcpy(&real_iv[8], req->iv + 16, 8); - return chacha_simd_stream_xor(req, &subctx, real_iv); -} - -static struct skcipher_alg algs[] = { - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-simd", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = chacha_simd, - .decrypt = chacha_simd, - }, { - .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-simd", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = xchacha_simd, - .decrypt = xchacha_simd, - }, { - .base.cra_name = "xchacha12", - .base.cra_driver_name = "xchacha12-simd", - .base.cra_priority = 300, - .base.cra_blocksize 
= 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha12_setkey, - .encrypt = xchacha_simd, - .decrypt = xchacha_simd, - }, -}; - -static int __init chacha_simd_mod_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_SSSE3)) - return 0; - - static_branch_enable(&chacha_use_simd); - - if (boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - static_branch_enable(&chacha_use_avx2); - - if (IS_ENABLED(CONFIG_AS_AVX512) && - boot_cpu_has(X86_FEATURE_AVX512VL) && - boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ - static_branch_enable(&chacha_use_avx512vl); - } - return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ? - crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0; -} - -static void __exit chacha_simd_mod_fini(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) && boot_cpu_has(X86_FEATURE_SSSE3)) - crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); -} - -module_init(chacha_simd_mod_init); -module_exit(chacha_simd_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); -MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)"); -MODULE_ALIAS_CRYPTO("chacha20"); -MODULE_ALIAS_CRYPTO("chacha20-simd"); -MODULE_ALIAS_CRYPTO("xchacha20"); -MODULE_ALIAS_CRYPTO("xchacha20-simd"); -MODULE_ALIAS_CRYPTO("xchacha12"); -MODULE_ALIAS_CRYPTO("xchacha12-simd"); diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S deleted file mode 100644 index 5d31137e2c7d..000000000000 --- a/arch/x86/crypto/crc32-pclmul_asm.S +++ /dev/null @@ -1,218 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2012 Xyratex Technology Limited - * - * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 - * calculation. 
- * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) - * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found - * at: - * http://www.intel.com/products/processor/manuals/ - * Intel(R) 64 and IA-32 Architectures Software Developer's Manual - * Volume 2B: Instruction Set Reference, N-Z - * - * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com> - * Alexander Boyko <Alexander_Boyko@xyratex.com> - */ - -#include <linux/linkage.h> - - -.section .rodata -.align 16 -/* - * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 - * #define CONSTANT_R1 0x154442bd4LL - * - * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 - * #define CONSTANT_R2 0x1c6e41596LL - */ -.Lconstant_R2R1: - .octa 0x00000001c6e415960000000154442bd4 -/* - * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 - * #define CONSTANT_R3 0x1751997d0LL - * - * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e - * #define CONSTANT_R4 0x0ccaa009eLL - */ -.Lconstant_R4R3: - .octa 0x00000000ccaa009e00000001751997d0 -/* - * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 - * #define CONSTANT_R5 0x163cd6124LL - */ -.Lconstant_R5: - .octa 0x00000000000000000000000163cd6124 -.Lconstant_mask32: - .octa 0x000000000000000000000000FFFFFFFF -/* - * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL - * - * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL - * #define CONSTANT_RU 0x1F7011641LL - */ -.Lconstant_RUpoly: - .octa 0x00000001F701164100000001DB710641 - -#define CONSTANT %xmm0 - -#ifdef __x86_64__ -#define BUF %rdi -#define LEN %rsi -#define CRC %edx -#else -#define BUF %eax -#define LEN %edx -#define CRC %ecx -#endif - - - -.text -/** - * Calculate crc32 - * BUF - buffer (16 bytes aligned) - * LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63 - * CRC - initial crc32 - * return %eax crc32 - * uint crc32_pclmul_le_16(unsigned char const *buffer, - * size_t len, uint crc32) - */ - -SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ - movdqa (BUF), %xmm1 - movdqa 0x10(BUF), %xmm2 - movdqa 0x20(BUF), %xmm3 - movdqa 0x30(BUF), %xmm4 - movd CRC, CONSTANT - pxor CONSTANT, %xmm1 - sub $0x40, LEN - add $0x40, BUF - cmp $0x40, LEN - jb .Lless_64 - -#ifdef __x86_64__ - movdqa .Lconstant_R2R1(%rip), CONSTANT -#else - movdqa .Lconstant_R2R1, CONSTANT -#endif - -.Lloop_64:/* 64 bytes Full cache line folding */ - prefetchnta 0x40(BUF) - movdqa %xmm1, %xmm5 - movdqa %xmm2, %xmm6 - movdqa %xmm3, %xmm7 -#ifdef __x86_64__ - movdqa %xmm4, %xmm8 -#endif - pclmulqdq $0x00, CONSTANT, %xmm1 - pclmulqdq $0x00, CONSTANT, %xmm2 - pclmulqdq $0x00, CONSTANT, %xmm3 -#ifdef __x86_64__ - pclmulqdq $0x00, CONSTANT, %xmm4 -#endif - pclmulqdq $0x11, CONSTANT, %xmm5 - pclmulqdq $0x11, CONSTANT, %xmm6 - pclmulqdq $0x11, CONSTANT, %xmm7 -#ifdef __x86_64__ - pclmulqdq $0x11, CONSTANT, %xmm8 -#endif - pxor %xmm5, %xmm1 - pxor %xmm6, %xmm2 - pxor %xmm7, %xmm3 -#ifdef __x86_64__ - pxor %xmm8, %xmm4 -#else - /* xmm8 unsupported for x32 */ - movdqa %xmm4, %xmm5 - pclmulqdq $0x00, CONSTANT, %xmm4 - pclmulqdq $0x11, CONSTANT, %xmm5 - pxor %xmm5, %xmm4 -#endif - - pxor (BUF), %xmm1 - pxor 0x10(BUF), %xmm2 - pxor 0x20(BUF), %xmm3 - pxor 0x30(BUF), %xmm4 - - sub $0x40, LEN - add $0x40, BUF - cmp $0x40, LEN - jge .Lloop_64 -.Lless_64:/* Folding cache line into 128bit */ -#ifdef __x86_64__ - movdqa .Lconstant_R4R3(%rip), CONSTANT -#else - movdqa .Lconstant_R4R3, CONSTANT -#endif - prefetchnta (BUF) - - movdqa %xmm1, %xmm5 - pclmulqdq $0x00, CONSTANT, %xmm1 - pclmulqdq $0x11, CONSTANT, %xmm5 - pxor %xmm5, %xmm1 - 
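(Each 16-byte folding step in the loops above and below is a pair of carry-less multiplies plus XORs: the low and high 64-bit halves of the accumulator are multiplied by the two precomputed fold constants and the results are folded into the next block of input. A hedged C intrinsics sketch of one such step, assuming <wmmintrin.h> for PCLMUL:

#include <wmmintrin.h>

/* One 128-bit fold: acc.lo64 * consts.lo64  XOR  acc.hi64 * consts.hi64
 * XOR the next 16 input bytes -- the PCLMULQDQ $0x00/$0x11 + PXOR
 * pattern used in .Lloop_64 and .Lloop_16. */
static __m128i fold_16(__m128i acc, __m128i consts, __m128i next)
{
	__m128i lo = _mm_clmulepi64_si128(acc, consts, 0x00);
	__m128i hi = _mm_clmulepi64_si128(acc, consts, 0x11);

	return _mm_xor_si128(_mm_xor_si128(lo, hi), next);
}
)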
pxor %xmm2, %xmm1 - - movdqa %xmm1, %xmm5 - pclmulqdq $0x00, CONSTANT, %xmm1 - pclmulqdq $0x11, CONSTANT, %xmm5 - pxor %xmm5, %xmm1 - pxor %xmm3, %xmm1 - - movdqa %xmm1, %xmm5 - pclmulqdq $0x00, CONSTANT, %xmm1 - pclmulqdq $0x11, CONSTANT, %xmm5 - pxor %xmm5, %xmm1 - pxor %xmm4, %xmm1 - - cmp $0x10, LEN - jb .Lfold_64 -.Lloop_16:/* Folding rest buffer into 128bit */ - movdqa %xmm1, %xmm5 - pclmulqdq $0x00, CONSTANT, %xmm1 - pclmulqdq $0x11, CONSTANT, %xmm5 - pxor %xmm5, %xmm1 - pxor (BUF), %xmm1 - sub $0x10, LEN - add $0x10, BUF - cmp $0x10, LEN - jge .Lloop_16 - -.Lfold_64: - /* perform the last 64 bit fold, also adds 32 zeroes - * to the input stream */ - pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ - psrldq $0x08, %xmm1 - pxor CONSTANT, %xmm1 - - /* final 32-bit fold */ - movdqa %xmm1, %xmm2 -#ifdef __x86_64__ - movdqa .Lconstant_R5(%rip), CONSTANT - movdqa .Lconstant_mask32(%rip), %xmm3 -#else - movdqa .Lconstant_R5, CONSTANT - movdqa .Lconstant_mask32, %xmm3 -#endif - psrldq $0x04, %xmm2 - pand %xmm3, %xmm1 - pclmulqdq $0x00, CONSTANT, %xmm1 - pxor %xmm2, %xmm1 - - /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ -#ifdef __x86_64__ - movdqa .Lconstant_RUpoly(%rip), CONSTANT -#else - movdqa .Lconstant_RUpoly, CONSTANT -#endif - movdqa %xmm1, %xmm2 - pand %xmm3, %xmm1 - pclmulqdq $0x10, CONSTANT, %xmm1 - pand %xmm3, %xmm1 - pclmulqdq $0x00, CONSTANT, %xmm1 - pxor %xmm2, %xmm1 - pextrd $0x01, %xmm1, %eax - - RET -SYM_FUNC_END(crc32_pclmul_le_16) diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c deleted file mode 100644 index 98cf3b4e4c9f..000000000000 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ /dev/null @@ -1,201 +0,0 @@ -/* GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see http://www.gnu.org/licenses - * - * Please visit http://www.xyratex.com/contact if you need additional - * information or have any questions. - * - * GPL HEADER END - */ - -/* - * Copyright 2012 Xyratex Technology Limited - * - * Wrappers for kernel crypto shash api to pclmulqdq crc32 implementation. 
- */ -#include <linux/init.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/crc32.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> - -#include <asm/cpufeatures.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> - -#define CHKSUM_BLOCK_SIZE 1 -#define CHKSUM_DIGEST_SIZE 4 - -#define PCLMUL_MIN_LEN 64L /* minimum size of buffer - * for crc32_pclmul_le_16 */ -#define SCALE_F 16L /* size of xmm register */ -#define SCALE_F_MASK (SCALE_F - 1) - -u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32); - -static u32 __attribute__((pure)) - crc32_pclmul_le(u32 crc, unsigned char const *p, size_t len) -{ - unsigned int iquotient; - unsigned int iremainder; - unsigned int prealign; - - if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !crypto_simd_usable()) - return crc32_le(crc, p, len); - - if ((long)p & SCALE_F_MASK) { - /* align p to 16 byte */ - prealign = SCALE_F - ((long)p & SCALE_F_MASK); - - crc = crc32_le(crc, p, prealign); - len -= prealign; - p = (unsigned char *)(((unsigned long)p + SCALE_F_MASK) & - ~SCALE_F_MASK); - } - iquotient = len & (~SCALE_F_MASK); - iremainder = len & SCALE_F_MASK; - - kernel_fpu_begin(); - crc = crc32_pclmul_le_16(p, iquotient, crc); - kernel_fpu_end(); - - if (iremainder) - crc = crc32_le(crc, p + iquotient, iremainder); - - return crc; -} - -static int crc32_pclmul_cra_init(struct crypto_tfm *tfm) -{ - u32 *key = crypto_tfm_ctx(tfm); - - *key = 0; - - return 0; -} - -static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key, - unsigned int keylen) -{ - u32 *mctx = crypto_shash_ctx(hash); - - if (keylen != sizeof(u32)) - return -EINVAL; - *mctx = le32_to_cpup((__le32 *)key); - return 0; -} - -static int crc32_pclmul_init(struct shash_desc *desc) -{ - u32 *mctx = crypto_shash_ctx(desc->tfm); - u32 *crcp = shash_desc_ctx(desc); - - *crcp = *mctx; - - return 0; -} - -static int crc32_pclmul_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - u32 *crcp = shash_desc_ctx(desc); - - *crcp = crc32_pclmul_le(*crcp, data, len); - return 0; -} - -/* No final XOR 0xFFFFFFFF, like crc32_le */ -static int __crc32_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len, - u8 *out) -{ - *(__le32 *)out = cpu_to_le32(crc32_pclmul_le(*crcp, data, len)); - return 0; -} - -static int crc32_pclmul_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32_pclmul_finup(shash_desc_ctx(desc), data, len, out); -} - -static int crc32_pclmul_final(struct shash_desc *desc, u8 *out) -{ - u32 *crcp = shash_desc_ctx(desc); - - *(__le32 *)out = cpu_to_le32p(crcp); - return 0; -} - -static int crc32_pclmul_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len, - out); -} - -static struct shash_alg alg = { - .setkey = crc32_pclmul_setkey, - .init = crc32_pclmul_init, - .update = crc32_pclmul_update, - .final = crc32_pclmul_final, - .finup = crc32_pclmul_finup, - .digest = crc32_pclmul_digest, - .descsize = sizeof(u32), - .digestsize = CHKSUM_DIGEST_SIZE, - .base = { - .cra_name = "crc32", - .cra_driver_name = "crc32-pclmul", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .cra_blocksize = CHKSUM_BLOCK_SIZE, - .cra_ctxsize = sizeof(u32), - .cra_module = THIS_MODULE, - .cra_init = crc32_pclmul_cra_init, - } -}; - -static const struct x86_cpu_id crc32pclmul_cpu_id[] = { - X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), - 
{} -}; -MODULE_DEVICE_TABLE(x86cpu, crc32pclmul_cpu_id); - - -static int __init crc32_pclmul_mod_init(void) -{ - - if (!x86_match_cpu(crc32pclmul_cpu_id)) { - pr_info("PCLMULQDQ-NI instructions are not detected.\n"); - return -ENODEV; - } - return crypto_register_shash(&alg); -} - -static void __exit crc32_pclmul_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(crc32_pclmul_mod_init); -module_exit(crc32_pclmul_mod_fini); - -MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>"); -MODULE_LICENSE("GPL"); - -MODULE_ALIAS_CRYPTO("crc32"); -MODULE_ALIAS_CRYPTO("crc32-pclmul"); diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c deleted file mode 100644 index feccb5254c7e..000000000000 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ /dev/null @@ -1,250 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal. - * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE) - * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at: - * http://www.intel.com/products/processor/manuals/ - * Intel(R) 64 and IA-32 Architectures Software Developer's Manual - * Volume 2A: Instruction Set Reference, A-M - * - * Copyright (C) 2008 Intel Corporation - * Authors: Austin Zhang <austin_zhang@linux.intel.com> - * Kent Liu <kent.liu@intel.com> - */ -#include <linux/init.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> - -#include <asm/cpufeatures.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> - -#define CHKSUM_BLOCK_SIZE 1 -#define CHKSUM_DIGEST_SIZE 4 - -#define SCALE_F sizeof(unsigned long) - -#ifdef CONFIG_X86_64 -#define CRC32_INST "crc32q %1, %q0" -#else -#define CRC32_INST "crc32l %1, %0" -#endif - -#ifdef CONFIG_X86_64 -/* - * use carryless multiply version of crc32c when buffer - * size is >= 512 to account - * for fpu state save/restore overhead. - */ -#define CRC32C_PCL_BREAKEVEN 512 - -asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, - unsigned int crc_init); -#endif /* CONFIG_X86_64 */ - -static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) -{ - while (length--) { - asm("crc32b %1, %0" - : "+r" (crc) : "rm" (*data)); - data++; - } - - return crc; -} - -static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len) -{ - unsigned int iquotient = len / SCALE_F; - unsigned int iremainder = len % SCALE_F; - unsigned long *ptmp = (unsigned long *)p; - - while (iquotient--) { - asm(CRC32_INST - : "+r" (crc) : "rm" (*ptmp)); - ptmp++; - } - - if (iremainder) - crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp, - iremainder); - - return crc; -} - -/* - * Setting the seed allows arbitrary accumulators and flexible XOR policy - * If your algorithm starts with ~0, then XOR with ~0 before you set - * the seed. 
- */ -static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key, - unsigned int keylen) -{ - u32 *mctx = crypto_shash_ctx(hash); - - if (keylen != sizeof(u32)) - return -EINVAL; - *mctx = le32_to_cpup((__le32 *)key); - return 0; -} - -static int crc32c_intel_init(struct shash_desc *desc) -{ - u32 *mctx = crypto_shash_ctx(desc->tfm); - u32 *crcp = shash_desc_ctx(desc); - - *crcp = *mctx; - - return 0; -} - -static int crc32c_intel_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - u32 *crcp = shash_desc_ctx(desc); - - *crcp = crc32c_intel_le_hw(*crcp, data, len); - return 0; -} - -static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len, - u8 *out) -{ - *(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len)); - return 0; -} - -static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out); -} - -static int crc32c_intel_final(struct shash_desc *desc, u8 *out) -{ - u32 *crcp = shash_desc_ctx(desc); - - *(__le32 *)out = ~cpu_to_le32p(crcp); - return 0; -} - -static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len, - out); -} - -static int crc32c_intel_cra_init(struct crypto_tfm *tfm) -{ - u32 *key = crypto_tfm_ctx(tfm); - - *key = ~0; - - return 0; -} - -#ifdef CONFIG_X86_64 -static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - u32 *crcp = shash_desc_ctx(desc); - - /* - * use faster PCL version if datasize is large enough to - * overcome kernel fpu state save/restore overhead - */ - if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) { - kernel_fpu_begin(); - *crcp = crc_pcl(data, len, *crcp); - kernel_fpu_end(); - } else - *crcp = crc32c_intel_le_hw(*crcp, data, len); - return 0; -} - -static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, - u8 *out) -{ - if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) { - kernel_fpu_begin(); - *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); - kernel_fpu_end(); - } else - *(__le32 *)out = - ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len)); - return 0; -} - -static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out); -} - -static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len, - out); -} -#endif /* CONFIG_X86_64 */ - -static struct shash_alg alg = { - .setkey = crc32c_intel_setkey, - .init = crc32c_intel_init, - .update = crc32c_intel_update, - .final = crc32c_intel_final, - .finup = crc32c_intel_finup, - .digest = crc32c_intel_digest, - .descsize = sizeof(u32), - .digestsize = CHKSUM_DIGEST_SIZE, - .base = { - .cra_name = "crc32c", - .cra_driver_name = "crc32c-intel", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .cra_blocksize = CHKSUM_BLOCK_SIZE, - .cra_ctxsize = sizeof(u32), - .cra_module = THIS_MODULE, - .cra_init = crc32c_intel_cra_init, - } -}; - -static const struct x86_cpu_id crc32c_cpu_id[] = { - X86_MATCH_FEATURE(X86_FEATURE_XMM4_2, NULL), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, crc32c_cpu_id); - -static int __init crc32c_intel_mod_init(void) -{ - if (!x86_match_cpu(crc32c_cpu_id)) - return -ENODEV; 
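(On the seeding convention described in the comment above crc32c_intel_setkey(): since a standard crc32c digest is inverted on output and setkey expects little-endian key bytes, continuing a previous stream means undoing the inversion first. A hedged usage sketch, where tfm and prev_digest are hypothetical and not from this file:

/* Continue a crc32c stream from a previously produced digest. */
u32 crc = ~le32_to_cpu(prev_digest);	/* undo the output inversion */
__le32 seed = cpu_to_le32(crc);		/* setkey reads LE key bytes */
err = crypto_shash_setkey(tfm, (const u8 *)&seed, sizeof(seed));
)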
-#ifdef CONFIG_X86_64 - if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { - alg.update = crc32c_pcl_intel_update; - alg.finup = crc32c_pcl_intel_finup; - alg.digest = crc32c_pcl_intel_digest; - } -#endif - return crypto_register_shash(&alg); -} - -static void __exit crc32c_intel_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(crc32c_intel_mod_init); -module_exit(crc32c_intel_mod_fini); - -MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>"); -MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware."); -MODULE_LICENSE("GPL"); - -MODULE_ALIAS_CRYPTO("crc32c"); -MODULE_ALIAS_CRYPTO("crc32c-intel"); diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S deleted file mode 100644 index bbcff1fb78cb..000000000000 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ /dev/null @@ -1,463 +0,0 @@ -/* - * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) - * - * The white papers on CRC32C calculations with PCLMULQDQ instruction can be - * downloaded from: - * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf - * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf - * - * Copyright (C) 2012 Intel Corporation. - * - * Authors: - * Wajdi Feghali <wajdi.k.feghali@intel.com> - * James Guilford <james.guilford@intel.com> - * David Cote <david.m.cote@intel.com> - * Tim Chen <tim.c.chen@linux.intel.com> - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include <linux/linkage.h> -#include <asm/nospec-branch.h> - -## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction - -.macro LABEL prefix n -.L\prefix\n\(): -.endm - -.macro JMPTBL_ENTRY i -.quad .Lcrc_\i -.endm - -.macro JNC_LESS_THAN j - jnc .Lless_than_\j -.endm - -# Define threshold where buffers are considered "small" and routed to more -# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so -# SMALL_SIZE can be no larger than 255. 
- -#define SMALL_SIZE 200 - -.if (SMALL_SIZE > 255) -.error "SMALL_ SIZE must be < 256" -.endif - -# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); - -.text -SYM_FUNC_START(crc_pcl) -#define bufp rdi -#define bufp_dw %edi -#define bufp_w %di -#define bufp_b %dil -#define bufptmp %rcx -#define block_0 %rcx -#define block_1 %rdx -#define block_2 %r11 -#define len %rsi -#define len_dw %esi -#define len_w %si -#define len_b %sil -#define crc_init_arg %rdx -#define tmp %rbx -#define crc_init %r8 -#define crc_init_dw %r8d -#define crc1 %r9 -#define crc2 %r10 - - pushq %rbx - pushq %rdi - pushq %rsi - - ## Move crc_init for Linux to a different - mov crc_init_arg, crc_init - - ################################################################ - ## 1) ALIGN: - ################################################################ - - mov %bufp, bufptmp # rdi = *buf - neg %bufp - and $7, %bufp # calculate the unalignment amount of - # the address - je .Lproc_block # Skip if aligned - - ## If len is less than 8 and we're unaligned, we need to jump - ## to special code to avoid reading beyond the end of the buffer - cmp $8, len - jae .Ldo_align - # less_than_8 expects length in upper 3 bits of len_dw - # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] - shl $32-3+1, len_dw - jmp .Lless_than_8_post_shl1 - -.Ldo_align: - #### Calculate CRC of unaligned bytes of the buffer (if any) - movq (bufptmp), tmp # load a quadward from the buffer - add %bufp, bufptmp # align buffer pointer for quadword - # processing - sub %bufp, len # update buffer length -.Lalign_loop: - crc32b %bl, crc_init_dw # compute crc32 of 1-byte - shr $8, tmp # get next byte - dec %bufp - jne .Lalign_loop - -.Lproc_block: - - ################################################################ - ## 2) PROCESS BLOCKS: - ################################################################ - - ## compute num of bytes to be processed - movq len, tmp # save num bytes in tmp - - cmpq $128*24, len - jae .Lfull_block - -.Lcontinue_block: - cmpq $SMALL_SIZE, len - jb .Lsmall - - ## len < 128*24 - movq $2731, %rax # 2731 = ceil(2^16 / 24) - mul len_dw - shrq $16, %rax - - ## eax contains floor(bytes / 24) = num 24-byte chunks to do - - ## process rax 24-byte chunks (128 >= rax >= 0) - - ## compute end address of each block - ## block 0 (base addr + RAX * 8) - ## block 1 (base addr + RAX * 16) - ## block 2 (base addr + RAX * 24) - lea (bufptmp, %rax, 8), block_0 - lea (block_0, %rax, 8), block_1 - lea (block_1, %rax, 8), block_2 - - xor crc1, crc1 - xor crc2, crc2 - - ## branch into array - leaq jump_table(%rip), %bufp - mov (%bufp,%rax,8), %bufp - JMP_NOSPEC bufp - - ################################################################ - ## 2a) PROCESS FULL BLOCKS: - ################################################################ -.Lfull_block: - movl $128,%eax - lea 128*8*2(block_0), block_1 - lea 128*8*3(block_0), block_2 - add $128*8*1, block_0 - - xor crc1,crc1 - xor crc2,crc2 - - # Fall through into top of crc array (crc_128) - - ################################################################ - ## 3) CRC Array: - ################################################################ - - i=128 -.rept 128-1 -.altmacro -LABEL crc_ %i -.noaltmacro - ENDBR - crc32q -i*8(block_0), crc_init - crc32q -i*8(block_1), crc1 - crc32q -i*8(block_2), crc2 - i=(i-1) -.endr - -.altmacro -LABEL crc_ %i -.noaltmacro - ENDBR - crc32q -i*8(block_0), crc_init - crc32q -i*8(block_1), crc1 -# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet - - 
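(The crc array above runs three independent CRC32 dependency chains so the crc32q latency overlaps; the combine step below then uses PCLMULQDQ and the K_table constants to shift the first two partial CRCs over the data the later blocks covered. A hedged scalar sketch of that structure -- blk0/blk1/blk2, n and combine3() are illustrative names only, and the real code also special-cases the last word of block 2 -- assuming <nmmintrin.h> for _mm_crc32_u64:

#include <nmmintrin.h>
#include <stdint.h>

/* Hypothetical stand-in for the PCLMULQDQ merge performed below. */
uint32_t combine3(uint64_t c0, uint64_t c1, uint64_t c2, int n);

static uint32_t crc32c_3way(uint32_t crc_init, const uint64_t *blk0,
			    const uint64_t *blk1, const uint64_t *blk2,
			    int n)
{
	uint64_t crc0 = crc_init, crc1 = 0, crc2 = 0;

	/* Three independent chains, one crc32q each per iteration. */
	for (int i = 0; i < n; i++) {
		crc0 = _mm_crc32_u64(crc0, blk0[i]);
		crc1 = _mm_crc32_u64(crc1, blk1[i]);
		crc2 = _mm_crc32_u64(crc2, blk2[i]);
	}
	return combine3(crc0, crc1, crc2, n);
}
)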
mov block_2, block_0 - - ################################################################ - ## 4) Combine three results: - ################################################################ - - lea (K_table-8)(%rip), %bufp # first entry is for idx 1 - shlq $3, %rax # rax *= 8 - pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 - leal (%eax,%eax,2), %eax # rax *= 3 (total *24) - subq %rax, tmp # tmp -= rax*24 - - movq crc_init, %xmm1 # CRC for block 1 - pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 - - movq crc1, %xmm2 # CRC for block 2 - pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 - - pxor %xmm2,%xmm1 - movq %xmm1, %rax - xor -i*8(block_2), %rax - mov crc2, crc_init - crc32 %rax, crc_init - - ################################################################ - ## 5) Check for end: - ################################################################ - -LABEL crc_ 0 - ENDBR - mov tmp, len - cmp $128*24, tmp - jae .Lfull_block - cmp $24, tmp - jae .Lcontinue_block - -.Lless_than_24: - shl $32-4, len_dw # less_than_16 expects length - # in upper 4 bits of len_dw - jnc .Lless_than_16 - crc32q (bufptmp), crc_init - crc32q 8(bufptmp), crc_init - jz .Ldo_return - add $16, bufptmp - # len is less than 8 if we got here - # less_than_8 expects length in upper 3 bits of len_dw - # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] - shl $2, len_dw - jmp .Lless_than_8_post_shl1 - - ####################################################################### - ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) - ####################################################################### -.Lsmall: - shl $32-8, len_dw # Prepare len_dw for less_than_256 - j=256 -.rept 5 # j = {256, 128, 64, 32, 16} -.altmacro -LABEL less_than_ %j # less_than_j: Length should be in - # upper lg(j) bits of len_dw - j=(j/2) - shl $1, len_dw # Get next MSB - JNC_LESS_THAN %j -.noaltmacro - i=0 -.rept (j/8) - crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data - i=i+8 -.endr - jz .Ldo_return # Return if remaining length is zero - add $j, bufptmp # Advance buf -.endr - -.Lless_than_8: # Length should be stored in - # upper 3 bits of len_dw - shl $1, len_dw -.Lless_than_8_post_shl1: - jnc .Lless_than_4 - crc32l (bufptmp), crc_init_dw # CRC of 4 bytes - jz .Ldo_return # return if remaining data is zero - add $4, bufptmp -.Lless_than_4: # Length should be stored in - # upper 2 bits of len_dw - shl $1, len_dw - jnc .Lless_than_2 - crc32w (bufptmp), crc_init_dw # CRC of 2 bytes - jz .Ldo_return # return if remaining data is zero - add $2, bufptmp -.Lless_than_2: # Length should be stored in the MSB - # of len_dw - shl $1, len_dw - jnc .Lless_than_1 - crc32b (bufptmp), crc_init_dw # CRC of 1 byte -.Lless_than_1: # Length should be zero -.Ldo_return: - movq crc_init, %rax - popq %rsi - popq %rdi - popq %rbx - RET -SYM_FUNC_END(crc_pcl) - -.section .rodata, "a", @progbits - ################################################################ - ## jump table Table is 129 entries x 2 bytes each - ################################################################ -.align 4 -jump_table: - i=0 -.rept 129 -.altmacro -JMPTBL_ENTRY %i -.noaltmacro - i=i+1 -.endr - - - ################################################################ - ## PCLMULQDQ tables - ## Table is 128 entries x 2 words (8 bytes) each - ################################################################ -.align 8 -K_table: - .long 0x493c7d27, 0x00000001 - .long 0xba4fc28e, 0x493c7d27 - .long 0xddc0152b, 0xf20c0dfe - .long 0x9e4addf8, 0xba4fc28e - .long 
0x39d3b296, 0x3da6d0cb - .long 0x0715ce53, 0xddc0152b - .long 0x47db8317, 0x1c291d04 - .long 0x0d3b6092, 0x9e4addf8 - .long 0xc96cfdc0, 0x740eef02 - .long 0x878a92a7, 0x39d3b296 - .long 0xdaece73e, 0x083a6eec - .long 0xab7aff2a, 0x0715ce53 - .long 0x2162d385, 0xc49f4f67 - .long 0x83348832, 0x47db8317 - .long 0x299847d5, 0x2ad91c30 - .long 0xb9e02b86, 0x0d3b6092 - .long 0x18b33a4e, 0x6992cea2 - .long 0xb6dd949b, 0xc96cfdc0 - .long 0x78d9ccb7, 0x7e908048 - .long 0xbac2fd7b, 0x878a92a7 - .long 0xa60ce07b, 0x1b3d8f29 - .long 0xce7f39f4, 0xdaece73e - .long 0x61d82e56, 0xf1d0f55e - .long 0xd270f1a2, 0xab7aff2a - .long 0xc619809d, 0xa87ab8a8 - .long 0x2b3cac5d, 0x2162d385 - .long 0x65863b64, 0x8462d800 - .long 0x1b03397f, 0x83348832 - .long 0xebb883bd, 0x71d111a8 - .long 0xb3e32c28, 0x299847d5 - .long 0x064f7f26, 0xffd852c6 - .long 0xdd7e3b0c, 0xb9e02b86 - .long 0xf285651c, 0xdcb17aa4 - .long 0x10746f3c, 0x18b33a4e - .long 0xc7a68855, 0xf37c5aee - .long 0x271d9844, 0xb6dd949b - .long 0x8e766a0c, 0x6051d5a2 - .long 0x93a5f730, 0x78d9ccb7 - .long 0x6cb08e5c, 0x18b0d4ff - .long 0x6b749fb2, 0xbac2fd7b - .long 0x1393e203, 0x21f3d99c - .long 0xcec3662e, 0xa60ce07b - .long 0x96c515bb, 0x8f158014 - .long 0xe6fc4e6a, 0xce7f39f4 - .long 0x8227bb8a, 0xa00457f7 - .long 0xb0cd4768, 0x61d82e56 - .long 0x39c7ff35, 0x8d6d2c43 - .long 0xd7a4825c, 0xd270f1a2 - .long 0x0ab3844b, 0x00ac29cf - .long 0x0167d312, 0xc619809d - .long 0xf6076544, 0xe9adf796 - .long 0x26f6a60a, 0x2b3cac5d - .long 0xa741c1bf, 0x96638b34 - .long 0x98d8d9cb, 0x65863b64 - .long 0x49c3cc9c, 0xe0e9f351 - .long 0x68bce87a, 0x1b03397f - .long 0x57a3d037, 0x9af01f2d - .long 0x6956fc3b, 0xebb883bd - .long 0x42d98888, 0x2cff42cf - .long 0x3771e98f, 0xb3e32c28 - .long 0xb42ae3d9, 0x88f25a3a - .long 0x2178513a, 0x064f7f26 - .long 0xe0ac139e, 0x4e36f0b0 - .long 0x170076fa, 0xdd7e3b0c - .long 0x444dd413, 0xbd6f81f8 - .long 0x6f345e45, 0xf285651c - .long 0x41d17b64, 0x91c9bd4b - .long 0xff0dba97, 0x10746f3c - .long 0xa2b73df1, 0x885f087b - .long 0xf872e54c, 0xc7a68855 - .long 0x1e41e9fc, 0x4c144932 - .long 0x86d8e4d2, 0x271d9844 - .long 0x651bd98b, 0x52148f02 - .long 0x5bb8f1bc, 0x8e766a0c - .long 0xa90fd27a, 0xa3c6f37a - .long 0xb3af077a, 0x93a5f730 - .long 0x4984d782, 0xd7c0557f - .long 0xca6ef3ac, 0x6cb08e5c - .long 0x234e0b26, 0x63ded06a - .long 0xdd66cbbb, 0x6b749fb2 - .long 0x4597456a, 0x4d56973c - .long 0xe9e28eb4, 0x1393e203 - .long 0x7b3ff57a, 0x9669c9df - .long 0xc9c8b782, 0xcec3662e - .long 0x3f70cc6f, 0xe417f38a - .long 0x93e106a4, 0x96c515bb - .long 0x62ec6c6d, 0x4b9e0f71 - .long 0xd813b325, 0xe6fc4e6a - .long 0x0df04680, 0xd104b8fc - .long 0x2342001e, 0x8227bb8a - .long 0x0a2a8d7e, 0x5b397730 - .long 0x6d9a4957, 0xb0cd4768 - .long 0xe8b6368b, 0xe78eb416 - .long 0xd2c3ed1a, 0x39c7ff35 - .long 0x995a5724, 0x61ff0e01 - .long 0x9ef68d35, 0xd7a4825c - .long 0x0c139b31, 0x8d96551c - .long 0xf2271e60, 0x0ab3844b - .long 0x0b0bf8ca, 0x0bf80dd2 - .long 0x2664fd8b, 0x0167d312 - .long 0xed64812d, 0x8821abed - .long 0x02ee03b2, 0xf6076544 - .long 0x8604ae0f, 0x6a45d2b2 - .long 0x363bd6b3, 0x26f6a60a - .long 0x135c83fd, 0xd8d26619 - .long 0x5fabe670, 0xa741c1bf - .long 0x35ec3279, 0xde87806c - .long 0x00bcf5f6, 0x98d8d9cb - .long 0x8ae00689, 0x14338754 - .long 0x17f27698, 0x49c3cc9c - .long 0x58ca5f00, 0x5bd2011f - .long 0xaa7c7ad5, 0x68bce87a - .long 0xb5cfca28, 0xdd07448e - .long 0xded288f8, 0x57a3d037 - .long 0x59f229bc, 0xdde8f5b9 - .long 0x6d390dec, 0x6956fc3b - .long 0x37170390, 0xa3e3e02c - .long 0x6353c1cc, 0x42d98888 - .long 0xc4584f5c, 
0xd73c7bea - .long 0xf48642e9, 0x3771e98f - .long 0x531377e2, 0x80ff0093 - .long 0xdd35bc8d, 0xb42ae3d9 - .long 0xb25b29f2, 0x8fe4c34d - .long 0x9a5ede41, 0x2178513a - .long 0xa563905d, 0xdf99fc11 - .long 0x45cddf4e, 0xe0ac139e - .long 0xacfa3103, 0x6c23e841 - .long 0xa51b6135, 0x170076fa diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S deleted file mode 100644 index 5286db5b8165..000000000000 --- a/arch/x86/crypto/crct10dif-pcl-asm_64.S +++ /dev/null @@ -1,332 +0,0 @@ -######################################################################## -# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions -# -# Copyright (c) 2013, Intel Corporation -# -# Authors: -# Erdinc Ozturk <erdinc.ozturk@intel.com> -# Vinodh Gopal <vinodh.gopal@intel.com> -# James Guilford <james.guilford@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# * Neither the name of the Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# -# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# Reference paper titled "Fast CRC Computation for Generic -# Polynomials Using PCLMULQDQ Instruction" -# URL: http://www.intel.com/content/dam/www/public/us/en/documents -# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf -# - -#include <linux/linkage.h> - -.text - -#define init_crc %edi -#define buf %rsi -#define len %rdx - -#define FOLD_CONSTS %xmm10 -#define BSWAP_MASK %xmm11 - -# Fold reg1, reg2 into the next 32 data bytes, storing the result back into -# reg1, reg2. 
-.macro fold_32_bytes offset, reg1, reg2 - movdqu \offset(buf), %xmm9 - movdqu \offset+16(buf), %xmm12 - pshufb BSWAP_MASK, %xmm9 - pshufb BSWAP_MASK, %xmm12 - movdqa \reg1, %xmm8 - movdqa \reg2, %xmm13 - pclmulqdq $0x00, FOLD_CONSTS, \reg1 - pclmulqdq $0x11, FOLD_CONSTS, %xmm8 - pclmulqdq $0x00, FOLD_CONSTS, \reg2 - pclmulqdq $0x11, FOLD_CONSTS, %xmm13 - pxor %xmm9 , \reg1 - xorps %xmm8 , \reg1 - pxor %xmm12, \reg2 - xorps %xmm13, \reg2 -.endm - -# Fold src_reg into dst_reg. -.macro fold_16_bytes src_reg, dst_reg - movdqa \src_reg, %xmm8 - pclmulqdq $0x11, FOLD_CONSTS, \src_reg - pclmulqdq $0x00, FOLD_CONSTS, %xmm8 - pxor %xmm8, \dst_reg - xorps \src_reg, \dst_reg -.endm - -# -# u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len); -# -# Assumes len >= 16. -# -SYM_FUNC_START(crc_t10dif_pcl) - - movdqa .Lbswap_mask(%rip), BSWAP_MASK - - # For sizes less than 256 bytes, we can't fold 128 bytes at a time. - cmp $256, len - jl .Lless_than_256_bytes - - # Load the first 128 data bytes. Byte swapping is necessary to make the - # bit order match the polynomial coefficient order. - movdqu 16*0(buf), %xmm0 - movdqu 16*1(buf), %xmm1 - movdqu 16*2(buf), %xmm2 - movdqu 16*3(buf), %xmm3 - movdqu 16*4(buf), %xmm4 - movdqu 16*5(buf), %xmm5 - movdqu 16*6(buf), %xmm6 - movdqu 16*7(buf), %xmm7 - add $128, buf - pshufb BSWAP_MASK, %xmm0 - pshufb BSWAP_MASK, %xmm1 - pshufb BSWAP_MASK, %xmm2 - pshufb BSWAP_MASK, %xmm3 - pshufb BSWAP_MASK, %xmm4 - pshufb BSWAP_MASK, %xmm5 - pshufb BSWAP_MASK, %xmm6 - pshufb BSWAP_MASK, %xmm7 - - # XOR the first 16 data *bits* with the initial CRC value. - pxor %xmm8, %xmm8 - pinsrw $7, init_crc, %xmm8 - pxor %xmm8, %xmm0 - - movdqa .Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS - - # Subtract 128 for the 128 data bytes just consumed. Subtract another - # 128 to simplify the termination condition of the following loop. - sub $256, len - - # While >= 128 data bytes remain (not counting xmm0-7), fold the 128 - # bytes xmm0-7 into them, storing the result back into xmm0-7. -.Lfold_128_bytes_loop: - fold_32_bytes 0, %xmm0, %xmm1 - fold_32_bytes 32, %xmm2, %xmm3 - fold_32_bytes 64, %xmm4, %xmm5 - fold_32_bytes 96, %xmm6, %xmm7 - add $128, buf - sub $128, len - jge .Lfold_128_bytes_loop - - # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7. - - # Fold across 64 bytes. - movdqa .Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS - fold_16_bytes %xmm0, %xmm4 - fold_16_bytes %xmm1, %xmm5 - fold_16_bytes %xmm2, %xmm6 - fold_16_bytes %xmm3, %xmm7 - # Fold across 32 bytes. - movdqa .Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS - fold_16_bytes %xmm4, %xmm6 - fold_16_bytes %xmm5, %xmm7 - # Fold across 16 bytes. - movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS - fold_16_bytes %xmm6, %xmm7 - - # Add 128 to get the correct number of data bytes remaining in 0...127 - # (not counting xmm7), following the previous extra subtraction by 128. - # Then subtract 16 to simplify the termination condition of the - # following loop. - add $128-16, len - - # While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes - # xmm7 into them, storing the result back into xmm7.
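For reference, the fold_16_bytes step above maps directly onto C intrinsics. A minimal sketch (the name fold_16 and the argument order are illustrative only; build with -msse2 -mpclmul):

#include <emmintrin.h>          /* SSE2: _mm_xor_si128 */
#include <wmmintrin.h>          /* PCLMULQDQ: _mm_clmulepi64_si128 */

/* One 128-bit fold: carry-less-multiply the low and high 64-bit halves of
 * src by the matching halves of the fold constants, then XOR both 128-bit
 * products into dst -- the same dataflow as fold_16_bytes above. */
static __m128i fold_16(__m128i src, __m128i dst, __m128i fold_consts)
{
        __m128i hi = _mm_clmulepi64_si128(src, fold_consts, 0x11); /* high*high */
        __m128i lo = _mm_clmulepi64_si128(src, fold_consts, 0x00); /* low*low  */

        return _mm_xor_si128(dst, _mm_xor_si128(hi, lo));
}

Folding across 64, 32, and 16 bytes then amounts to calling this with the successive .Lfold_across_*_bytes_consts pairs loaded into fold_consts.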
- jl .Lfold_16_bytes_loop_done -.Lfold_16_bytes_loop: - movdqa %xmm7, %xmm8 - pclmulqdq $0x11, FOLD_CONSTS, %xmm7 - pclmulqdq $0x00, FOLD_CONSTS, %xmm8 - pxor %xmm8, %xmm7 - movdqu (buf), %xmm0 - pshufb BSWAP_MASK, %xmm0 - pxor %xmm0 , %xmm7 - add $16, buf - sub $16, len - jge .Lfold_16_bytes_loop - -.Lfold_16_bytes_loop_done: - # Add 16 to get the correct number of data bytes remaining in 0...15 - # (not counting xmm7), following the previous extra subtraction by 16. - add $16, len - je .Lreduce_final_16_bytes - -.Lhandle_partial_segment: - # Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16 - # bytes are in xmm7 and the rest are the remaining data in 'buf'. To do - # this without needing a fold constant for each possible 'len', redivide - # the bytes into a first chunk of 'len' bytes and a second chunk of 16 - # bytes, then fold the first chunk into the second. - - movdqa %xmm7, %xmm2 - - # xmm1 = last 16 original data bytes - movdqu -16(buf, len), %xmm1 - pshufb BSWAP_MASK, %xmm1 - - # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes. - lea .Lbyteshift_table+16(%rip), %rax - sub len, %rax - movdqu (%rax), %xmm0 - pshufb %xmm0, %xmm2 - - # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes. - pxor .Lmask1(%rip), %xmm0 - pshufb %xmm0, %xmm7 - - # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes), - # then '16-len' bytes from xmm2 (high-order bytes). - pblendvb %xmm2, %xmm1 #xmm0 is implicit - - # Fold the first chunk into the second chunk, storing the result in xmm7. - movdqa %xmm7, %xmm8 - pclmulqdq $0x11, FOLD_CONSTS, %xmm7 - pclmulqdq $0x00, FOLD_CONSTS, %xmm8 - pxor %xmm8, %xmm7 - pxor %xmm1, %xmm7 - -.Lreduce_final_16_bytes: - # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC - - # Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. - movdqa .Lfinal_fold_consts(%rip), FOLD_CONSTS - - # Fold the high 64 bits into the low 64 bits, while also multiplying by - # x^64. This produces a 128-bit value congruent to x^64 * M(x) and - # whose low 48 bits are 0. - movdqa %xmm7, %xmm0 - pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x)) - pslldq $8, %xmm0 - pxor %xmm0, %xmm7 # + low bits * x^64 - - # Fold the high 32 bits into the low 96 bits. This produces a 96-bit - # value congruent to x^64 * M(x) and whose low 48 bits are 0. - movdqa %xmm7, %xmm0 - pand .Lmask2(%rip), %xmm0 # zero high 32 bits - psrldq $12, %xmm7 # extract high 32 bits - pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x)) - pxor %xmm0, %xmm7 # + low bits - - # Load G(x) and floor(x^48 / G(x)). - movdqa .Lbarrett_reduction_consts(%rip), FOLD_CONSTS - - # Use Barrett reduction to compute the final CRC value. - movdqa %xmm7, %xmm0 - pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x)) - psrlq $32, %xmm7 # /= x^32 - pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # *= G(x) - psrlq $48, %xmm0 - pxor %xmm7, %xmm0 # + low 16 nonzero bits - # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0. - - pextrw $0, %xmm0, %eax - RET - -.align 16 -.Lless_than_256_bytes: - # Checksumming a buffer of length 16...255 bytes - - # Load the first 16 data bytes. - movdqu (buf), %xmm7 - pshufb BSWAP_MASK, %xmm7 - add $16, buf - - # XOR the first 16 data *bits* with the initial CRC value. 
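Every step of the pipeline above, including the Barrett-reduction epilogue, must agree with a bit-serial evaluation of G(x) = 0x18BB7 (MSB-first, no bit reflection). A minimal reference model for cross-checking (the function name is illustrative):

#include <stddef.h>
#include <stdint.h>

/* Bit-at-a-time CRC-T10DIF: shift each message bit into a 16-bit remainder
 * and reduce by G(x) whenever a coefficient of x^16 falls out the top. */
static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf, size_t len)
{
        while (len--) {
                crc ^= (uint16_t)(*buf++) << 8;
                for (int i = 0; i < 8; i++)
                        crc = (crc & 0x8000) ? (uint16_t)((crc << 1) ^ 0x8BB7)
                                             : (uint16_t)(crc << 1);
        }
        return crc;
}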
- pxor %xmm0, %xmm0 - pinsrw $7, init_crc, %xmm0 - pxor %xmm0, %xmm7 - - movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS - cmp $16, len - je .Lreduce_final_16_bytes # len == 16 - sub $32, len - jge .Lfold_16_bytes_loop # 32 <= len <= 255 - add $16, len - jmp .Lhandle_partial_segment # 17 <= len <= 31 -SYM_FUNC_END(crc_t10dif_pcl) - -.section .rodata, "a", @progbits -.align 16 - -# Fold constants precomputed from the polynomial 0x18bb7 -# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 -.Lfold_across_128_bytes_consts: - .quad 0x0000000000006123 # x^(8*128) mod G(x) - .quad 0x0000000000002295 # x^(8*128+64) mod G(x) -.Lfold_across_64_bytes_consts: - .quad 0x0000000000001069 # x^(4*128) mod G(x) - .quad 0x000000000000dd31 # x^(4*128+64) mod G(x) -.Lfold_across_32_bytes_consts: - .quad 0x000000000000857d # x^(2*128) mod G(x) - .quad 0x0000000000007acc # x^(2*128+64) mod G(x) -.Lfold_across_16_bytes_consts: - .quad 0x000000000000a010 # x^(1*128) mod G(x) - .quad 0x0000000000001faa # x^(1*128+64) mod G(x) -.Lfinal_fold_consts: - .quad 0x1368000000000000 # x^48 * (x^48 mod G(x)) - .quad 0x2d56000000000000 # x^48 * (x^80 mod G(x)) -.Lbarrett_reduction_consts: - .quad 0x0000000000018bb7 # G(x) - .quad 0x00000001f65a57f8 # floor(x^48 / G(x)) - -.section .rodata.cst16.mask1, "aM", @progbits, 16 -.align 16 -.Lmask1: - .octa 0x80808080808080808080808080808080 - -.section .rodata.cst16.mask2, "aM", @progbits, 16 -.align 16 -.Lmask2: - .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF - -.section .rodata.cst16.bswap_mask, "aM", @progbits, 16 -.align 16 -.Lbswap_mask: - .octa 0x000102030405060708090A0B0C0D0E0F - -.section .rodata.cst32.byteshift_table, "aM", @progbits, 32 -.align 16 -# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len] -# is the index vector to shift left by 'len' bytes, and is also {0x80, ..., -# 0x80} XOR the index vector to shift right by '16 - len' bytes. -.Lbyteshift_table: - .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 - .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f - .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 - .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c deleted file mode 100644 index 71291d5af9f4..000000000000 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Cryptographic API. - * - * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions - * - * Copyright (C) 2013 Intel Corporation - * Author: Tim Chen <tim.c.chen@linux.intel.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
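The fold constants listed above are all of the form x^n mod G(x), so they can be rederived with a few lines of plain GF(2) arithmetic. A sketch (the helper name is illustrative; the table stores each 16-bit result in the low word of a .quad, with exponent conventions following the Intel white paper):

#include <stdint.h>
#include <stdio.h>

/* x^n mod G(x) over GF(2) for the degree-16 polynomial G(x) = 0x18BB7:
 * repeatedly multiply by x, reducing whenever an x^16 term appears. */
static uint16_t xn_mod_g(unsigned int n)
{
        uint16_t r = 1;                 /* r = x^0 */

        while (n--) {
                uint16_t carry = r & 0x8000;

                r <<= 1;                /* r *= x */
                if (carry)
                        r ^= 0x8BB7;    /* subtract (XOR) G(x) - x^16 */
        }
        return r;
}

int main(void)
{
        /* the 128-, 64-, 32- and 16-byte fold-constant pairs */
        for (unsigned int b = 8; b >= 1; b /= 2)
                printf("x^(%u*128) mod G = 0x%04x, x^(%u*128+64) mod G = 0x%04x\n",
                       b, xn_mod_g(b * 128), b, xn_mod_g(b * 128 + 64));
        return 0;
}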
- * - */ - -#include <linux/types.h> -#include <linux/module.h> -#include <linux/crc-t10dif.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <asm/cpufeatures.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> - -asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len); - -struct chksum_desc_ctx { - __u16 crc; -}; - -static int chksum_init(struct shash_desc *desc) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - ctx->crc = 0; - - return 0; -} - -static int chksum_update(struct shash_desc *desc, const u8 *data, - unsigned int length) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - if (length >= 16 && crypto_simd_usable()) { - kernel_fpu_begin(); - ctx->crc = crc_t10dif_pcl(ctx->crc, data, length); - kernel_fpu_end(); - } else - ctx->crc = crc_t10dif_generic(ctx->crc, data, length); - return 0; -} - -static int chksum_final(struct shash_desc *desc, u8 *out) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - *(__u16 *)out = ctx->crc; - return 0; -} - -static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out) -{ - if (len >= 16 && crypto_simd_usable()) { - kernel_fpu_begin(); - *(__u16 *)out = crc_t10dif_pcl(crc, data, len); - kernel_fpu_end(); - } else - *(__u16 *)out = crc_t10dif_generic(crc, data, len); - return 0; -} - -static int chksum_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - return __chksum_finup(ctx->crc, data, len, out); -} - -static int chksum_digest(struct shash_desc *desc, const u8 *data, - unsigned int length, u8 *out) -{ - return __chksum_finup(0, data, length, out); -} - -static struct shash_alg alg = { - .digestsize = CRC_T10DIF_DIGEST_SIZE, - .init = chksum_init, - .update = chksum_update, - .final = chksum_final, - .finup = chksum_finup, - .digest = chksum_digest, - .descsize = sizeof(struct chksum_desc_ctx), - .base = { - .cra_name = "crct10dif", - .cra_driver_name = "crct10dif-pclmul", - .cra_priority = 200, - .cra_blocksize = CRC_T10DIF_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static const struct x86_cpu_id crct10dif_cpu_id[] = { - X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id); - -static int __init crct10dif_intel_mod_init(void) -{ - if (!x86_match_cpu(crct10dif_cpu_id)) - return -ENODEV; - - return crypto_register_shash(&alg); -} - -static void __exit crct10dif_intel_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(crct10dif_intel_mod_init); -module_exit(crct10dif_intel_mod_fini); - -MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>"); -MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ."); -MODULE_LICENSE("GPL"); - -MODULE_ALIAS_CRYPTO("crct10dif"); -MODULE_ALIAS_CRYPTO("crct10dif-pclmul"); diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c index d55fa9e9b9e6..dcfc0de333de 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -1720,5 +1720,6 @@ module_exit(curve25519_mod_exit); MODULE_ALIAS_CRYPTO("curve25519"); MODULE_ALIAS_CRYPTO("curve25519-x86"); +MODULE_DESCRIPTION("Curve25519 algorithm, ADX optimized"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Jason A. 
Donenfeld <Jason@zx2c4.com>"); diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index abb8b1fe123b..34600f90d8a6 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -73,7 +73,7 @@ static int ecb_crypt(struct skcipher_request *req, const u32 *expkey) err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes)) { - u8 *wsrc = walk.src.virt.addr; + const u8 *wsrc = walk.src.virt.addr; u8 *wdst = walk.dst.virt.addr; /* Process four block batch */ @@ -291,7 +291,6 @@ static struct crypto_alg des3_ede_cipher = { .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = DES3_EDE_BLOCK_SIZE, .cra_ctxsize = sizeof(struct des3_ede_x86_ctx), - .cra_alignmask = 0, .cra_module = THIS_MODULE, .cra_u = { .cipher = { diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 99cb983ded9e..c4fbaa82ed7a 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -103,8 +103,8 @@ SYM_FUNC_START(clmul_ghash_mul) SYM_FUNC_END(clmul_ghash_mul) /* - * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, - * const le128 *shash); + * int clmul_ghash_update(char *dst, const char *src, unsigned int srclen, + * const le128 *shash); */ SYM_FUNC_START(clmul_ghash_update) FRAME_BEGIN @@ -127,6 +127,7 @@ SYM_FUNC_START(clmul_ghash_update) pshufb BSWAP, DATA movups DATA, (%rdi) .Lupdate_just_ret: + mov %rdx, %rax FRAME_END RET SYM_FUNC_END(clmul_ghash_update) diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 700ecaee9a08..aea5d4d06be7 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -7,41 +7,27 @@ * Author: Huang Ying <ying.huang@intel.com> */ -#include <linux/err.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/crypto.h> -#include <crypto/algapi.h> -#include <crypto/cryptd.h> -#include <crypto/gf128mul.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <asm/cpu_device_id.h> #include <asm/simd.h> -#include <asm/unaligned.h> - -#define GHASH_BLOCK_SIZE 16 -#define GHASH_DIGEST_SIZE 16 - -void clmul_ghash_mul(char *dst, const le128 *shash); +#include <crypto/b128ops.h> +#include <crypto/ghash.h> +#include <crypto/internal/hash.h> +#include <crypto/utils.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/unaligned.h> -void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, - const le128 *shash); +asmlinkage void clmul_ghash_mul(char *dst, const le128 *shash); -struct ghash_async_ctx { - struct cryptd_ahash *cryptd_tfm; -}; +asmlinkage int clmul_ghash_update(char *dst, const char *src, + unsigned int srclen, const le128 *shash); -struct ghash_ctx { +struct x86_ghash_ctx { le128 shash; }; -struct ghash_desc_ctx { - u8 buffer[GHASH_BLOCK_SIZE]; - u32 bytes; -}; - static int ghash_init(struct shash_desc *desc) { struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); @@ -54,7 +40,7 @@ static int ghash_init(struct shash_desc *desc) static int ghash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { - struct ghash_ctx *ctx = crypto_shash_ctx(tfm); + struct x86_ghash_ctx *ctx = crypto_shash_ctx(tfm); u64 a, b; if (keylen != GHASH_BLOCK_SIZE) @@ -95,64 +81,38 @@ static int ghash_setkey(struct crypto_shash *tfm, static int ghash_update(struct 
shash_desc *desc, const u8 *src, unsigned int srclen) { + struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); - struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); u8 *dst = dctx->buffer; + int remain; kernel_fpu_begin(); - if (dctx->bytes) { - int n = min(srclen, dctx->bytes); - u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes); - - dctx->bytes -= n; - srclen -= n; - - while (n--) - *pos++ ^= *src++; - - if (!dctx->bytes) - clmul_ghash_mul(dst, &ctx->shash); - } - - clmul_ghash_update(dst, src, srclen, &ctx->shash); + remain = clmul_ghash_update(dst, src, srclen, &ctx->shash); kernel_fpu_end(); - - if (srclen & 0xf) { - src += srclen - (srclen & 0xf); - srclen &= 0xf; - dctx->bytes = GHASH_BLOCK_SIZE - srclen; - while (srclen--) - *dst++ ^= *src++; - } - - return 0; + return remain; } -static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx) +static void ghash_flush(struct x86_ghash_ctx *ctx, struct ghash_desc_ctx *dctx, + const u8 *src, unsigned int len) { u8 *dst = dctx->buffer; - if (dctx->bytes) { - u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes); - - while (dctx->bytes--) - *tmp++ ^= 0; - - kernel_fpu_begin(); + kernel_fpu_begin(); + if (len) { + crypto_xor(dst, src, len); clmul_ghash_mul(dst, &ctx->shash); - kernel_fpu_end(); } - - dctx->bytes = 0; + kernel_fpu_end(); } -static int ghash_final(struct shash_desc *desc, u8 *dst) +static int ghash_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *dst) { + struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); - struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); u8 *buf = dctx->buffer; - ghash_flush(ctx, dctx); + ghash_flush(ctx, dctx, src, len); memcpy(dst, buf, GHASH_BLOCK_SIZE); return 0; @@ -162,175 +122,20 @@ static struct shash_alg ghash_alg = { .digestsize = GHASH_DIGEST_SIZE, .init = ghash_init, .update = ghash_update, - .final = ghash_final, + .finup = ghash_finup, .setkey = ghash_setkey, .descsize = sizeof(struct ghash_desc_ctx), .base = { - .cra_name = "__ghash", - .cra_driver_name = "__ghash-pclmulqdqni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_name = "ghash", + .cra_driver_name = "ghash-pclmulqdqni", + .cra_priority = 400, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = GHASH_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct ghash_ctx), + .cra_ctxsize = sizeof(struct x86_ghash_ctx), .cra_module = THIS_MODULE, }, }; -static int ghash_async_init(struct ahash_request *req) -{ - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); - - desc->tfm = child; - return crypto_shash_init(desc); -} - -static int ghash_async_update(struct ahash_request *req) -{ - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - - if (!crypto_simd_usable() || - (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { - memcpy(cryptd_req, req, sizeof(*req)); - ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); - return crypto_ahash_update(cryptd_req); - } else { - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - 
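With the cryptd-based async wrapper going away below, partial-block buffering moves into the shash core, and the assembly now reports how much input it left unconsumed. A C model of the new clmul_ghash_update() contract (gf128_mul_by_key() is a hypothetical stand-in for clmul_ghash_mul(); the actual field multiply is elided):

#include <stdint.h>

#define GHASH_BLOCK 16

/* Stand-in for clmul_ghash_mul(): multiply the 128-bit accumulator by the
 * hash key in GF(2^128). The real math is omitted in this sketch. */
static void gf128_mul_by_key(uint8_t dst[GHASH_BLOCK], const void *shash)
{
        (void)dst;
        (void)shash;
}

/* Absorb whole 16-byte blocks only; return the number of tail bytes left
 * for the caller (the block-only shash core) to buffer. */
static unsigned int ghash_update_model(uint8_t *dst, const uint8_t *src,
                                       unsigned int srclen, const void *shash)
{
        while (srclen >= GHASH_BLOCK) {
                for (int i = 0; i < GHASH_BLOCK; i++)
                        dst[i] ^= src[i];
                gf128_mul_by_key(dst, shash);   /* clmul_ghash_mul() in asm */
                src += GHASH_BLOCK;
                srclen -= GHASH_BLOCK;
        }
        return srclen;
}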
return shash_ahash_update(req, desc); - } -} - -static int ghash_async_final(struct ahash_request *req) -{ - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - - if (!crypto_simd_usable() || - (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { - memcpy(cryptd_req, req, sizeof(*req)); - ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); - return crypto_ahash_final(cryptd_req); - } else { - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - return crypto_shash_final(desc, req->result); - } -} - -static int ghash_async_import(struct ahash_request *req, const void *in) -{ - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); - - ghash_async_init(req); - memcpy(dctx, in, sizeof(*dctx)); - return 0; - -} - -static int ghash_async_export(struct ahash_request *req, void *out) -{ - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); - - memcpy(out, dctx, sizeof(*dctx)); - return 0; - -} - -static int ghash_async_digest(struct ahash_request *req) -{ - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - - if (!crypto_simd_usable() || - (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { - memcpy(cryptd_req, req, sizeof(*req)); - ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); - return crypto_ahash_digest(cryptd_req); - } else { - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); - - desc->tfm = child; - return shash_ahash_digest(req, desc); - } -} - -static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key, - unsigned int keylen) -{ - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct crypto_ahash *child = &ctx->cryptd_tfm->base; - - crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK); - crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm) - & CRYPTO_TFM_REQ_MASK); - return crypto_ahash_setkey(child, key, keylen); -} - -static int ghash_async_init_tfm(struct crypto_tfm *tfm) -{ - struct cryptd_ahash *cryptd_tfm; - struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); - - cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", - CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ctx->cryptd_tfm = cryptd_tfm; - crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), - sizeof(struct ahash_request) + - crypto_ahash_reqsize(&cryptd_tfm->base)); - - return 0; -} - -static void ghash_async_exit_tfm(struct crypto_tfm *tfm) -{ - struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); - - cryptd_free_ahash(ctx->cryptd_tfm); -} - -static struct ahash_alg ghash_async_alg = { - .init = ghash_async_init, - .update = ghash_async_update, - .final = ghash_async_final, - .setkey = ghash_async_setkey, - .digest = ghash_async_digest, - .export = ghash_async_export, - .import = ghash_async_import, - .halg = { - .digestsize = GHASH_DIGEST_SIZE, - .statesize = sizeof(struct ghash_desc_ctx), - .base = { - .cra_name = "ghash", - .cra_driver_name = "ghash-clmulni", - .cra_priority = 400, - 
.cra_ctxsize = sizeof(struct ghash_async_ctx), - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = GHASH_BLOCK_SIZE, - .cra_module = THIS_MODULE, - .cra_init = ghash_async_init_tfm, - .cra_exit = ghash_async_exit_tfm, - }, - }, -}; - static const struct x86_cpu_id pcmul_cpu_id[] = { X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), /* Pickle-Mickle-Duck */ {} @@ -339,29 +144,14 @@ MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id); static int __init ghash_pclmulqdqni_mod_init(void) { - int err; - if (!x86_match_cpu(pcmul_cpu_id)) return -ENODEV; - err = crypto_register_shash(&ghash_alg); - if (err) - goto err_out; - err = crypto_register_ahash(&ghash_async_alg); - if (err) - goto err_shash; - - return 0; - -err_shash: - crypto_unregister_shash(&ghash_alg); -err_out: - return err; + return crypto_register_shash(&ghash_alg); } static void __exit ghash_pclmulqdqni_mod_exit(void) { - crypto_unregister_ahash(&ghash_async_alg); crypto_unregister_shash(&ghash_alg); } diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S index ef73a3ab8726..791386d9a83a 100644 --- a/arch/x86/crypto/nh-avx2-x86_64.S +++ b/arch/x86/crypto/nh-avx2-x86_64.S @@ -154,5 +154,6 @@ SYM_TYPED_FUNC_START(nh_avx2) vpaddq T1, T0, T0 vpaddq T4, T0, T0 vmovdqu T0, (HASH) + vzeroupper RET SYM_FUNC_END(nh_avx2) diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl deleted file mode 100644 index b9abcd79c1f4..000000000000 --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl +++ /dev/null @@ -1,4248 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause -# -# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. -# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. -# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. -# -# This code is taken from the OpenSSL project but the author, Andy Polyakov, -# has relicensed it under the licenses specified in the SPDX header above. -# The original headers, including the original license headers, are -# included below for completeness. -# -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# This module implements Poly1305 hash for x86_64. -# -# March 2015 -# -# Initial release. -# -# December 2016 -# -# Add AVX512F+VL+BW code path. -# -# November 2017 -# -# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be -# executed even on Knights Landing. Trigger for modification was -# observation that AVX512 code paths can negatively affect overall -# Skylake-X system performance. Since we are likely to suppress -# AVX512F capability flag [at least on Skylake-X], conversion serves -# as kind of "investment protection". Note that next *lake processor, -# Cannonlake, has AVX512IFMA code path to execute... -# -# Numbers are cycles per processed byte with poly1305_blocks alone, -# measured with rdtsc at fixed clock frequency. 
-# -# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 -# P4 4.46/+120% - -# Core 2 2.41/+90% - -# Westmere 1.88/+120% - -# Sandy Bridge 1.39/+140% 1.10 -# Haswell 1.14/+175% 1.11 0.65 -# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] -# Silvermont 2.83/+95% - -# Knights L 3.60/? 1.65 1.10 0.41(***) -# Goldmont 1.70/+180% - -# VIA Nano 1.82/+150% - -# Sledgehammer 1.38/+160% - -# Bulldozer 2.30/+130% 0.97 -# Ryzen 1.15/+200% 1.08 1.18 -# -# (*) improvement coefficients relative to clang are more modest and -# are ~50% on most processors, in both cases we are comparing to -# __int128 code; -# (**) SSE2 implementation was attempted, but among non-AVX processors -# it was faster than integer-only code only on older Intel P4 and -# Core processors, 50-30%, less newer processor is, but slower on -# contemporary ones, for example almost 2x slower on Atom, and as -# former are naturally disappearing, SSE2 is deemed unnecessary; -# (***) strangely enough performance seems to vary from core to core, -# listed result is best case; - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); -$kernel=0; $kernel=1 if (!$flavour && !$output); - -if (!$kernel) { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or - die "can't locate x86_64-xlate.pl"; - - open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; - *STDOUT=*OUT; - - if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); - } - - if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { - $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); - $avx += 1 if ($1==2.11 && $2>=8); - } - - if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && - `ml64 2>&1` =~ /Version ([0-9]+)\./) { - $avx = ($1>=10) + ($1>=11); - } - - if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { - $avx = ($2>=3.0) + ($2>3.0); - } -} else { - $avx = 4; # The kernel uses ifdefs for this. 
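The poly1305_iteration sub defined a little further below is the scalar multiply-reduce core. A C model of one iteration, assuming r is clamped (r1's low two bits are zero, so s1 = r1 + (r1 >> 2) is exactly (5/4)*r1) and a compiler providing unsigned __int128; the function name is illustrative:

#include <stdint.h>

typedef unsigned __int128 u128;

/* (h0,h1,h2) = h * r mod 2^130 - 5, radix 2^64. The h1*r1 product drops a
 * limb because 2^128 == 5/4 (mod 2^130 - 5), and h2*r1 similarly lands at
 * the 2^64 position; both absorb the factor 5/4 through s1. */
static void poly1305_mul_r(uint64_t h[3], uint64_t r0, uint64_t r1)
{
        uint64_t s1 = r1 + (r1 >> 2);
        u128 d0, d1;
        uint64_t d2, c;

        d0 = (u128)h[0] * r0 + (u128)h[1] * s1;
        d1 = (u128)h[0] * r1 + (u128)h[1] * r0 + (u128)h[2] * s1;
        d2 = h[2] * r0;                 /* h[2] is at most a few bits */

        d1 += (uint64_t)(d0 >> 64);
        d2 += (uint64_t)(d1 >> 64);

        /* last reduction step: fold 5 * (d2 >> 2) back into the low limb,
         * leaving only two bits (plus carries) in h[2] */
        c = (d2 & ~(uint64_t)3) + (d2 >> 2);
        d2 &= 3;

        d0 = (u128)(uint64_t)d0 + c;
        h[0] = (uint64_t)d0;
        d1 = (u128)(uint64_t)d1 + (uint64_t)(d0 >> 64);
        h[1] = (uint64_t)d1;
        h[2] = d2 + (uint64_t)(d1 >> 64);
}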
-} - -sub declare_function() { - my ($name, $align, $nargs) = @_; - if($kernel) { - $code .= "SYM_FUNC_START($name)\n"; - $code .= ".L$name:\n"; - } else { - $code .= ".globl $name\n"; - $code .= ".type $name,\@function,$nargs\n"; - $code .= ".align $align\n"; - $code .= "$name:\n"; - } -} - -sub end_function() { - my ($name) = @_; - if($kernel) { - $code .= "SYM_FUNC_END($name)\n"; - } else { - $code .= ".size $name,.-$name\n"; - } -} - -$code.=<<___ if $kernel; -#include <linux/linkage.h> -___ - -if ($avx) { -$code.=<<___ if $kernel; -.section .rodata -___ -$code.=<<___; -.align 64 -.Lconst: -.Lmask24: -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 -.L129: -.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 -.Lmask26: -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 -.Lpermd_avx2: -.long 2,2,2,3,2,0,2,1 -.Lpermd_avx512: -.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 - -.L2_44_inp_permd: -.long 0,1,1,2,2,3,7,7 -.L2_44_inp_shift: -.quad 0,12,24,64 -.L2_44_mask: -.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff -.L2_44_shift_rgt: -.quad 44,44,42,64 -.L2_44_shift_lft: -.quad 8,8,10,64 - -.align 64 -.Lx_mask44: -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.Lx_mask42: -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff -___ -} -$code.=<<___ if (!$kernel); -.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" -.align 16 -___ - -my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); -my ($mac,$nonce)=($inp,$len); # *_emit arguments -my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13"); -my ($h0,$h1,$h2)=("%r14","%rbx","%r10"); - -sub poly1305_iteration { -# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 -# output: $h0-$h2 *= $r0-$r1 -$code.=<<___; - mulq $h0 # h0*r1 - mov %rax,$d2 - mov $r0,%rax - mov %rdx,$d3 - - mulq $h0 # h0*r0 - mov %rax,$h0 # future $h0 - mov $r0,%rax - mov %rdx,$d1 - - mulq $h1 # h1*r0 - add %rax,$d2 - mov $s1,%rax - adc %rdx,$d3 - - mulq $h1 # h1*s1 - mov $h2,$h1 # borrow $h1 - add %rax,$h0 - adc %rdx,$d1 - - imulq $s1,$h1 # h2*s1 - add $h1,$d2 - mov $d1,$h1 - adc \$0,$d3 - - imulq $r0,$h2 # h2*r0 - add $d2,$h1 - mov \$-4,%rax # mask value - adc $h2,$d3 - - and $d3,%rax # last reduction step - mov $d3,$h2 - shr \$2,$d3 - and \$3,$h2 - add $d3,%rax - add %rax,$h0 - adc \$0,$h1 - adc \$0,$h2 -___ -} - -######################################################################## -# Layout of opaque area is following. -# -# unsigned __int64 h[3]; # current hash value base 2^64 -# unsigned __int64 r[2]; # key value base 2^64 - -$code.=<<___; -.text -___ -$code.=<<___ if (!$kernel); -.extern OPENSSL_ia32cap_P - -.globl poly1305_init_x86_64 -.hidden poly1305_init_x86_64 -.globl poly1305_blocks_x86_64 -.hidden poly1305_blocks_x86_64 -.globl poly1305_emit_x86_64 -.hidden poly1305_emit_x86_64 -___ -&declare_function("poly1305_init_x86_64", 32, 3); -$code.=<<___; - xor %eax,%eax - mov %rax,0($ctx) # initialize hash value - mov %rax,8($ctx) - mov %rax,16($ctx) - - test $inp,$inp - je .Lno_key -___ -$code.=<<___ if (!$kernel); - lea poly1305_blocks_x86_64(%rip),%r10 - lea poly1305_emit_x86_64(%rip),%r11 -___ -$code.=<<___ if (!$kernel && $avx); - mov OPENSSL_ia32cap_P+4(%rip),%r9 - lea poly1305_blocks_avx(%rip),%rax - lea poly1305_emit_avx(%rip),%rcx - bt \$`60-32`,%r9 # AVX? 
- cmovc %rax,%r10 - cmovc %rcx,%r11 -___ -$code.=<<___ if (!$kernel && $avx>1); - lea poly1305_blocks_avx2(%rip),%rax - bt \$`5+32`,%r9 # AVX2? - cmovc %rax,%r10 -___ -$code.=<<___ if (!$kernel && $avx>3); - mov \$`(1<<31|1<<21|1<<16)`,%rax - shr \$32,%r9 - and %rax,%r9 - cmp %rax,%r9 - je .Linit_base2_44 -___ -$code.=<<___; - mov \$0x0ffffffc0fffffff,%rax - mov \$0x0ffffffc0ffffffc,%rcx - and 0($inp),%rax - and 8($inp),%rcx - mov %rax,24($ctx) - mov %rcx,32($ctx) -___ -$code.=<<___ if (!$kernel && $flavour !~ /elf32/); - mov %r10,0(%rdx) - mov %r11,8(%rdx) -___ -$code.=<<___ if (!$kernel && $flavour =~ /elf32/); - mov %r10d,0(%rdx) - mov %r11d,4(%rdx) -___ -$code.=<<___; - mov \$1,%eax -.Lno_key: - RET -___ -&end_function("poly1305_init_x86_64"); - -&declare_function("poly1305_blocks_x86_64", 32, 4); -$code.=<<___; -.cfi_startproc -.Lblocks: - shr \$4,$len - jz .Lno_data # too short - - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $ctx -.cfi_push $ctx -.Lblocks_body: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2 - - mov $s1,$r1 - shr \$2,$s1 - mov $r1,%rax - add $r1,$s1 # s1 = r1 + (r1 >> 2) - jmp .Loop - -.align 32 -.Loop: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 -___ - - &poly1305_iteration(); - -$code.=<<___; - mov $r1,%rax - dec %r15 # len-=16 - jnz .Loop - - mov 0(%rsp),$ctx -.cfi_restore $ctx - - mov $h0,0($ctx) # store hash value - mov $h1,8($ctx) - mov $h2,16($ctx) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lno_data: -.Lblocks_epilogue: - RET -.cfi_endproc -___ -&end_function("poly1305_blocks_x86_64"); - -&declare_function("poly1305_emit_x86_64", 32, 3); -$code.=<<___; -.Lemit: - mov 0($ctx),%r8 # load hash value - mov 8($ctx),%r9 - mov 16($ctx),%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overflow? - cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - RET -___ -&end_function("poly1305_emit_x86_64"); -if ($avx) { - -######################################################################## -# Layout of opaque area is following. -# -# unsigned __int32 h[5]; # current hash value base 2^26 -# unsigned __int32 is_base2_26; -# unsigned __int64 r[2]; # key value base 2^64 -# unsigned __int64 pad; -# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; -# -# where r^n are base 2^26 digits of degrees of multiplier key. There are -# 5 digits, but last four are interleaved with multiples of 5, totalling -# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. 
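The base 2^64 -> base 2^26 conversions in the AVX paths below implement, with shifts and masks, exactly the digit split described in the layout comment above. In C (names illustrative):

#include <stdint.h>

/* Split the 130-bit state (h0, h1 full 64-bit limbs; h2 the top two bits
 * plus pending carries) into five 26-bit digits. */
static void poly1305_to_base26(uint32_t d[5],
                               uint64_t h0, uint64_t h1, uint64_t h2)
{
        d[0] = (uint32_t)(h0 & 0x3ffffff);                         /* bits   0..25  */
        d[1] = (uint32_t)((h0 >> 26) & 0x3ffffff);                 /* bits  26..51  */
        d[2] = (uint32_t)(((h0 >> 52) | (h1 << 12)) & 0x3ffffff);  /* bits  52..77  */
        d[3] = (uint32_t)((h1 >> 14) & 0x3ffffff);                 /* bits  78..103 */
        d[4] = (uint32_t)((h1 >> 40) | (h2 << 24));                /* bits 104..129 */
}

This corresponds line for line to the shr/shl/or sequences in the "base 2^64 -> base 2^26" blocks and .Linit_avx further down.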
- -my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = - map("%xmm$_",(0..15)); - -$code.=<<___; -.type __poly1305_block,\@abi-omnipotent -.align 32 -__poly1305_block: - push $ctx -___ - &poly1305_iteration(); -$code.=<<___; - pop $ctx - RET -.size __poly1305_block,.-__poly1305_block - -.type __poly1305_init_avx,\@abi-omnipotent -.align 32 -__poly1305_init_avx: - push %rbp - mov %rsp,%rbp - mov $r0,$h0 - mov $r1,$h1 - xor $h2,$h2 - - lea 48+64($ctx),$ctx # size optimization - - mov $r1,%rax - call __poly1305_block # r^2 - - mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 - mov \$0x3ffffff,%edx - mov $h0,$d1 - and $h0#d,%eax - mov $r0,$d2 - and $r0#d,%edx - mov %eax,`16*0+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*0+4-64`($ctx) - shr \$26,$d2 - - mov \$0x3ffffff,%eax - mov \$0x3ffffff,%edx - and $d1#d,%eax - and $d2#d,%edx - mov %eax,`16*1+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*1+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*2+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*2+4-64`($ctx) - shr \$26,$d2 - - mov $h1,%rax - mov $r1,%rdx - shl \$12,%rax - shl \$12,%rdx - or $d1,%rax - or $d2,%rdx - and \$0x3ffffff,%eax - and \$0x3ffffff,%edx - mov %eax,`16*3+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*3+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*4+0-64`($ctx) - mov $h1,$d1 - mov %edx,`16*4+4-64`($ctx) - mov $r1,$d2 - - mov \$0x3ffffff,%eax - mov \$0x3ffffff,%edx - shr \$14,$d1 - shr \$14,$d2 - and $d1#d,%eax - and $d2#d,%edx - mov %eax,`16*5+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*5+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*6+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*6+4-64`($ctx) - shr \$26,$d2 - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+0-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d2#d,`16*7+4-64`($ctx) - lea ($d2,$d2,4),$d2 # *5 - mov $d1#d,`16*8+0-64`($ctx) - mov $d2#d,`16*8+4-64`($ctx) - - mov $r1,%rax - call __poly1305_block # r^3 - - mov \$0x3ffffff,%eax # save r^3 base 2^26 - mov $h0,$d1 - and $h0#d,%eax - shr \$26,$d1 - mov %eax,`16*0+12-64`($ctx) - - mov \$0x3ffffff,%edx - and $d1#d,%edx - mov %edx,`16*1+12-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*2+12-64`($ctx) - - mov $h1,%rax - shl \$12,%rax - or $d1,%rax - and \$0x3ffffff,%eax - mov %eax,`16*3+12-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov $h1,$d1 - mov %eax,`16*4+12-64`($ctx) - - mov \$0x3ffffff,%edx - shr \$14,$d1 - and $d1#d,%edx - mov %edx,`16*5+12-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*6+12-64`($ctx) - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+12-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d1#d,`16*8+12-64`($ctx) - - mov $r1,%rax - call __poly1305_block # r^4 - - mov \$0x3ffffff,%eax # save r^4 base 2^26 - mov $h0,$d1 - and $h0#d,%eax - shr \$26,$d1 - mov %eax,`16*0+8-64`($ctx) - - mov \$0x3ffffff,%edx - and $d1#d,%edx - mov %edx,`16*1+8-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*2+8-64`($ctx) - - mov $h1,%rax - shl \$12,%rax - or $d1,%rax - and \$0x3ffffff,%eax - mov %eax,`16*3+8-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov $h1,$d1 - mov %eax,`16*4+8-64`($ctx) - - mov \$0x3ffffff,%edx - shr \$14,$d1 - and $d1#d,%edx - mov %edx,`16*5+8-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*6+8-64`($ctx) - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+8-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d1#d,`16*8+8-64`($ctx) - - lea -48-64($ctx),$ctx # 
size [de-]optimization - pop %rbp - RET -.size __poly1305_init_avx,.-__poly1305_init_avx -___ - -&declare_function("poly1305_blocks_avx", 32, 4); -$code.=<<___; -.cfi_startproc - mov 20($ctx),%r8d # is_base2_26 - cmp \$128,$len - jae .Lblocks_avx - test %r8d,%r8d - jz .Lblocks - -.Lblocks_avx: - and \$-16,$len - jz .Lno_data_avx - - vzeroupper - - test %r8d,%r8d - jz .Lbase2_64_avx - - test \$31,$len - jz .Leven_avx - - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lblocks_avx_body: - - mov $len,%r15 # reassign $len - - mov 0($ctx),$d1 # load hash value - mov 8($ctx),$d2 - mov 16($ctx),$h2#d - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - ################################# base 2^26 -> base 2^64 - mov $d1#d,$h0#d - and \$`-1*(1<<31)`,$d1 - mov $d2,$r1 # borrow $r1 - mov $d2#d,$h1#d - and \$`-1*(1<<31)`,$d2 - - shr \$6,$d1 - shl \$52,$r1 - add $d1,$h0 - shr \$12,$h1 - shr \$18,$d2 - add $r1,$h0 - adc $d2,$h1 - - mov $h2,$d1 - shl \$40,$d1 - shr \$24,$h2 - add $d1,$h1 - adc \$0,$h2 # can be partially reduced... - - mov \$-4,$d2 # ... so reduce - mov $h2,$d1 - and $h2,$d2 - shr \$2,$d1 - and \$3,$h2 - add $d2,$d1 # =*5 - add $d1,$h0 - adc \$0,$h1 - adc \$0,$h2 - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - - call __poly1305_block - - test $padbit,$padbit # if $padbit is zero, - jz .Lstore_base2_64_avx # store hash in base 2^64 format - - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$r0 - mov $h1,$r1 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$r0 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $r0,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$r1 - and \$0x3ffffff,$h1 # h[3] - or $r1,$h2 # h[4] - - sub \$16,%r15 - jz .Lstore_base2_26_avx - - vmovd %rax#d,$H0 - vmovd %rdx#d,$H1 - vmovd $h0#d,$H2 - vmovd $h1#d,$H3 - vmovd $h2#d,$H4 - jmp .Lproceed_avx - -.align 32 -.Lstore_base2_64_avx: - mov $h0,0($ctx) - mov $h1,8($ctx) - mov $h2,16($ctx) # note that is_base2_26 is zeroed - jmp .Ldone_avx - -.align 16 -.Lstore_base2_26_avx: - mov %rax#d,0($ctx) # store hash value base 2^26 - mov %rdx#d,4($ctx) - mov $h0#d,8($ctx) - mov $h1#d,12($ctx) - mov $h2#d,16($ctx) -.align 16 -.Ldone_avx: - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lno_data_avx: -.Lblocks_avx_epilogue: - RET -.cfi_endproc - -.align 32 -.Lbase2_64_avx: -.cfi_startproc - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lbase2_64_avx_body: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2#d - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - test \$31,$len - jz .Linit_avx - - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - -.Linit_avx: - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$d1 - mov $h1,$d2 - shr \$26,%rdx - and 
\$0x3ffffff,%rax # h[0] - shl \$12,$d1 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $d1,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$d2 - and \$0x3ffffff,$h1 # h[3] - or $d2,$h2 # h[4] - - vmovd %rax#d,$H0 - vmovd %rdx#d,$H1 - vmovd $h0#d,$H2 - vmovd $h1#d,$H3 - vmovd $h2#d,$H4 - movl \$1,20($ctx) # set is_base2_26 - - call __poly1305_init_avx - -.Lproceed_avx: - mov %r15,$len - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lbase2_64_avx_epilogue: - jmp .Ldo_avx -.cfi_endproc - -.align 32 -.Leven_avx: -.cfi_startproc - vmovd 4*0($ctx),$H0 # load hash value - vmovd 4*1($ctx),$H1 - vmovd 4*2($ctx),$H2 - vmovd 4*3($ctx),$H3 - vmovd 4*4($ctx),$H4 - -.Ldo_avx: -___ -$code.=<<___ if (!$win64); - lea 8(%rsp),%r10 -.cfi_def_cfa_register %r10 - and \$-32,%rsp - sub \$-8,%rsp - lea -0x58(%rsp),%r11 - sub \$0x178,%rsp -___ -$code.=<<___ if ($win64); - lea -0xf8(%rsp),%r11 - sub \$0x218,%rsp - vmovdqa %xmm6,0x50(%r11) - vmovdqa %xmm7,0x60(%r11) - vmovdqa %xmm8,0x70(%r11) - vmovdqa %xmm9,0x80(%r11) - vmovdqa %xmm10,0x90(%r11) - vmovdqa %xmm11,0xa0(%r11) - vmovdqa %xmm12,0xb0(%r11) - vmovdqa %xmm13,0xc0(%r11) - vmovdqa %xmm14,0xd0(%r11) - vmovdqa %xmm15,0xe0(%r11) -.Ldo_avx_body: -___ -$code.=<<___; - sub \$64,$len - lea -32($inp),%rax - cmovc %rax,$inp - - vmovdqu `16*3`($ctx),$D4 # preload r0^2 - lea `16*3+64`($ctx),$ctx # size optimization - lea .Lconst(%rip),%rcx - - ################################################################ - # load input - vmovdqu 16*2($inp),$T0 - vmovdqu 16*3($inp),$T1 - vmovdqa 64(%rcx),$MASK # .Lmask26 - - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - - vpsrlq \$40,$T4,$T4 # 4 - vpsrlq \$26,$T0,$T1 - vpand $MASK,$T0,$T0 # 0 - vpsrlq \$4,$T3,$T2 - vpand $MASK,$T1,$T1 # 1 - vpsrlq \$30,$T3,$T3 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - jbe .Lskip_loop_avx - - # expand and copy pre-calculated table to stack - vmovdqu `16*1-64`($ctx),$D1 - vmovdqu `16*2-64`($ctx),$D2 - vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 - vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 - vmovdqa $D3,-0x90(%r11) - vmovdqa $D0,0x00(%rsp) - vpshufd \$0xEE,$D1,$D4 - vmovdqu `16*3-64`($ctx),$D0 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D4,-0x80(%r11) - vmovdqa $D1,0x10(%rsp) - vpshufd \$0xEE,$D2,$D3 - vmovdqu `16*4-64`($ctx),$D1 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D3,-0x70(%r11) - vmovdqa $D2,0x20(%rsp) - vpshufd \$0xEE,$D0,$D4 - vmovdqu `16*5-64`($ctx),$D2 - vpshufd \$0x44,$D0,$D0 - vmovdqa $D4,-0x60(%r11) - vmovdqa $D0,0x30(%rsp) - vpshufd \$0xEE,$D1,$D3 - vmovdqu `16*6-64`($ctx),$D0 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D3,-0x50(%r11) - vmovdqa $D1,0x40(%rsp) - vpshufd \$0xEE,$D2,$D4 - vmovdqu `16*7-64`($ctx),$D1 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D4,-0x40(%r11) - vmovdqa $D2,0x50(%rsp) - vpshufd \$0xEE,$D0,$D3 - vmovdqu `16*8-64`($ctx),$D2 - vpshufd \$0x44,$D0,$D0 - vmovdqa $D3,-0x30(%r11) - vmovdqa $D0,0x60(%rsp) - vpshufd \$0xEE,$D1,$D4 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D4,-0x20(%r11) - vmovdqa $D1,0x70(%rsp) - vpshufd \$0xEE,$D2,$D3 - vmovdqa 0x00(%rsp),$D4 # preload r0^2 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D3,-0x10(%r11) - vmovdqa $D2,0x80(%rsp) - - jmp .Loop_avx - -.align 32 -.Loop_avx: - ################################################################ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - 
# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - # \___________________/ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - # \___________________/ \____________________/ - # - # Note that we start with inp[2:3]*r^2. This is because it - # doesn't depend on reduction in previous iteration. - ################################################################ - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # though note that $Tx and $Hx are "reversed" in this section, - # and $D4 is preloaded with r0^2... - - vpmuludq $T0,$D4,$D0 # d0 = h0*r0 - vpmuludq $T1,$D4,$D1 # d1 = h1*r0 - vmovdqa $H2,0x20(%r11) # offload hash - vpmuludq $T2,$D4,$D2 # d3 = h2*r0 - vmovdqa 0x10(%rsp),$H2 # r1^2 - vpmuludq $T3,$D4,$D3 # d3 = h3*r0 - vpmuludq $T4,$D4,$D4 # d4 = h4*r0 - - vmovdqa $H0,0x00(%r11) # - vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 - vmovdqa $H1,0x10(%r11) # - vpmuludq $T3,$H2,$H1 # h3*r1 - vpaddq $H0,$D0,$D0 # d0 += h4*s1 - vpaddq $H1,$D4,$D4 # d4 += h3*r1 - vmovdqa $H3,0x30(%r11) # - vpmuludq $T2,$H2,$H0 # h2*r1 - vpmuludq $T1,$H2,$H1 # h1*r1 - vpaddq $H0,$D3,$D3 # d3 += h2*r1 - vmovdqa 0x30(%rsp),$H3 # r2^2 - vpaddq $H1,$D2,$D2 # d2 += h1*r1 - vmovdqa $H4,0x40(%r11) # - vpmuludq $T0,$H2,$H2 # h0*r1 - vpmuludq $T2,$H3,$H0 # h2*r2 - vpaddq $H2,$D1,$D1 # d1 += h0*r1 - - vmovdqa 0x40(%rsp),$H4 # s2^2 - vpaddq $H0,$D4,$D4 # d4 += h2*r2 - vpmuludq $T1,$H3,$H1 # h1*r2 - vpmuludq $T0,$H3,$H3 # h0*r2 - vpaddq $H1,$D3,$D3 # d3 += h1*r2 - vmovdqa 0x50(%rsp),$H2 # r3^2 - vpaddq $H3,$D2,$D2 # d2 += h0*r2 - vpmuludq $T4,$H4,$H0 # h4*s2 - vpmuludq $T3,$H4,$H4 # h3*s2 - vpaddq $H0,$D1,$D1 # d1 += h4*s2 - vmovdqa 0x60(%rsp),$H3 # s3^2 - vpaddq $H4,$D0,$D0 # d0 += h3*s2 - - vmovdqa 0x80(%rsp),$H4 # s4^2 - vpmuludq $T1,$H2,$H1 # h1*r3 - vpmuludq $T0,$H2,$H2 # h0*r3 - vpaddq $H1,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $T4,$H3,$H0 # h4*s3 - vpmuludq $T3,$H3,$H1 # h3*s3 - vpaddq $H0,$D2,$D2 # d2 += h4*s3 - vmovdqu 16*0($inp),$H0 # load input - vpaddq $H1,$D1,$D1 # d1 += h3*s3 - vpmuludq $T2,$H3,$H3 # h2*s3 - vpmuludq $T2,$H4,$T2 # h2*s4 - vpaddq $H3,$D0,$D0 # d0 += h2*s3 - - vmovdqu 16*1($inp),$H1 # - vpaddq $T2,$D1,$D1 # d1 += h2*s4 - vpmuludq $T3,$H4,$T3 # h3*s4 - vpmuludq $T4,$H4,$T4 # h4*s4 - vpsrldq \$6,$H0,$H2 # splat input - vpaddq $T3,$D2,$D2 # d2 += h3*s4 - vpaddq $T4,$D3,$D3 # d3 += h4*s4 - vpsrldq \$6,$H1,$H3 # - vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 - vpmuludq $T1,$H4,$T0 # h1*s4 - vpunpckhqdq $H1,$H0,$H4 # 4 - vpaddq $T4,$D4,$D4 # d4 += h0*r4 - vmovdqa -0x90(%r11),$T4 # r0^4 - vpaddq $T0,$D0,$D0 # d0 += h1*s4 - - vpunpcklqdq $H1,$H0,$H0 # 0:1 - vpunpcklqdq $H3,$H2,$H3 # 2:3 - - #vpsrlq \$40,$H4,$H4 # 4 - vpsrldq \$`40/8`,$H4,$H4 # 4 - vpsrlq \$26,$H0,$H1 - vpand $MASK,$H0,$H0 # 0 - vpsrlq \$4,$H3,$H2 - vpand $MASK,$H1,$H1 # 1 - vpand 0(%rcx),$H4,$H4 # .Lmask24 - vpsrlq \$30,$H3,$H3 - vpand $MASK,$H2,$H2 # 2 - vpand $MASK,$H3,$H3 # 3 - vpor 32(%rcx),$H4,$H4 # padbit, yes, always - - vpaddq 0x00(%r11),$H0,$H0 # add hash value - vpaddq 0x10(%r11),$H1,$H1 - vpaddq 0x20(%r11),$H2,$H2 - vpaddq 0x30(%r11),$H3,$H3 - vpaddq 0x40(%r11),$H4,$H4 - - lea 16*2($inp),%rax - lea 16*4($inp),$inp - sub \$64,$len - cmovc %rax,$inp - - ################################################################ - # Now we accumulate 
(inp[0:1]+hash)*r^4 - ################################################################ - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - vpmuludq $H0,$T4,$T0 # h0*r0 - vpmuludq $H1,$T4,$T1 # h1*r0 - vpaddq $T0,$D0,$D0 - vpaddq $T1,$D1,$D1 - vmovdqa -0x80(%r11),$T2 # r1^4 - vpmuludq $H2,$T4,$T0 # h2*r0 - vpmuludq $H3,$T4,$T1 # h3*r0 - vpaddq $T0,$D2,$D2 - vpaddq $T1,$D3,$D3 - vpmuludq $H4,$T4,$T4 # h4*r0 - vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 - vpaddq $T4,$D4,$D4 - - vpaddq $T0,$D0,$D0 # d0 += h4*s1 - vpmuludq $H2,$T2,$T1 # h2*r1 - vpmuludq $H3,$T2,$T0 # h3*r1 - vpaddq $T1,$D3,$D3 # d3 += h2*r1 - vmovdqa -0x60(%r11),$T3 # r2^4 - vpaddq $T0,$D4,$D4 # d4 += h3*r1 - vpmuludq $H1,$T2,$T1 # h1*r1 - vpmuludq $H0,$T2,$T2 # h0*r1 - vpaddq $T1,$D2,$D2 # d2 += h1*r1 - vpaddq $T2,$D1,$D1 # d1 += h0*r1 - - vmovdqa -0x50(%r11),$T4 # s2^4 - vpmuludq $H2,$T3,$T0 # h2*r2 - vpmuludq $H1,$T3,$T1 # h1*r2 - vpaddq $T0,$D4,$D4 # d4 += h2*r2 - vpaddq $T1,$D3,$D3 # d3 += h1*r2 - vmovdqa -0x40(%r11),$T2 # r3^4 - vpmuludq $H0,$T3,$T3 # h0*r2 - vpmuludq $H4,$T4,$T0 # h4*s2 - vpaddq $T3,$D2,$D2 # d2 += h0*r2 - vpaddq $T0,$D1,$D1 # d1 += h4*s2 - vmovdqa -0x30(%r11),$T3 # s3^4 - vpmuludq $H3,$T4,$T4 # h3*s2 - vpmuludq $H1,$T2,$T1 # h1*r3 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - - vmovdqa -0x10(%r11),$T4 # s4^4 - vpaddq $T1,$D4,$D4 # d4 += h1*r3 - vpmuludq $H0,$T2,$T2 # h0*r3 - vpmuludq $H4,$T3,$T0 # h4*s3 - vpaddq $T2,$D3,$D3 # d3 += h0*r3 - vpaddq $T0,$D2,$D2 # d2 += h4*s3 - vmovdqu 16*2($inp),$T0 # load input - vpmuludq $H3,$T3,$T2 # h3*s3 - vpmuludq $H2,$T3,$T3 # h2*s3 - vpaddq $T2,$D1,$D1 # d1 += h3*s3 - vmovdqu 16*3($inp),$T1 # - vpaddq $T3,$D0,$D0 # d0 += h2*s3 - - vpmuludq $H2,$T4,$H2 # h2*s4 - vpmuludq $H3,$T4,$H3 # h3*s4 - vpsrldq \$6,$T0,$T2 # splat input - vpaddq $H2,$D1,$D1 # d1 += h2*s4 - vpmuludq $H4,$T4,$H4 # h4*s4 - vpsrldq \$6,$T1,$T3 # - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 - vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 - vpmuludq $H1,$T4,$H0 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - - #vpsrlq \$40,$T4,$T4 # 4 - vpsrldq \$`40/8`,$T4,$T4 # 4 - vpsrlq \$26,$T0,$T1 - vmovdqa 0x00(%rsp),$D4 # preload r0^2 - vpand $MASK,$T0,$T0 # 0 - vpsrlq \$4,$T3,$T2 - vpand $MASK,$T1,$T1 # 1 - vpand 0(%rcx),$T4,$T4 # .Lmask24 - vpsrlq \$30,$T3,$T3 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - ################################################################ - # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - # and P. 
Schwabe - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D0 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D0,$H0,$H0 - vpsllq \$2,$D0,$D0 - vpaddq $D0,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - ja .Loop_avx - -.Lskip_loop_avx: - ################################################################ - # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 - add \$32,$len - jnz .Long_tail_avx - - vpaddq $H2,$T2,$T2 - vpaddq $H0,$T0,$T0 - vpaddq $H1,$T1,$T1 - vpaddq $H3,$T3,$T3 - vpaddq $H4,$T4,$T4 - -.Long_tail_avx: - vmovdqa $H2,0x20(%r11) - vmovdqa $H0,0x00(%r11) - vmovdqa $H1,0x10(%r11) - vmovdqa $H3,0x30(%r11) - vmovdqa $H4,0x40(%r11) - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - vpmuludq $T2,$D4,$D2 # d2 = h2*r0 - vpmuludq $T0,$D4,$D0 # d0 = h0*r0 - vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n - vpmuludq $T1,$D4,$D1 # d1 = h1*r0 - vpmuludq $T3,$D4,$D3 # d3 = h3*r0 - vpmuludq $T4,$D4,$D4 # d4 = h4*r0 - - vpmuludq $T3,$H2,$H0 # h3*r1 - vpaddq $H0,$D4,$D4 # d4 += h3*r1 - vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n - vpmuludq $T2,$H2,$H1 # h2*r1 - vpaddq $H1,$D3,$D3 # d3 += h2*r1 - vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n - vpmuludq $T1,$H2,$H0 # h1*r1 - vpaddq $H0,$D2,$D2 # d2 += h1*r1 - vpmuludq $T0,$H2,$H2 # h0*r1 - vpaddq $H2,$D1,$D1 # d1 += h0*r1 - vpmuludq $T4,$H3,$H3 # h4*s1 - vpaddq $H3,$D0,$D0 # d0 += h4*s1 - - vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n - vpmuludq $T2,$H4,$H1 # h2*r2 - vpaddq $H1,$D4,$D4 # d4 += h2*r2 - vpmuludq $T1,$H4,$H0 # h1*r2 - vpaddq $H0,$D3,$D3 # d3 += h1*r2 - vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n - vpmuludq $T0,$H4,$H4 # h0*r2 - vpaddq $H4,$D2,$D2 # d2 += h0*r2 - vpmuludq $T4,$H2,$H1 # h4*s2 - vpaddq $H1,$D1,$D1 # d1 += h4*s2 - vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n - vpmuludq $T3,$H2,$H2 # h3*s2 - vpaddq $H2,$D0,$D0 # d0 += h3*s2 - - vpmuludq $T1,$H3,$H0 # h1*r3 - vpaddq $H0,$D4,$D4 # d4 += h1*r3 - vpmuludq $T0,$H3,$H3 # h0*r3 - vpaddq $H3,$D3,$D3 # d3 += h0*r3 - vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n - vpmuludq $T4,$H4,$H1 # h4*s3 - vpaddq $H1,$D2,$D2 # d2 += h4*s3 - vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n - vpmuludq $T3,$H4,$H0 # h3*s3 - vpaddq $H0,$D1,$D1 # d1 += h3*s3 - vpmuludq $T2,$H4,$H4 # h2*s3 - vpaddq $H4,$D0,$D0 # d0 += h2*s3 - - vpmuludq $T0,$H2,$H2 # h0*r4 - vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 - vpmuludq $T4,$H3,$H1 # h4*s4 - vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 - vpmuludq $T3,$H3,$H0 # h3*s4 - vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 - vpmuludq $T2,$H3,$H1 # h2*s4 - vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 - vpmuludq $T1,$H3,$H3 # h1*s4 - vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 - - jz .Lshort_tail_avx - - vmovdqu 16*0($inp),$H0 # load input - vmovdqu 16*1($inp),$H1 - - vpsrldq \$6,$H0,$H2 # splat input - vpsrldq \$6,$H1,$H3 - vpunpckhqdq $H1,$H0,$H4 # 4 - vpunpcklqdq $H1,$H0,$H0 # 0:1 - vpunpcklqdq $H3,$H2,$H3 # 2:3 - - vpsrlq \$40,$H4,$H4 # 4 - vpsrlq \$26,$H0,$H1 - vpand $MASK,$H0,$H0 # 0 - 
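
The interleaved vpsrlq/vpand/vpaddq sequence above is the "lazy reduction" the Bernstein-Schwabe comment refers to: the hash lives in five radix-2^26 limbs with slack in the upper bits, and a single sweep of carries suffices because the carry out of the top limb re-enters at the bottom multiplied by 5 (2^130 is congruent to 5 mod 2^130-5; the code forms c + 4c with vpsllq \$2). A minimal Python sketch of one multiply-plus-reduction, assuming big integers stand in for the vector lanes; all names here are illustrative, not the kernel's API:

    MASK26 = (1 << 26) - 1

    def mul_reduce(h, r):
        """One Poly1305 limb multiply in radix 2^26, then a lazy carry sweep."""
        s = [5 * x for x in r]                   # s[k] = 5*r[k], wraparound factor
        d = [0] * 5
        for i in range(5):
            for j in range(5):
                if i + j < 5:
                    d[i + j] += h[i] * r[j]
                else:
                    d[i + j - 5] += h[i] * s[j]  # 2^130 == 5 (mod 2^130 - 5)
        # carry sweep, in the same order as the assembly above
        d[4] += d[3] >> 26; d[3] &= MASK26       # h3 -> h4
        d[1] += d[0] >> 26; d[0] &= MASK26       # h0 -> h1
        c = d[4] >> 26;     d[4] &= MASK26
        d[2] += d[1] >> 26; d[1] &= MASK26       # h1 -> h2
        d[0] += 5 * c                            # h4 -> h0 (c + 4*c in the asm)
        d[3] += d[2] >> 26; d[2] &= MASK26       # h2 -> h3
        d[1] += d[0] >> 26; d[0] &= MASK26       # h0 -> h1
        d[4] += d[3] >> 26; d[3] &= MASK26       # h3 -> h4
        return d

The point of the single sweep is that limbs are allowed to end up slightly above 26 bits; they only need to stay small enough that the next round's 32x32-bit vpmuludq products, summed five at a time, still fit in 64 bits.
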
vpsrlq \$4,$H3,$H2 - vpand $MASK,$H1,$H1 # 1 - vpsrlq \$30,$H3,$H3 - vpand $MASK,$H2,$H2 # 2 - vpand $MASK,$H3,$H3 # 3 - vpor 32(%rcx),$H4,$H4 # padbit, yes, always - - vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 - vpaddq 0x00(%r11),$H0,$H0 - vpaddq 0x10(%r11),$H1,$H1 - vpaddq 0x20(%r11),$H2,$H2 - vpaddq 0x30(%r11),$H3,$H3 - vpaddq 0x40(%r11),$H4,$H4 - - ################################################################ - # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate - - vpmuludq $H0,$T4,$T0 # h0*r0 - vpaddq $T0,$D0,$D0 # d0 += h0*r0 - vpmuludq $H1,$T4,$T1 # h1*r0 - vpaddq $T1,$D1,$D1 # d1 += h1*r0 - vpmuludq $H2,$T4,$T0 # h2*r0 - vpaddq $T0,$D2,$D2 # d2 += h2*r0 - vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n - vpmuludq $H3,$T4,$T1 # h3*r0 - vpaddq $T1,$D3,$D3 # d3 += h3*r0 - vpmuludq $H4,$T4,$T4 # h4*r0 - vpaddq $T4,$D4,$D4 # d4 += h4*r0 - - vpmuludq $H3,$T2,$T0 # h3*r1 - vpaddq $T0,$D4,$D4 # d4 += h3*r1 - vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 - vpmuludq $H2,$T2,$T1 # h2*r1 - vpaddq $T1,$D3,$D3 # d3 += h2*r1 - vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 - vpmuludq $H1,$T2,$T0 # h1*r1 - vpaddq $T0,$D2,$D2 # d2 += h1*r1 - vpmuludq $H0,$T2,$T2 # h0*r1 - vpaddq $T2,$D1,$D1 # d1 += h0*r1 - vpmuludq $H4,$T3,$T3 # h4*s1 - vpaddq $T3,$D0,$D0 # d0 += h4*s1 - - vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 - vpmuludq $H2,$T4,$T1 # h2*r2 - vpaddq $T1,$D4,$D4 # d4 += h2*r2 - vpmuludq $H1,$T4,$T0 # h1*r2 - vpaddq $T0,$D3,$D3 # d3 += h1*r2 - vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 - vpmuludq $H0,$T4,$T4 # h0*r2 - vpaddq $T4,$D2,$D2 # d2 += h0*r2 - vpmuludq $H4,$T2,$T1 # h4*s2 - vpaddq $T1,$D1,$D1 # d1 += h4*s2 - vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 - vpmuludq $H3,$T2,$T2 # h3*s2 - vpaddq $T2,$D0,$D0 # d0 += h3*s2 - - vpmuludq $H1,$T3,$T0 # h1*r3 - vpaddq $T0,$D4,$D4 # d4 += h1*r3 - vpmuludq $H0,$T3,$T3 # h0*r3 - vpaddq $T3,$D3,$D3 # d3 += h0*r3 - vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 - vpmuludq $H4,$T4,$T1 # h4*s3 - vpaddq $T1,$D2,$D2 # d2 += h4*s3 - vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 - vpmuludq $H3,$T4,$T0 # h3*s3 - vpaddq $T0,$D1,$D1 # d1 += h3*s3 - vpmuludq $H2,$T4,$T4 # h2*s3 - vpaddq $T4,$D0,$D0 # d0 += h2*s3 - - vpmuludq $H0,$T2,$T2 # h0*r4 - vpaddq $T2,$D4,$D4 # d4 += h0*r4 - vpmuludq $H4,$T3,$T1 # h4*s4 - vpaddq $T1,$D3,$D3 # d3 += h4*s4 - vpmuludq $H3,$T3,$T0 # h3*s4 - vpaddq $T0,$D2,$D2 # d2 += h3*s4 - vpmuludq $H2,$T3,$T1 # h2*s4 - vpaddq $T1,$D1,$D1 # d1 += h2*s4 - vpmuludq $H1,$T3,$T3 # h1*s4 - vpaddq $T3,$D0,$D0 # d0 += h1*s4 - -.Lshort_tail_avx: - ################################################################ - # horizontal addition - - vpsrldq \$8,$D4,$T4 - vpsrldq \$8,$D3,$T3 - vpsrldq \$8,$D1,$T1 - vpsrldq \$8,$D0,$T0 - vpsrldq \$8,$D2,$T2 - vpaddq $T3,$D3,$D3 - vpaddq $T4,$D4,$D4 - vpaddq $T0,$D0,$D0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$D2,$D2 - - ################################################################ - # lazy reduction - - vpsrlq \$26,$D3,$H3 - vpand $MASK,$D3,$D3 - vpaddq $H3,$D4,$D4 # h3 -> h4 - - vpsrlq \$26,$D0,$H0 - vpand $MASK,$D0,$D0 - vpaddq $H0,$D1,$D1 # h0 -> h1 - - vpsrlq \$26,$D4,$H4 - vpand $MASK,$D4,$D4 - - vpsrlq \$26,$D1,$H1 - vpand $MASK,$D1,$D1 - vpaddq $H1,$D2,$D2 # h1 -> h2 - - vpaddq $H4,$D0,$D0 - vpsllq \$2,$H4,$H4 - vpaddq $H4,$D0,$D0 # h4 -> h0 - - vpsrlq \$26,$D2,$H2 - vpand $MASK,$D2,$D2 - vpaddq $H2,$D3,$D3 # h2 -> h3 - - vpsrlq \$26,$D0,$H0 - vpand $MASK,$D0,$D0 - vpaddq $H0,$D1,$D1 # h0 -> h1 - - vpsrlq \$26,$D3,$H3 - vpand $MASK,$D3,$D3 - vpaddq $H3,$D4,$D4 # h3 -> h4 - - vmovd $D0,`4*0-48-64`($ctx) # save partially reduced - 
vmovd $D1,`4*1-48-64`($ctx) - vmovd $D2,`4*2-48-64`($ctx) - vmovd $D3,`4*3-48-64`($ctx) - vmovd $D4,`4*4-48-64`($ctx) -___ -$code.=<<___ if ($win64); - vmovdqa 0x50(%r11),%xmm6 - vmovdqa 0x60(%r11),%xmm7 - vmovdqa 0x70(%r11),%xmm8 - vmovdqa 0x80(%r11),%xmm9 - vmovdqa 0x90(%r11),%xmm10 - vmovdqa 0xa0(%r11),%xmm11 - vmovdqa 0xb0(%r11),%xmm12 - vmovdqa 0xc0(%r11),%xmm13 - vmovdqa 0xd0(%r11),%xmm14 - vmovdqa 0xe0(%r11),%xmm15 - lea 0xf8(%r11),%rsp -.Ldo_avx_epilogue: -___ -$code.=<<___ if (!$win64); - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -___ -$code.=<<___; - vzeroupper - RET -.cfi_endproc -___ -&end_function("poly1305_blocks_avx"); - -&declare_function("poly1305_emit_avx", 32, 3); -$code.=<<___; - cmpl \$0,20($ctx) # is_base2_26? - je .Lemit - - mov 0($ctx),%eax # load hash value base 2^26 - mov 4($ctx),%ecx - mov 8($ctx),%r8d - mov 12($ctx),%r11d - mov 16($ctx),%r10d - - shl \$26,%rcx # base 2^26 -> base 2^64 - mov %r8,%r9 - shl \$52,%r8 - add %rcx,%rax - shr \$12,%r9 - add %rax,%r8 # h0 - adc \$0,%r9 - - shl \$14,%r11 - mov %r10,%rax - shr \$24,%r10 - add %r11,%r9 - shl \$40,%rax - add %rax,%r9 # h1 - adc \$0,%r10 # h2 - - mov %r10,%rax # could be partially reduced, so reduce - mov %r10,%rcx - and \$3,%r10 - shr \$2,%rax - and \$-4,%rcx - add %rcx,%rax - add %rax,%r8 - adc \$0,%r9 - adc \$0,%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overflow? - cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - RET -___ -&end_function("poly1305_emit_avx"); - -if ($avx>1) { - -my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = - map("%ymm$_",(0..15)); -my $S4=$MASK; - -sub poly1305_blocks_avxN { - my ($avx512) = @_; - my $suffix = $avx512 ? "_avx512" : ""; -$code.=<<___; -.cfi_startproc - mov 20($ctx),%r8d # is_base2_26 - cmp \$128,$len - jae .Lblocks_avx2$suffix - test %r8d,%r8d - jz .Lblocks - -.Lblocks_avx2$suffix: - and \$-16,$len - jz .Lno_data_avx2$suffix - - vzeroupper - - test %r8d,%r8d - jz .Lbase2_64_avx2$suffix - - test \$63,$len - jz .Leven_avx2$suffix - - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lblocks_avx2_body$suffix: - - mov $len,%r15 # reassign $len - - mov 0($ctx),$d1 # load hash value - mov 8($ctx),$d2 - mov 16($ctx),$h2#d - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - ################################# base 2^26 -> base 2^64 - mov $d1#d,$h0#d - and \$`-1*(1<<31)`,$d1 - mov $d2,$r1 # borrow $r1 - mov $d2#d,$h1#d - and \$`-1*(1<<31)`,$d2 - - shr \$6,$d1 - shl \$52,$r1 - add $d1,$h0 - shr \$12,$h1 - shr \$18,$d2 - add $r1,$h0 - adc $d2,$h1 - - mov $h2,$d1 - shl \$40,$d1 - shr \$24,$h2 - add $d1,$h1 - adc \$0,$h2 # can be partially reduced... - - mov \$-4,$d2 # ... 
so reduce - mov $h2,$d1 - and $h2,$d2 - shr \$2,$d1 - and \$3,$h2 - add $d2,$d1 # =*5 - add $d1,$h0 - adc \$0,$h1 - adc \$0,$h2 - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - -.Lbase2_26_pre_avx2$suffix: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - mov $r1,%rax - - test \$63,%r15 - jnz .Lbase2_26_pre_avx2$suffix - - test $padbit,$padbit # if $padbit is zero, - jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format - - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$r0 - mov $h1,$r1 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$r0 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $r0,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$r1 - and \$0x3ffffff,$h1 # h[3] - or $r1,$h2 # h[4] - - test %r15,%r15 - jz .Lstore_base2_26_avx2$suffix - - vmovd %rax#d,%x#$H0 - vmovd %rdx#d,%x#$H1 - vmovd $h0#d,%x#$H2 - vmovd $h1#d,%x#$H3 - vmovd $h2#d,%x#$H4 - jmp .Lproceed_avx2$suffix - -.align 32 -.Lstore_base2_64_avx2$suffix: - mov $h0,0($ctx) - mov $h1,8($ctx) - mov $h2,16($ctx) # note that is_base2_26 is zeroed - jmp .Ldone_avx2$suffix - -.align 16 -.Lstore_base2_26_avx2$suffix: - mov %rax#d,0($ctx) # store hash value base 2^26 - mov %rdx#d,4($ctx) - mov $h0#d,8($ctx) - mov $h1#d,12($ctx) - mov $h2#d,16($ctx) -.align 16 -.Ldone_avx2$suffix: - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lno_data_avx2$suffix: -.Lblocks_avx2_epilogue$suffix: - RET -.cfi_endproc - -.align 32 -.Lbase2_64_avx2$suffix: -.cfi_startproc - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lbase2_64_avx2_body$suffix: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2#d - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - test \$63,$len - jz .Linit_avx2$suffix - -.Lbase2_64_pre_avx2$suffix: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - mov $r1,%rax - - test \$63,%r15 - jnz .Lbase2_64_pre_avx2$suffix - -.Linit_avx2$suffix: - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$d1 - mov $h1,$d2 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$d1 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $d1,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$d2 - and \$0x3ffffff,$h1 # h[3] - or $d2,$h2 # h[4] - - vmovd %rax#d,%x#$H0 - vmovd %rdx#d,%x#$H1 - vmovd $h0#d,%x#$H2 - vmovd $h1#d,%x#$H3 - vmovd $h2#d,%x#$H4 - movl \$1,20($ctx) # set is_base2_26 - - call __poly1305_init_avx - -.Lproceed_avx2$suffix: - mov %r15,$len # restore $len -___ -$code.=<<___ if (!$kernel); - mov OPENSSL_ia32cap_P+8(%rip),%r9d - mov \$`(1<<31|1<<30|1<<16)`,%r11d -___ -$code.=<<___; - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lbase2_64_avx2_epilogue$suffix: - jmp .Ldo_avx2$suffix -.cfi_endproc - -.align 32 -.Leven_avx2$suffix: 
-.cfi_startproc -___ -$code.=<<___ if (!$kernel); - mov OPENSSL_ia32cap_P+8(%rip),%r9d -___ -$code.=<<___; - vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 - vmovd 4*1($ctx),%x#$H1 - vmovd 4*2($ctx),%x#$H2 - vmovd 4*3($ctx),%x#$H3 - vmovd 4*4($ctx),%x#$H4 - -.Ldo_avx2$suffix: -___ -$code.=<<___ if (!$kernel && $avx>2); - cmp \$512,$len - jb .Lskip_avx512 - and %r11d,%r9d - test \$`1<<16`,%r9d # check for AVX512F - jnz .Lblocks_avx512 -.Lskip_avx512$suffix: -___ -$code.=<<___ if ($avx > 2 && $avx512 && $kernel); - cmp \$512,$len - jae .Lblocks_avx512 -___ -$code.=<<___ if (!$win64); - lea 8(%rsp),%r10 -.cfi_def_cfa_register %r10 - sub \$0x128,%rsp -___ -$code.=<<___ if ($win64); - lea 8(%rsp),%r10 - sub \$0x1c8,%rsp - vmovdqa %xmm6,-0xb0(%r10) - vmovdqa %xmm7,-0xa0(%r10) - vmovdqa %xmm8,-0x90(%r10) - vmovdqa %xmm9,-0x80(%r10) - vmovdqa %xmm10,-0x70(%r10) - vmovdqa %xmm11,-0x60(%r10) - vmovdqa %xmm12,-0x50(%r10) - vmovdqa %xmm13,-0x40(%r10) - vmovdqa %xmm14,-0x30(%r10) - vmovdqa %xmm15,-0x20(%r10) -.Ldo_avx2_body$suffix: -___ -$code.=<<___; - lea .Lconst(%rip),%rcx - lea 48+64($ctx),$ctx # size optimization - vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 - - # expand and copy pre-calculated table to stack - vmovdqu `16*0-64`($ctx),%x#$T2 - and \$-512,%rsp - vmovdqu `16*1-64`($ctx),%x#$T3 - vmovdqu `16*2-64`($ctx),%x#$T4 - vmovdqu `16*3-64`($ctx),%x#$D0 - vmovdqu `16*4-64`($ctx),%x#$D1 - vmovdqu `16*5-64`($ctx),%x#$D2 - lea 0x90(%rsp),%rax # size optimization - vmovdqu `16*6-64`($ctx),%x#$D3 - vpermd $T2,$T0,$T2 # 00003412 -> 14243444 - vmovdqu `16*7-64`($ctx),%x#$D4 - vpermd $T3,$T0,$T3 - vmovdqu `16*8-64`($ctx),%x#$MASK - vpermd $T4,$T0,$T4 - vmovdqa $T2,0x00(%rsp) - vpermd $D0,$T0,$D0 - vmovdqa $T3,0x20-0x90(%rax) - vpermd $D1,$T0,$D1 - vmovdqa $T4,0x40-0x90(%rax) - vpermd $D2,$T0,$D2 - vmovdqa $D0,0x60-0x90(%rax) - vpermd $D3,$T0,$D3 - vmovdqa $D1,0x80-0x90(%rax) - vpermd $D4,$T0,$D4 - vmovdqa $D2,0xa0-0x90(%rax) - vpermd $MASK,$T0,$MASK - vmovdqa $D3,0xc0-0x90(%rax) - vmovdqa $D4,0xe0-0x90(%rax) - vmovdqa $MASK,0x100-0x90(%rax) - vmovdqa 64(%rcx),$MASK # .Lmask26 - - ################################################################ - # load input - vmovdqu 16*0($inp),%x#$T0 - vmovdqu 16*1($inp),%x#$T1 - vinserti128 \$1,16*2($inp),$T0,$T0 - vinserti128 \$1,16*3($inp),$T1,$T1 - lea 16*4($inp),$inp - - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpunpcklqdq $T3,$T2,$T2 # 2:3 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - - vpsrlq \$30,$T2,$T3 - vpsrlq \$4,$T2,$T2 - vpsrlq \$26,$T0,$T1 - vpsrlq \$40,$T4,$T4 # 4 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T0,$T0 # 0 - vpand $MASK,$T1,$T1 # 1 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - vpaddq $H2,$T2,$H2 # accumulate input - sub \$64,$len - jz .Ltail_avx2$suffix - jmp .Loop_avx2$suffix - -.align 32 -.Loop_avx2$suffix: - ################################################################ - # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 - # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 - # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 - # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 - # \________/\__________/ - ################################################################ - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - vmovdqa `32*0`(%rsp),$T0 # r0^4 - vpaddq $H1,$T1,$H1 - vmovdqa `32*1`(%rsp),$T1 # r1^4 - vpaddq $H3,$T3,$H3 - vmovdqa `32*3`(%rsp),$T2 # r2^4 - vpaddq $H4,$T4,$H4 - vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 - vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # 
d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # however, as h2 is "chronologically" first one available pull - # corresponding operations up, so it's - # - # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 - # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 - # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 - - vpmuludq $H2,$T0,$D2 # d2 = h2*r0 - vpmuludq $H2,$T1,$D3 # d3 = h2*r1 - vpmuludq $H2,$T2,$D4 # d4 = h2*r2 - vpmuludq $H2,$T3,$D0 # d0 = h2*s3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - - vpmuludq $H0,$T1,$T4 # h0*r1 - vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp - vpaddq $T4,$D1,$D1 # d1 += h0*r1 - vpaddq $H2,$D2,$D2 # d2 += h1*r1 - vpmuludq $H3,$T1,$T4 # h3*r1 - vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 - vpaddq $T4,$D4,$D4 # d4 += h3*r1 - vpaddq $H2,$D0,$D0 # d0 += h4*s1 - vmovdqa `32*4-0x90`(%rax),$T1 # s2 - - vpmuludq $H0,$T0,$T4 # h0*r0 - vpmuludq $H1,$T0,$H2 # h1*r0 - vpaddq $T4,$D0,$D0 # d0 += h0*r0 - vpaddq $H2,$D1,$D1 # d1 += h1*r0 - vpmuludq $H3,$T0,$T4 # h3*r0 - vpmuludq $H4,$T0,$H2 # h4*r0 - vmovdqu 16*0($inp),%x#$T0 # load input - vpaddq $T4,$D3,$D3 # d3 += h3*r0 - vpaddq $H2,$D4,$D4 # d4 += h4*r0 - vinserti128 \$1,16*2($inp),$T0,$T0 - - vpmuludq $H3,$T1,$T4 # h3*s2 - vpmuludq $H4,$T1,$H2 # h4*s2 - vmovdqu 16*1($inp),%x#$T1 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - vpaddq $H2,$D1,$D1 # d1 += h4*s2 - vmovdqa `32*5-0x90`(%rax),$H2 # r3 - vpmuludq $H1,$T2,$T4 # h1*r2 - vpmuludq $H0,$T2,$T2 # h0*r2 - vpaddq $T4,$D3,$D3 # d3 += h1*r2 - vpaddq $T2,$D2,$D2 # d2 += h0*r2 - vinserti128 \$1,16*3($inp),$T1,$T1 - lea 16*4($inp),$inp - - vpmuludq $H1,$H2,$T4 # h1*r3 - vpmuludq $H0,$H2,$H2 # h0*r3 - vpsrldq \$6,$T0,$T2 # splat input - vpaddq $T4,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $H3,$T3,$T4 # h3*s3 - vpmuludq $H4,$T3,$H2 # h4*s3 - vpsrldq \$6,$T1,$T3 - vpaddq $T4,$D1,$D1 # d1 += h3*s3 - vpaddq $H2,$D2,$D2 # d2 += h4*s3 - vpunpckhqdq $T1,$T0,$T4 # 4 - - vpmuludq $H3,$S4,$H3 # h3*s4 - vpmuludq $H4,$S4,$H4 # h4*s4 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 - vpmuludq $H1,$S4,$H0 # h1*s4 - vmovdqa 64(%rcx),$MASK # .Lmask26 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - ################################################################ - # lazy reduction (interleaved with tail of input splat) - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$4,$T3,$T2 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpand $MASK,$T2,$T2 # 2 - vpsrlq \$26,$T0,$T1 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpaddq $T2,$H2,$H2 # modulo-scheduled - vpsrlq \$30,$T3,$T3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$40,$T4,$T4 # 4 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpand $MASK,$T0,$T0 # 0 - vpand $MASK,$T1,$T1 # 1 - vpand $MASK,$T3,$T3 # 3 - vpor 
32(%rcx),$T4,$T4 # padbit, yes, always - - sub \$64,$len - jnz .Loop_avx2$suffix - - .byte 0x66,0x90 -.Ltail_avx2$suffix: - ################################################################ - # while above multiplications were by r^4 in all lanes, in last - # iteration we multiply least significant lane by r^4 and most - # significant one by r, so copy of above except that references - # to the precomputed table are displaced by 4... - - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - vmovdqu `32*0+4`(%rsp),$T0 # r0^4 - vpaddq $H1,$T1,$H1 - vmovdqu `32*1+4`(%rsp),$T1 # r1^4 - vpaddq $H3,$T3,$H3 - vmovdqu `32*3+4`(%rsp),$T2 # r2^4 - vpaddq $H4,$T4,$H4 - vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 - vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 - - vpmuludq $H2,$T0,$D2 # d2 = h2*r0 - vpmuludq $H2,$T1,$D3 # d3 = h2*r1 - vpmuludq $H2,$T2,$D4 # d4 = h2*r2 - vpmuludq $H2,$T3,$D0 # d0 = h2*s3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - - vpmuludq $H0,$T1,$T4 # h0*r1 - vpmuludq $H1,$T1,$H2 # h1*r1 - vpaddq $T4,$D1,$D1 # d1 += h0*r1 - vpaddq $H2,$D2,$D2 # d2 += h1*r1 - vpmuludq $H3,$T1,$T4 # h3*r1 - vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 - vpaddq $T4,$D4,$D4 # d4 += h3*r1 - vpaddq $H2,$D0,$D0 # d0 += h4*s1 - - vpmuludq $H0,$T0,$T4 # h0*r0 - vpmuludq $H1,$T0,$H2 # h1*r0 - vpaddq $T4,$D0,$D0 # d0 += h0*r0 - vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 - vpaddq $H2,$D1,$D1 # d1 += h1*r0 - vpmuludq $H3,$T0,$T4 # h3*r0 - vpmuludq $H4,$T0,$H2 # h4*r0 - vpaddq $T4,$D3,$D3 # d3 += h3*r0 - vpaddq $H2,$D4,$D4 # d4 += h4*r0 - - vpmuludq $H3,$T1,$T4 # h3*s2 - vpmuludq $H4,$T1,$H2 # h4*s2 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - vpaddq $H2,$D1,$D1 # d1 += h4*s2 - vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 - vpmuludq $H1,$T2,$T4 # h1*r2 - vpmuludq $H0,$T2,$T2 # h0*r2 - vpaddq $T4,$D3,$D3 # d3 += h1*r2 - vpaddq $T2,$D2,$D2 # d2 += h0*r2 - - vpmuludq $H1,$H2,$T4 # h1*r3 - vpmuludq $H0,$H2,$H2 # h0*r3 - vpaddq $T4,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $H3,$T3,$T4 # h3*s3 - vpmuludq $H4,$T3,$H2 # h4*s3 - vpaddq $T4,$D1,$D1 # d1 += h3*s3 - vpaddq $H2,$D2,$D2 # d2 += h4*s3 - - vpmuludq $H3,$S4,$H3 # h3*s4 - vpmuludq $H4,$S4,$H4 # h4*s4 - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 - vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 - vpmuludq $H1,$S4,$H0 # h1*s4 - vmovdqa 64(%rcx),$MASK # .Lmask26 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - ################################################################ - # horizontal addition - - vpsrldq \$8,$D1,$T1 - vpsrldq \$8,$H2,$T2 - vpsrldq \$8,$H3,$T3 - vpsrldq \$8,$H4,$T4 - vpsrldq \$8,$H0,$T0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$H2,$H2 - vpaddq $T3,$H3,$H3 - vpaddq $T4,$H4,$H4 - vpaddq $T0,$H0,$H0 - - vpermq \$0x2,$H3,$T3 - vpermq \$0x2,$H4,$T4 - vpermq \$0x2,$H0,$T0 - vpermq \$0x2,$D1,$T1 - vpermq \$0x2,$H2,$T2 - vpaddq $T3,$H3,$H3 - vpaddq $T4,$H4,$H4 - vpaddq $T0,$H0,$H0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$H2,$H2 - - ################################################################ - # lazy reduction - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 
- - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced - vmovd %x#$H1,`4*1-48-64`($ctx) - vmovd %x#$H2,`4*2-48-64`($ctx) - vmovd %x#$H3,`4*3-48-64`($ctx) - vmovd %x#$H4,`4*4-48-64`($ctx) -___ -$code.=<<___ if ($win64); - vmovdqa -0xb0(%r10),%xmm6 - vmovdqa -0xa0(%r10),%xmm7 - vmovdqa -0x90(%r10),%xmm8 - vmovdqa -0x80(%r10),%xmm9 - vmovdqa -0x70(%r10),%xmm10 - vmovdqa -0x60(%r10),%xmm11 - vmovdqa -0x50(%r10),%xmm12 - vmovdqa -0x40(%r10),%xmm13 - vmovdqa -0x30(%r10),%xmm14 - vmovdqa -0x20(%r10),%xmm15 - lea -8(%r10),%rsp -.Ldo_avx2_epilogue$suffix: -___ -$code.=<<___ if (!$win64); - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -___ -$code.=<<___; - vzeroupper - RET -.cfi_endproc -___ -if($avx > 2 && $avx512) { -my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); -my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); -my $PADBIT="%zmm30"; - -map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain -map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); -map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); -map(s/%y/%z/,($MASK)); - -$code.=<<___; -.cfi_startproc -.Lblocks_avx512: - mov \$15,%eax - kmovw %eax,%k2 -___ -$code.=<<___ if (!$win64); - lea 8(%rsp),%r10 -.cfi_def_cfa_register %r10 - sub \$0x128,%rsp -___ -$code.=<<___ if ($win64); - lea 8(%rsp),%r10 - sub \$0x1c8,%rsp - vmovdqa %xmm6,-0xb0(%r10) - vmovdqa %xmm7,-0xa0(%r10) - vmovdqa %xmm8,-0x90(%r10) - vmovdqa %xmm9,-0x80(%r10) - vmovdqa %xmm10,-0x70(%r10) - vmovdqa %xmm11,-0x60(%r10) - vmovdqa %xmm12,-0x50(%r10) - vmovdqa %xmm13,-0x40(%r10) - vmovdqa %xmm14,-0x30(%r10) - vmovdqa %xmm15,-0x20(%r10) -.Ldo_avx512_body: -___ -$code.=<<___; - lea .Lconst(%rip),%rcx - lea 48+64($ctx),$ctx # size optimization - vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 - - # expand pre-calculated table - vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} - and \$-512,%rsp - vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} - mov \$0x20,%rax - vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} - vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} - vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} - vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} - vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} - vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} - vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4} - vpermd $D0,$T2,$R0 # 00003412 -> 14243444 - vpbroadcastq 64(%rcx),$MASK # .Lmask26 - vpermd $D1,$T2,$R1 - vpermd $T0,$T2,$S1 - vpermd $D2,$T2,$R2 - vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 - vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 - vpermd $T1,$T2,$S2 - vmovdqu64 $R1,0x00(%rsp,%rax){%k2} - vpsrlq \$32,$R1,$T1 - vpermd $D3,$T2,$R3 - vmovdqa64 $S1,0x40(%rsp){%k2} - vpermd $T3,$T2,$S3 - vpermd $D4,$T2,$R4 - vmovdqu64 $R2,0x40(%rsp,%rax){%k2} - vpermd $T4,$T2,$S4 - vmovdqa64 $S2,0x80(%rsp){%k2} - vmovdqu64 $R3,0x80(%rsp,%rax){%k2} - vmovdqa64 $S3,0xc0(%rsp){%k2} - vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} - vmovdqa64 $S4,0x100(%rsp){%k2} - - ################################################################ - # calculate 5th through 8th powers of the key - # - # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 - # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 - # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 - # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 - # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 - - vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 - vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 - vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 - vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 - vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 - vpsrlq \$32,$R2,$T2 - - vpmuludq $T1,$S4,$M0 - vpmuludq $T1,$R0,$M1 - vpmuludq $T1,$R1,$M2 - vpmuludq $T1,$R2,$M3 - vpmuludq $T1,$R3,$M4 - vpsrlq \$32,$R3,$T3 - vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 - vpaddq $M1,$D1,$D1 # d1 += r1'*r0 - vpaddq $M2,$D2,$D2 # d2 += r1'*r1 - vpaddq $M3,$D3,$D3 # d3 += r1'*r2 - vpaddq $M4,$D4,$D4 # d4 += r1'*r3 - - vpmuludq $T2,$S3,$M0 - vpmuludq $T2,$S4,$M1 - vpmuludq $T2,$R1,$M3 - vpmuludq $T2,$R2,$M4 - vpmuludq $T2,$R0,$M2 - vpsrlq \$32,$R4,$T4 - vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 - vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 - vpaddq $M3,$D3,$D3 # d3 += r2'*r1 - vpaddq $M4,$D4,$D4 # d4 += r2'*r2 - vpaddq $M2,$D2,$D2 # d2 += r2'*r0 - - vpmuludq $T3,$S2,$M0 - vpmuludq $T3,$R0,$M3 - vpmuludq $T3,$R1,$M4 - vpmuludq $T3,$S3,$M1 - vpmuludq $T3,$S4,$M2 - vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 - vpaddq $M3,$D3,$D3 # d3 += r3'*r0 - vpaddq $M4,$D4,$D4 # d4 += r3'*r1 - vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 - vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 - - vpmuludq $T4,$S4,$M3 - vpmuludq $T4,$R0,$M4 - vpmuludq $T4,$S1,$M0 - vpmuludq $T4,$S2,$M1 - vpmuludq $T4,$S3,$M2 - vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 - vpaddq $M4,$D4,$D4 # d4 += r2'*r0 - vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 - vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 - vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 - - ################################################################ - # load input - vmovdqu64 16*0($inp),%z#$T3 - vmovdqu64 16*4($inp),%z#$T4 - lea 16*8($inp),$inp - - ################################################################ - # lazy reduction - - vpsrlq \$26,$D3,$M3 - vpandq $MASK,$D3,$D3 - vpaddq $M3,$D4,$D4 # d3 -> d4 - - vpsrlq \$26,$D0,$M0 - vpandq $MASK,$D0,$D0 - vpaddq $M0,$D1,$D1 # d0 -> d1 - - vpsrlq \$26,$D4,$M4 - vpandq $MASK,$D4,$D4 - - vpsrlq \$26,$D1,$M1 - vpandq $MASK,$D1,$D1 - vpaddq $M1,$D2,$D2 # d1 -> d2 - - vpaddq $M4,$D0,$D0 - vpsllq \$2,$M4,$M4 - vpaddq $M4,$D0,$D0 # d4 -> d0 - - vpsrlq \$26,$D2,$M2 - vpandq $MASK,$D2,$D2 - vpaddq $M2,$D3,$D3 # d2 -> d3 - - vpsrlq \$26,$D0,$M0 - vpandq $MASK,$D0,$D0 - vpaddq $M0,$D1,$D1 # d0 -> d1 - - vpsrlq \$26,$D3,$M3 - vpandq $MASK,$D3,$D3 - vpaddq $M3,$D4,$D4 # d3 -> d4 - - ################################################################ - # at this point we have 14243444 in $R0-$S4 and 05060708 in - # $D0-$D4, ... 
- - vpunpcklqdq $T4,$T3,$T0 # transpose input - vpunpckhqdq $T4,$T3,$T4 - - # ... since input 64-bit lanes are ordered as 73625140, we could - # "vperm" it to 76543210 (here and in each loop iteration), *or* - # we could just flow along, hence the goal for $R0-$S4 is - # 1858286838784888 ... - - vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: - mov \$0x7777,%eax - kmovw %eax,%k1 - - vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- - vpermd $R1,$M0,$R1 - vpermd $R2,$M0,$R2 - vpermd $R3,$M0,$R3 - vpermd $R4,$M0,$R4 - - vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 - vpermd $D1,$M0,${R1}{%k1} - vpermd $D2,$M0,${R2}{%k1} - vpermd $D3,$M0,${R3}{%k1} - vpermd $D4,$M0,${R4}{%k1} - - vpslld \$2,$R1,$S1 # *5 - vpslld \$2,$R2,$S2 - vpslld \$2,$R3,$S3 - vpslld \$2,$R4,$S4 - vpaddd $R1,$S1,$S1 - vpaddd $R2,$S2,$S2 - vpaddd $R3,$S3,$S3 - vpaddd $R4,$S4,$S4 - - vpbroadcastq 32(%rcx),$PADBIT # .L129 - - vpsrlq \$52,$T0,$T2 # splat input - vpsllq \$12,$T4,$T3 - vporq $T3,$T2,$T2 - vpsrlq \$26,$T0,$T1 - vpsrlq \$14,$T4,$T3 - vpsrlq \$40,$T4,$T4 # 4 - vpandq $MASK,$T2,$T2 # 2 - vpandq $MASK,$T0,$T0 # 0 - #vpandq $MASK,$T1,$T1 # 1 - #vpandq $MASK,$T3,$T3 # 3 - #vporq $PADBIT,$T4,$T4 # padbit, yes, always - - vpaddq $H2,$T2,$H2 # accumulate input - sub \$192,$len - jbe .Ltail_avx512 - jmp .Loop_avx512 - -.align 32 -.Loop_avx512: - ################################################################ - # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 - # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 - # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 - # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 - # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 - # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 - # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 - # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 - # \________/\___________/ - ################################################################ - #vpaddq $H2,$T2,$H2 # accumulate input - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # however, as h2 is "chronologically" first one available pull - # corresponding operations up, so it's - # - # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 - # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 - # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 - # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 - # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 - - vpmuludq $H2,$R1,$D3 # d3 = h2*r1 - vpaddq $H0,$T0,$H0 - vpmuludq $H2,$R2,$D4 # d4 = h2*r2 - vpandq $MASK,$T1,$T1 # 1 - vpmuludq $H2,$S3,$D0 # d0 = h2*s3 - vpandq $MASK,$T3,$T3 # 3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - vporq $PADBIT,$T4,$T4 # padbit, yes, always - vpmuludq $H2,$R0,$D2 # d2 = h2*r0 - vpaddq $H1,$T1,$H1 # accumulate input - vpaddq $H3,$T3,$H3 - vpaddq $H4,$T4,$H4 - - vmovdqu64 16*0($inp),$T3 # load input - vmovdqu64 16*4($inp),$T4 - lea 16*8($inp),$inp - vpmuludq $H0,$R3,$M3 - vpmuludq $H0,$R4,$M4 - vpmuludq $H0,$R0,$M0 - vpmuludq $H0,$R1,$M1 - vpaddq $M3,$D3,$D3 # d3 += h0*r3 - vpaddq $M4,$D4,$D4 # d4 += h0*r4 - vpaddq $M0,$D0,$D0 # d0 += h0*r0 - vpaddq $M1,$D1,$D1 # d1 += h0*r1 - - vpmuludq $H1,$R2,$M3 - vpmuludq $H1,$R3,$M4 - vpmuludq $H1,$S4,$M0 - vpmuludq $H0,$R2,$M2 - vpaddq $M3,$D3,$D3 # d3 += h1*r2 - vpaddq $M4,$D4,$D4 # d4 += h1*r3 - vpaddq $M0,$D0,$D0 # d0 += h1*s4 - vpaddq $M2,$D2,$D2 # d2 += h0*r2 - - vpunpcklqdq $T4,$T3,$T0 # transpose input - vpunpckhqdq $T4,$T3,$T4 - - vpmuludq $H3,$R0,$M3 - 
vpmuludq $H3,$R1,$M4 - vpmuludq $H1,$R0,$M1 - vpmuludq $H1,$R1,$M2 - vpaddq $M3,$D3,$D3 # d3 += h3*r0 - vpaddq $M4,$D4,$D4 # d4 += h3*r1 - vpaddq $M1,$D1,$D1 # d1 += h1*r0 - vpaddq $M2,$D2,$D2 # d2 += h1*r1 - - vpmuludq $H4,$S4,$M3 - vpmuludq $H4,$R0,$M4 - vpmuludq $H3,$S2,$M0 - vpmuludq $H3,$S3,$M1 - vpaddq $M3,$D3,$D3 # d3 += h4*s4 - vpmuludq $H3,$S4,$M2 - vpaddq $M4,$D4,$D4 # d4 += h4*r0 - vpaddq $M0,$D0,$D0 # d0 += h3*s2 - vpaddq $M1,$D1,$D1 # d1 += h3*s3 - vpaddq $M2,$D2,$D2 # d2 += h3*s4 - - vpmuludq $H4,$S1,$M0 - vpmuludq $H4,$S2,$M1 - vpmuludq $H4,$S3,$M2 - vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 - vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 - vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 - - ################################################################ - # lazy reduction (interleaved with input splat) - - vpsrlq \$52,$T0,$T2 # splat input - vpsllq \$12,$T4,$T3 - - vpsrlq \$26,$D3,$H3 - vpandq $MASK,$D3,$D3 - vpaddq $H3,$D4,$H4 # h3 -> h4 - - vporq $T3,$T2,$T2 - - vpsrlq \$26,$H0,$D0 - vpandq $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpandq $MASK,$T2,$T2 # 2 - - vpsrlq \$26,$H4,$D4 - vpandq $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpandq $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpaddq $T2,$H2,$H2 # modulo-scheduled - vpsrlq \$26,$T0,$T1 - - vpsrlq \$26,$H2,$D2 - vpandq $MASK,$H2,$H2 - vpaddq $D2,$D3,$H3 # h2 -> h3 - - vpsrlq \$14,$T4,$T3 - - vpsrlq \$26,$H0,$D0 - vpandq $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$40,$T4,$T4 # 4 - - vpsrlq \$26,$H3,$D3 - vpandq $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpandq $MASK,$T0,$T0 # 0 - #vpandq $MASK,$T1,$T1 # 1 - #vpandq $MASK,$T3,$T3 # 3 - #vporq $PADBIT,$T4,$T4 # padbit, yes, always - - sub \$128,$len - ja .Loop_avx512 - -.Ltail_avx512: - ################################################################ - # while above multiplications were by r^8 in all lanes, in last - # iteration we multiply least significant lane by r^8 and most - # significant one by r, that's why table gets shifted... 
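
Stripped of the vectorization, the scheme this tail comment describes is ordinary Horner evaluation split across lanes: the main loop advances every lane by r^8, and at the end lane i still owes a factor of r^(8-i) before the horizontal sum. A small Python model of the idea, assuming exact big-integer arithmetic, block values that already include the pad bit, and a block count that is a multiple of the lane count (4 lanes for brevity; the function and names are illustrative):

    P = (1 << 130) - 5

    def poly1305_parallel(blocks, r, lanes=4):
        """Evaluate sum(blocks[j] * r^(n-j)) mod P with lanes-way Horner."""
        assert len(blocks) % lanes == 0
        h = [0] * lanes
        r_l = pow(r, lanes, P)                 # every lane advances by r^lanes
        for i in range(0, len(blocks) - lanes, lanes):
            h = [(h[k] + blocks[i + k]) * r_l % P for k in range(lanes)]
        last = blocks[len(blocks) - lanes:]
        # tail: lane k is multiplied by r^(lanes-k), hence the shifted table
        h = [(h[k] + last[k]) * pow(r, lanes - k, P) % P for k in range(lanes)]
        return sum(h) % P                      # the "horizontal addition"

Collapsing lane k's contribution gives blocks[j]*r^(n-j) for every block index j, the same value the one-block-at-a-time scalar code computes.
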
- - vpsrlq \$32,$R0,$R0 # 0105020603070408 - vpsrlq \$32,$R1,$R1 - vpsrlq \$32,$R2,$R2 - vpsrlq \$32,$S3,$S3 - vpsrlq \$32,$S4,$S4 - vpsrlq \$32,$R3,$R3 - vpsrlq \$32,$R4,$R4 - vpsrlq \$32,$S1,$S1 - vpsrlq \$32,$S2,$S2 - - ################################################################ - # load either next or last 64 byte of input - lea ($inp,$len),$inp - - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - - vpmuludq $H2,$R1,$D3 # d3 = h2*r1 - vpmuludq $H2,$R2,$D4 # d4 = h2*r2 - vpmuludq $H2,$S3,$D0 # d0 = h2*s3 - vpandq $MASK,$T1,$T1 # 1 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - vpandq $MASK,$T3,$T3 # 3 - vpmuludq $H2,$R0,$D2 # d2 = h2*r0 - vporq $PADBIT,$T4,$T4 # padbit, yes, always - vpaddq $H1,$T1,$H1 # accumulate input - vpaddq $H3,$T3,$H3 - vpaddq $H4,$T4,$H4 - - vmovdqu 16*0($inp),%x#$T0 - vpmuludq $H0,$R3,$M3 - vpmuludq $H0,$R4,$M4 - vpmuludq $H0,$R0,$M0 - vpmuludq $H0,$R1,$M1 - vpaddq $M3,$D3,$D3 # d3 += h0*r3 - vpaddq $M4,$D4,$D4 # d4 += h0*r4 - vpaddq $M0,$D0,$D0 # d0 += h0*r0 - vpaddq $M1,$D1,$D1 # d1 += h0*r1 - - vmovdqu 16*1($inp),%x#$T1 - vpmuludq $H1,$R2,$M3 - vpmuludq $H1,$R3,$M4 - vpmuludq $H1,$S4,$M0 - vpmuludq $H0,$R2,$M2 - vpaddq $M3,$D3,$D3 # d3 += h1*r2 - vpaddq $M4,$D4,$D4 # d4 += h1*r3 - vpaddq $M0,$D0,$D0 # d0 += h1*s4 - vpaddq $M2,$D2,$D2 # d2 += h0*r2 - - vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 - vpmuludq $H3,$R0,$M3 - vpmuludq $H3,$R1,$M4 - vpmuludq $H1,$R0,$M1 - vpmuludq $H1,$R1,$M2 - vpaddq $M3,$D3,$D3 # d3 += h3*r0 - vpaddq $M4,$D4,$D4 # d4 += h3*r1 - vpaddq $M1,$D1,$D1 # d1 += h1*r0 - vpaddq $M2,$D2,$D2 # d2 += h1*r1 - - vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 - vpmuludq $H4,$S4,$M3 - vpmuludq $H4,$R0,$M4 - vpmuludq $H3,$S2,$M0 - vpmuludq $H3,$S3,$M1 - vpmuludq $H3,$S4,$M2 - vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 - vpaddq $M4,$D4,$D4 # d4 += h4*r0 - vpaddq $M0,$D0,$D0 # d0 += h3*s2 - vpaddq $M1,$D1,$D1 # d1 += h3*s3 - vpaddq $M2,$D2,$D2 # d2 += h3*s4 - - vpmuludq $H4,$S1,$M0 - vpmuludq $H4,$S2,$M1 - vpmuludq $H4,$S3,$M2 - vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 - vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 - vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 - - ################################################################ - # horizontal addition - - mov \$1,%eax - vpermq \$0xb1,$H3,$D3 - vpermq \$0xb1,$D4,$H4 - vpermq \$0xb1,$H0,$D0 - vpermq \$0xb1,$H1,$D1 - vpermq \$0xb1,$H2,$D2 - vpaddq $D3,$H3,$H3 - vpaddq $D4,$H4,$H4 - vpaddq $D0,$H0,$H0 - vpaddq $D1,$H1,$H1 - vpaddq $D2,$H2,$H2 - - kmovw %eax,%k3 - vpermq \$0x2,$H3,$D3 - vpermq \$0x2,$H4,$D4 - vpermq \$0x2,$H0,$D0 - vpermq \$0x2,$H1,$D1 - vpermq \$0x2,$H2,$D2 - vpaddq $D3,$H3,$H3 - vpaddq $D4,$H4,$H4 - vpaddq $D0,$H0,$H0 - vpaddq $D1,$H1,$H1 - vpaddq $D2,$H2,$H2 - - vextracti64x4 \$0x1,$H3,%y#$D3 - vextracti64x4 \$0x1,$H4,%y#$D4 - vextracti64x4 \$0x1,$H0,%y#$D0 - vextracti64x4 \$0x1,$H1,%y#$D1 - vextracti64x4 \$0x1,$H2,%y#$D2 - vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case - vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 - vpaddq $D0,$H0,${H0}{%k3}{z} - vpaddq $D1,$H1,${H1}{%k3}{z} - vpaddq $D2,$H2,${H2}{%k3}{z} -___ -map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); -map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); -$code.=<<___; - ################################################################ - # lazy reduction (interleaved with input splat) - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpunpcklqdq $T3,$T2,$T2 
# 2:3 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpsrlq \$30,$T2,$T3 - vpsrlq \$4,$T2,$T2 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpsrlq \$26,$T0,$T1 - vpsrlq \$40,$T4,$T4 # 4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T0,$T0 # 0 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 - vpand $MASK,$T1,$T1 # 1 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - vpaddq $D3,$H4,$H4 # h3 -> h4 - - lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 - add \$64,$len - jnz .Ltail_avx2$suffix - - vpsubq $T2,$H2,$H2 # undo input accumulation - vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced - vmovd %x#$H1,`4*1-48-64`($ctx) - vmovd %x#$H2,`4*2-48-64`($ctx) - vmovd %x#$H3,`4*3-48-64`($ctx) - vmovd %x#$H4,`4*4-48-64`($ctx) - vzeroall -___ -$code.=<<___ if ($win64); - movdqa -0xb0(%r10),%xmm6 - movdqa -0xa0(%r10),%xmm7 - movdqa -0x90(%r10),%xmm8 - movdqa -0x80(%r10),%xmm9 - movdqa -0x70(%r10),%xmm10 - movdqa -0x60(%r10),%xmm11 - movdqa -0x50(%r10),%xmm12 - movdqa -0x40(%r10),%xmm13 - movdqa -0x30(%r10),%xmm14 - movdqa -0x20(%r10),%xmm15 - lea -8(%r10),%rsp -.Ldo_avx512_epilogue: -___ -$code.=<<___ if (!$win64); - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -___ -$code.=<<___; - RET -.cfi_endproc -___ - -} - -} - -&declare_function("poly1305_blocks_avx2", 32, 4); -poly1305_blocks_avxN(0); -&end_function("poly1305_blocks_avx2"); - -####################################################################### -if ($avx>2) { -# On entry we have input length divisible by 64. But since inner loop -# processes 128 bytes per iteration, cases when length is not divisible -# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this -# reason stack layout is kept identical to poly1305_blocks_avx2. If not -# for this tail, we wouldn't have to even allocate stack frame... - -if($kernel) { - $code .= "#ifdef CONFIG_AS_AVX512\n"; -} - -&declare_function("poly1305_blocks_avx512", 32, 4); -poly1305_blocks_avxN(1); -&end_function("poly1305_blocks_avx512"); - -if ($kernel) { - $code .= "#endif\n"; -} - -if (!$kernel && $avx>3) { -######################################################################## -# VPMADD52 version using 2^44 radix. -# -# One can argue that base 2^52 would be more natural. Well, even though -# some operations would be more natural, one has to recognize couple of -# things. Base 2^52 doesn't provide advantage over base 2^44 if you look -# at amount of multiply-n-accumulate operations. Secondly, it makes it -# impossible to pre-compute multiples of 5 [referred to as s[]/sN in -# reference implementations], which means that more such operations -# would have to be performed in inner loop, which in turn makes critical -# path longer. In other words, even though base 2^44 reduction might -# look less elegant, overall critical path is actually shorter... - -######################################################################## -# Layout of opaque area is following. 
-# -# unsigned __int64 h[3]; # current hash value base 2^44 -# unsigned __int64 s[2]; # key value*20 base 2^44 -# unsigned __int64 r[3]; # key value base 2^44 -# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; -# # r^n positions reflect -# # placement in register, not -# # memory, R[3] is R[1]*20 - -$code.=<<___; -.type poly1305_init_base2_44,\@function,3 -.align 32 -poly1305_init_base2_44: - xor %eax,%eax - mov %rax,0($ctx) # initialize hash value - mov %rax,8($ctx) - mov %rax,16($ctx) - -.Linit_base2_44: - lea poly1305_blocks_vpmadd52(%rip),%r10 - lea poly1305_emit_base2_44(%rip),%r11 - - mov \$0x0ffffffc0fffffff,%rax - mov \$0x0ffffffc0ffffffc,%rcx - and 0($inp),%rax - mov \$0x00000fffffffffff,%r8 - and 8($inp),%rcx - mov \$0x00000fffffffffff,%r9 - and %rax,%r8 - shrd \$44,%rcx,%rax - mov %r8,40($ctx) # r0 - and %r9,%rax - shr \$24,%rcx - mov %rax,48($ctx) # r1 - lea (%rax,%rax,4),%rax # *5 - mov %rcx,56($ctx) # r2 - shl \$2,%rax # magic <<2 - lea (%rcx,%rcx,4),%rcx # *5 - shl \$2,%rcx # magic <<2 - mov %rax,24($ctx) # s1 - mov %rcx,32($ctx) # s2 - movq \$-1,64($ctx) # write impossible value -___ -$code.=<<___ if ($flavour !~ /elf32/); - mov %r10,0(%rdx) - mov %r11,8(%rdx) -___ -$code.=<<___ if ($flavour =~ /elf32/); - mov %r10d,0(%rdx) - mov %r11d,4(%rdx) -___ -$code.=<<___; - mov \$1,%eax - RET -.size poly1305_init_base2_44,.-poly1305_init_base2_44 -___ -{ -my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); -my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); -my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); - -$code.=<<___; -.type poly1305_blocks_vpmadd52,\@function,4 -.align 32 -poly1305_blocks_vpmadd52: - shr \$4,$len - jz .Lno_data_vpmadd52 # too short - - shl \$40,$padbit - mov 64($ctx),%r8 # peek on power of the key - - # if powers of the key are not calculated yet, process up to 3 - # blocks with this single-block subroutine, otherwise ensure that - # length is divisible by 2 blocks and pass the rest down to next - # subroutine... - - mov \$3,%rax - mov \$1,%r10 - cmp \$4,$len # is input long - cmovae %r10,%rax - test %r8,%r8 # is power value impossible? - cmovns %r10,%rax - - and $len,%rax # is input of favourable length? 
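
To make the radix-2^44 comments above concrete: the hash is held in limbs of 44, 44 and 42 bits, and the precomputed s[] entries are the key limbs times 20 because a product term that spills past 2^130 does so at weight 2^132 = 4*2^130, which is congruent to 4*5 = 20 mod 2^130-5. A sketch in Python that abstracts away vpmadd52's split into 52-bit low/high halves (names are illustrative only):

    MASK44 = (1 << 44) - 1
    MASK42 = (1 << 42) - 1

    def mul_base2_44(h, r):
        """One multiply and partial reduction in the 44/44/42-bit radix."""
        s1, s2 = 20 * r[1], 20 * r[2]        # the "key value*20" of the layout
        d0 = h[0] * r[0] + h[1] * s2 + h[2] * s1
        d1 = h[0] * r[1] + h[1] * r[0] + h[2] * s2
        d2 = h[0] * r[2] + h[1] * r[1] + h[2] * r[0]
        d1 += d0 >> 44; h0 = d0 & MASK44
        d2 += d1 >> 44; h1 = d1 & MASK44
        c  = d2 >> 42;  h2 = d2 & MASK42     # top limb is only 42 bits wide
        h0 += 5 * c                          # 2^130 == 5 (mod 2^130 - 5)
        h1 += h0 >> 44; h0 &= MASK44         # the "additional step"
        return [h0, h1, h2]

This also illustrates the critical-path argument the comment makes against base 2^52: there, roughly speaking, the wrapped products pick up a factor of 5*2^26 instead of a small constant, so a pre-scaled s[] would no longer fit a 52-bit multiplier lane and the extra scaling would land in the inner loop.
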
- jz .Lblocks_vpmadd52_4x - - sub %rax,$len - mov \$7,%r10d - mov \$1,%r11d - kmovw %r10d,%k7 - lea .L2_44_inp_permd(%rip),%r10 - kmovw %r11d,%k1 - - vmovq $padbit,%x#$PAD - vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd - vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift - vpermq \$0xcf,$PAD,$PAD - vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask - - vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value - vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys - vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} - vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} - - vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt - vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft - - jmp .Loop_vpmadd52 - -.align 32 -.Loop_vpmadd52: - vmovdqu32 0($inp),%x#$T0 # load input as ----3210 - lea 16($inp),$inp - - vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 - vpsrlvq $inp_shift,$T0,$T0 - vpandq $reduc_mask,$T0,$T0 - vporq $PAD,$T0,$T0 - - vpaddq $T0,$Dlo,$Dlo # accumulate input - - vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value - vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} - vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} - - vpxord $Dlo,$Dlo,$Dlo - vpxord $Dhi,$Dhi,$Dhi - - vpmadd52luq $r2r1r0,$H0,$Dlo - vpmadd52huq $r2r1r0,$H0,$Dhi - - vpmadd52luq $r1r0s2,$H1,$Dlo - vpmadd52huq $r1r0s2,$H1,$Dhi - - vpmadd52luq $r0s2s1,$H2,$Dlo - vpmadd52huq $r0s2s1,$H2,$Dhi - - vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword - vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword - vpandq $reduc_mask,$Dlo,$Dlo - - vpaddq $T0,$Dhi,$Dhi - - vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword - - vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) - - vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word - vpandq $reduc_mask,$Dlo,$Dlo - - vpermq \$0b10010011,$T0,$T0 - - vpaddq $T0,$Dlo,$Dlo - - vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} - - vpaddq $T0,$Dlo,$Dlo - vpsllq \$2,$T0,$T0 - - vpaddq $T0,$Dlo,$Dlo - - dec %rax # len-=16 - jnz .Loop_vpmadd52 - - vmovdqu64 $Dlo,0($ctx){%k7} # store hash value - - test $len,$len - jnz .Lblocks_vpmadd52_4x - -.Lno_data_vpmadd52: - RET -.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 -___ -} -{ -######################################################################## -# As implied by its name 4x subroutine processes 4 blocks in parallel -# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power -# and is handled in 256-bit %ymm registers. - -my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); -my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); -my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); - -$code.=<<___; -.type poly1305_blocks_vpmadd52_4x,\@function,4 -.align 32 -poly1305_blocks_vpmadd52_4x: - shr \$4,$len - jz .Lno_data_vpmadd52_4x # too short - - shl \$40,$padbit - mov 64($ctx),%r8 # peek on power of the key - -.Lblocks_vpmadd52_4x: - vpbroadcastq $padbit,$PAD - - vmovdqa64 .Lx_mask44(%rip),$mask44 - mov \$5,%eax - vmovdqa64 .Lx_mask42(%rip),$mask42 - kmovw %eax,%k1 # used in 2x path - - test %r8,%r8 # is power value impossible? - js .Linit_vpmadd52 # if it is, then init R[4] - - vmovq 0($ctx),%x#$H0 # load current hash value - vmovq 8($ctx),%x#$H1 - vmovq 16($ctx),%x#$H2 - - test \$3,$len # is length 4*n+2? - jnz .Lblocks_vpmadd52_2x_do - -.Lblocks_vpmadd52_4x_do: - vpbroadcastq 64($ctx),$R0 # load 4th power of the key - vpbroadcastq 96($ctx),$R1 - vpbroadcastq 128($ctx),$R2 - vpbroadcastq 160($ctx),$S1 - -.Lblocks_vpmadd52_4x_key_loaded: - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S2,$S2 - - test \$7,$len # is len 8*n? 
- jz .Lblocks_vpmadd52_8x - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*2($inp),$T3 - lea 16*4($inp),$inp - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - - # at this point 64-bit lanes are ordered as 3-1-2-0 - - vpsrlq \$24,$T3,$T2 # splat the data - vporq $PAD,$T2,$T2 - vpaddq $T2,$H2,$H2 # accumulate input - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - sub \$4,$len - jz .Ltail_vpmadd52_4x - jmp .Loop_vpmadd52_4x - ud2 - -.align 32 -.Linit_vpmadd52: - vmovq 24($ctx),%x#$S1 # load key - vmovq 56($ctx),%x#$H2 - vmovq 32($ctx),%x#$S2 - vmovq 40($ctx),%x#$R0 - vmovq 48($ctx),%x#$R1 - - vmovdqa $R0,$H0 - vmovdqa $R1,$H1 - vmovdqa $H2,$R2 - - mov \$2,%eax - -.Lmul_init_vpmadd52: - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - dec %eax - jz .Ldone_init_vpmadd52 - - vpunpcklqdq $R1,$H1,$R1 # 1,2 - vpbroadcastq %x#$H1,%x#$H1 # 2,2 - vpunpcklqdq $R2,$H2,$R2 - vpbroadcastq %x#$H2,%x#$H2 - vpunpcklqdq $R0,$H0,$R0 - vpbroadcastq %x#$H0,%x#$H0 - - vpsllq \$2,$R1,$S1 # S1 = R1*5*4 - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R1,$S1,$S1 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S1,$S1 - vpsllq \$2,$S2,$S2 - - jmp .Lmul_init_vpmadd52 - ud2 - -.align 32 -.Ldone_init_vpmadd52: - vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 - vinserti128 \$1,%x#$R2,$H2,$R2 - vinserti128 \$1,%x#$R0,$H0,$R0 - - vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 - vpermq \$0b11011000,$R2,$R2 - vpermq \$0b11011000,$R0,$R0 - - vpsllq \$2,$R1,$S1 # S1 = R1*5*4 - vpaddq $R1,$S1,$S1 - vpsllq \$2,$S1,$S1 - - vmovq 0($ctx),%x#$H0 # load current hash value - vmovq 8($ctx),%x#$H1 - vmovq 16($ctx),%x#$H2 - - test \$3,$len # is length 4*n+2? 
- jnz .Ldone_init_vpmadd52_2x - - vmovdqu64 $R0,64($ctx) # save key powers - vpbroadcastq %x#$R0,$R0 # broadcast 4th power - vmovdqu64 $R1,96($ctx) - vpbroadcastq %x#$R1,$R1 - vmovdqu64 $R2,128($ctx) - vpbroadcastq %x#$R2,$R2 - vmovdqu64 $S1,160($ctx) - vpbroadcastq %x#$S1,$S1 - - jmp .Lblocks_vpmadd52_4x_key_loaded - ud2 - -.align 32 -.Ldone_init_vpmadd52_2x: - vmovdqu64 $R0,64($ctx) # save key powers - vpsrldq \$8,$R0,$R0 # 0-1-0-2 - vmovdqu64 $R1,96($ctx) - vpsrldq \$8,$R1,$R1 - vmovdqu64 $R2,128($ctx) - vpsrldq \$8,$R2,$R2 - vmovdqu64 $S1,160($ctx) - vpsrldq \$8,$S1,$S1 - jmp .Lblocks_vpmadd52_2x_key_loaded - ud2 - -.align 32 -.Lblocks_vpmadd52_2x_do: - vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers - vmovdqu64 160+8($ctx),${S1}{%k1}{z} - vmovdqu64 64+8($ctx),${R0}{%k1}{z} - vmovdqu64 96+8($ctx),${R1}{%k1}{z} - -.Lblocks_vpmadd52_2x_key_loaded: - vmovdqu64 16*0($inp),$T2 # load data - vpxorq $T3,$T3,$T3 - lea 16*2($inp),$inp - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - - # at this point 64-bit lanes are ordered as x-1-x-0 - - vpsrlq \$24,$T3,$T2 # splat the data - vporq $PAD,$T2,$T2 - vpaddq $T2,$H2,$H2 # accumulate input - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - jmp .Ltail_vpmadd52_2x - ud2 - -.align 32 -.Loop_vpmadd52_4x: - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*2($inp),$T3 - lea 16*4($inp),$inp - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # partial reduction (interleaved with data splat) - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpsrlq \$24,$T3,$T2 - vporq $PAD,$T2,$T2 - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - sub \$4,$len # len-=64 - jnz .Loop_vpmadd52_4x - -.Ltail_vpmadd52_4x: - vmovdqu64 128($ctx),$R2 # load all key powers - vmovdqu64 160($ctx),$S1 - vmovdqu64 64($ctx),$R0 - vmovdqu64 96($ctx),$R1 - -.Ltail_vpmadd52_2x: - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S2,$S2 - - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - 
vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # horizontal addition - - mov \$1,%eax - kmovw %eax,%k1 - vpsrldq \$8,$D0lo,$T0 - vpsrldq \$8,$D0hi,$H0 - vpsrldq \$8,$D1lo,$T1 - vpsrldq \$8,$D1hi,$H1 - vpaddq $T0,$D0lo,$D0lo - vpaddq $H0,$D0hi,$D0hi - vpsrldq \$8,$D2lo,$T2 - vpsrldq \$8,$D2hi,$H2 - vpaddq $T1,$D1lo,$D1lo - vpaddq $H1,$D1hi,$D1hi - vpermq \$0x2,$D0lo,$T0 - vpermq \$0x2,$D0hi,$H0 - vpaddq $T2,$D2lo,$D2lo - vpaddq $H2,$D2hi,$D2hi - - vpermq \$0x2,$D1lo,$T1 - vpermq \$0x2,$D1hi,$H1 - vpaddq $T0,$D0lo,${D0lo}{%k1}{z} - vpaddq $H0,$D0hi,${D0hi}{%k1}{z} - vpermq \$0x2,$D2lo,$T2 - vpermq \$0x2,$D2hi,$H2 - vpaddq $T1,$D1lo,${D1lo}{%k1}{z} - vpaddq $H1,$D1hi,${D1hi}{%k1}{z} - vpaddq $T2,$D2lo,${D2lo}{%k1}{z} - vpaddq $H2,$D2hi,${D2hi}{%k1}{z} - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - # at this point $len is - # either 4*n+2 or 0... - sub \$2,$len # len-=32 - ja .Lblocks_vpmadd52_4x_do - - vmovq %x#$H0,0($ctx) - vmovq %x#$H1,8($ctx) - vmovq %x#$H2,16($ctx) - vzeroall - -.Lno_data_vpmadd52_4x: - RET -.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x -___ -} -{ -######################################################################## -# As implied by its name 8x subroutine processes 8 blocks in parallel... -# This is intermediate version, as it's used only in cases when input -# length is either 8*n, 8*n+1 or 8*n+2... - -my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); -my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); -my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); -my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); - -$code.=<<___; -.type poly1305_blocks_vpmadd52_8x,\@function,4 -.align 32 -poly1305_blocks_vpmadd52_8x: - shr \$4,$len - jz .Lno_data_vpmadd52_8x # too short - - shl \$40,$padbit - mov 64($ctx),%r8 # peek on power of the key - - vmovdqa64 .Lx_mask44(%rip),$mask44 - vmovdqa64 .Lx_mask42(%rip),$mask42 - - test %r8,%r8 # is power value impossible? 
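The sign test above relies on a representation invariant: valid key powers are stored as base-2^44 limbs, so bit 63 of a computed power can never be set, and a negative quadword at offset 64 can serve as a "powers not yet computed" sentinel. That reading is inferred from the test/js pair, not stated in the patch; a minimal sketch of the assumed convention:

	/* Assumed convention: limbs < 2^44, so a computed power is
	 * never negative; a negative value routes into .Linit_vpmadd52.
	 */
	if ((s64)ctx_power_word < 0)	/* mirrors "test %r8,%r8; js" */
		compute_key_powers();	/* hypothetical init helper */
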
- js .Linit_vpmadd52 # if it is, then init R[4] - - vmovq 0($ctx),%x#$H0 # load current hash value - vmovq 8($ctx),%x#$H1 - vmovq 16($ctx),%x#$H2 - -.Lblocks_vpmadd52_8x: - ################################################################ - # first we calculate more key powers - - vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers - vmovdqu64 160($ctx),$S1 - vmovdqu64 64($ctx),$R0 - vmovdqu64 96($ctx),$R1 - - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S2,$S2 - - vpbroadcastq %x#$R2,$RR2 # broadcast 4th power - vpbroadcastq %x#$R0,$RR0 - vpbroadcastq %x#$R1,$RR1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $RR2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $RR2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $RR2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $RR2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $RR2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $RR2,$R0,$D2hi - - vpmadd52luq $RR0,$R0,$D0lo - vpmadd52huq $RR0,$R0,$D0hi - vpmadd52luq $RR0,$R1,$D1lo - vpmadd52huq $RR0,$R1,$D1hi - vpmadd52luq $RR0,$R2,$D2lo - vpmadd52huq $RR0,$R2,$D2hi - - vpmadd52luq $RR1,$S2,$D0lo - vpmadd52huq $RR1,$S2,$D0hi - vpmadd52luq $RR1,$R0,$D1lo - vpmadd52huq $RR1,$R0,$D1hi - vpmadd52luq $RR1,$R1,$D2lo - vpmadd52huq $RR1,$R1,$D2hi - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$RR0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$RR1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$RR2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$RR0,$RR0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$RR0,$RR0 - - vpsrlq \$44,$RR0,$tmp # additional step - vpandq $mask44,$RR0,$RR0 - - vpaddq $tmp,$RR1,$RR1 - - ################################################################ - # At this point Rx holds 1324 powers, RRx - 5768, and the goal - # is 15263748, which reflects how data is loaded...
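All of the "partial reduction" blocks in this file follow one base-2^44 carry chain over limbs h = h0 + h1*2^44 + h2*2^88 (mod p = 2^130 - 5). The wrap multiply-by-5 is expressed as c + 4c (one vpaddq plus one vpsllq \$2), and the precomputed S values are 20*R because, with this limb split, every product that wraps past 2^130 picks up exactly a factor 2^132 mod p = 4*5 = 20. The 15263748 ordering simply pairs each data lane with the power of r it must carry in the 8-way evaluation h' = (h + m1)*r^8 + m2*r^7 + ... + m8*r. A scalar model of one reduction, with d0/d1/d2 standing in for the combined lo/hi product sums:

	/* Illustrative only: d0..d2 model the wide per-limb sums. */
	u64 c, h0, h1, h2;
	const u64 MASK44 = (1ULL << 44) - 1, MASK42 = (1ULL << 42) - 1;

	c = d0 >> 44;  h0 = d0 & MASK44;  d1 += c;
	c = d1 >> 44;  h1 = d1 & MASK44;  d2 += c;
	c = d2 >> 42;  h2 = d2 & MASK42;
	h0 += c + (c << 2);	/* 2^130 == 5 (mod p): fold carry as c + 4c */
	c = h0 >> 44;  h0 &= MASK44;
	h1 += c;		/* the "additional step" in the asm */
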
- - vpunpcklqdq $R2,$RR2,$T2 # 3748 - vpunpckhqdq $R2,$RR2,$R2 # 1526 - vpunpcklqdq $R0,$RR0,$T0 - vpunpckhqdq $R0,$RR0,$R0 - vpunpcklqdq $R1,$RR1,$T1 - vpunpckhqdq $R1,$RR1,$R1 -___ -######## switch to %zmm -map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); -map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); -map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); -map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); - -$code.=<<___; - vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 - vshufi64x2 \$0x44,$R0,$T0,$RR0 - vshufi64x2 \$0x44,$R1,$T1,$RR1 - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*4($inp),$T3 - lea 16*8($inp),$inp - - vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 - vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 - vpaddq $RR2,$SS2,$SS2 - vpaddq $RR1,$SS1,$SS1 - vpsllq \$2,$SS2,$SS2 - vpsllq \$2,$SS1,$SS1 - - vpbroadcastq $padbit,$PAD - vpbroadcastq %x#$mask44,$mask44 - vpbroadcastq %x#$mask42,$mask42 - - vpbroadcastq %x#$SS1,$S1 # broadcast 8th power - vpbroadcastq %x#$SS2,$S2 - vpbroadcastq %x#$RR0,$R0 - vpbroadcastq %x#$RR1,$R1 - vpbroadcastq %x#$RR2,$R2 - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - - # at this point 64-bit lanes are ordered as 73625140 - - vpsrlq \$24,$T3,$T2 # splat the data - vporq $PAD,$T2,$T2 - vpaddq $T2,$H2,$H2 # accumulate input - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - sub \$8,$len - jz .Ltail_vpmadd52_8x - jmp .Loop_vpmadd52_8x - -.align 32 -.Loop_vpmadd52_8x: - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*4($inp),$T3 - lea 16*8($inp),$inp - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # partial reduction (interleaved with data splat) - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpsrlq \$24,$T3,$T2 - vporq $PAD,$T2,$T2 - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - sub \$8,$len # len-=128 - jnz .Loop_vpmadd52_8x - -.Ltail_vpmadd52_8x: - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$SS1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$SS1,$D0hi - vpxorq 
$D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$SS2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$SS2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$RR0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$RR0,$D2hi - - vpmadd52luq $H0,$RR0,$D0lo - vpmadd52huq $H0,$RR0,$D0hi - vpmadd52luq $H0,$RR1,$D1lo - vpmadd52huq $H0,$RR1,$D1hi - vpmadd52luq $H0,$RR2,$D2lo - vpmadd52huq $H0,$RR2,$D2hi - - vpmadd52luq $H1,$SS2,$D0lo - vpmadd52huq $H1,$SS2,$D0hi - vpmadd52luq $H1,$RR0,$D1lo - vpmadd52huq $H1,$RR0,$D1hi - vpmadd52luq $H1,$RR1,$D2lo - vpmadd52huq $H1,$RR1,$D2hi - - ################################################################ - # horizontal addition - - mov \$1,%eax - kmovw %eax,%k1 - vpsrldq \$8,$D0lo,$T0 - vpsrldq \$8,$D0hi,$H0 - vpsrldq \$8,$D1lo,$T1 - vpsrldq \$8,$D1hi,$H1 - vpaddq $T0,$D0lo,$D0lo - vpaddq $H0,$D0hi,$D0hi - vpsrldq \$8,$D2lo,$T2 - vpsrldq \$8,$D2hi,$H2 - vpaddq $T1,$D1lo,$D1lo - vpaddq $H1,$D1hi,$D1hi - vpermq \$0x2,$D0lo,$T0 - vpermq \$0x2,$D0hi,$H0 - vpaddq $T2,$D2lo,$D2lo - vpaddq $H2,$D2hi,$D2hi - - vpermq \$0x2,$D1lo,$T1 - vpermq \$0x2,$D1hi,$H1 - vpaddq $T0,$D0lo,$D0lo - vpaddq $H0,$D0hi,$D0hi - vpermq \$0x2,$D2lo,$T2 - vpermq \$0x2,$D2hi,$H2 - vpaddq $T1,$D1lo,$D1lo - vpaddq $H1,$D1hi,$D1hi - vextracti64x4 \$1,$D0lo,%y#$T0 - vextracti64x4 \$1,$D0hi,%y#$H0 - vpaddq $T2,$D2lo,$D2lo - vpaddq $H2,$D2hi,$D2hi - - vextracti64x4 \$1,$D1lo,%y#$T1 - vextracti64x4 \$1,$D1hi,%y#$H1 - vextracti64x4 \$1,$D2lo,%y#$T2 - vextracti64x4 \$1,$D2hi,%y#$H2 -___ -######## switch back to %ymm -map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); -map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); -map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); - -$code.=<<___; - vpaddq $T0,$D0lo,${D0lo}{%k1}{z} - vpaddq $H0,$D0hi,${D0hi}{%k1}{z} - vpaddq $T1,$D1lo,${D1lo}{%k1}{z} - vpaddq $H1,$D1hi,${D1hi}{%k1}{z} - vpaddq $T2,$D2lo,${D2lo}{%k1}{z} - vpaddq $H2,$D2hi,${D2hi}{%k1}{z} - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - ################################################################ - - vmovq %x#$H0,0($ctx) - vmovq %x#$H1,8($ctx) - vmovq %x#$H2,16($ctx) - vzeroall - -.Lno_data_vpmadd52_8x: - RET -.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x -___ -} -$code.=<<___; -.type poly1305_emit_base2_44,\@function,3 -.align 32 -poly1305_emit_base2_44: - mov 0($ctx),%r8 # load hash value - mov 8($ctx),%r9 - mov 16($ctx),%r10 - - mov %r9,%rax - shr \$20,%r9 - shl \$44,%rax - mov %r10,%rcx - shr \$40,%r10 - shl \$24,%rcx - - add %rax,%r8 - adc %rcx,%r9 - adc \$0,%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overflow? 
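The shr \$2 above answers the question in the comment: after adding 5, a carry into bit 130 means h + 5 >= 2^130, i.e. h >= p = 2^130 - 5, and the reduced value is h + 5 (the 2^130 bit is dropped for free because only 128 bits are emitted). The cmovnz pair that follows selects accordingly. A scalar sketch of the same step:

	/* Sketch: h2 holds the bits of the accumulator above 2^128. */
	unsigned __int128 t = (unsigned __int128)h0 + 5;
	u64 g0 = (u64)t;
	t = (t >> 64) + h1;
	u64 g1 = (u64)t;
	u64 g2 = h2 + (u64)(t >> 64);

	if (g2 >> 2) {		/* bit 130 set: h >= p, keep h + 5 */
		h0 = g0;
		h1 = g1;
	}
	/* mac = (h + nonce) mod 2^128, as the adds that follow show */
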
- cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - RET -.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 -___ -} } } -} - -if (!$kernel) -{ # chacha20-poly1305 helpers -my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order - ("%rdi","%rsi","%rdx","%rcx"); # Unix order -$code.=<<___; -.globl xor128_encrypt_n_pad -.type xor128_encrypt_n_pad,\@abi-omnipotent -.align 16 -xor128_encrypt_n_pad: - sub $otp,$inp - sub $otp,$out - mov $len,%r10 # put len aside - shr \$4,$len # len / 16 - jz .Ltail_enc - nop -.Loop_enc_xmm: - movdqu ($inp,$otp),%xmm0 - pxor ($otp),%xmm0 - movdqu %xmm0,($out,$otp) - movdqa %xmm0,($otp) - lea 16($otp),$otp - dec $len - jnz .Loop_enc_xmm - - and \$15,%r10 # len % 16 - jz .Ldone_enc - -.Ltail_enc: - mov \$16,$len - sub %r10,$len - xor %eax,%eax -.Loop_enc_byte: - mov ($inp,$otp),%al - xor ($otp),%al - mov %al,($out,$otp) - mov %al,($otp) - lea 1($otp),$otp - dec %r10 - jnz .Loop_enc_byte - - xor %eax,%eax -.Loop_enc_pad: - mov %al,($otp) - lea 1($otp),$otp - dec $len - jnz .Loop_enc_pad - -.Ldone_enc: - mov $otp,%rax - RET -.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad - -.globl xor128_decrypt_n_pad -.type xor128_decrypt_n_pad,\@abi-omnipotent -.align 16 -xor128_decrypt_n_pad: - sub $otp,$inp - sub $otp,$out - mov $len,%r10 # put len aside - shr \$4,$len # len / 16 - jz .Ltail_dec - nop -.Loop_dec_xmm: - movdqu ($inp,$otp),%xmm0 - movdqa ($otp),%xmm1 - pxor %xmm0,%xmm1 - movdqu %xmm1,($out,$otp) - movdqa %xmm0,($otp) - lea 16($otp),$otp - dec $len - jnz .Loop_dec_xmm - - pxor %xmm1,%xmm1 - and \$15,%r10 # len % 16 - jz .Ldone_dec - -.Ltail_dec: - mov \$16,$len - sub %r10,$len - xor %eax,%eax - xor %r11d,%r11d -.Loop_dec_byte: - mov ($inp,$otp),%r11b - mov ($otp),%al - xor %r11b,%al - mov %al,($out,$otp) - mov %r11b,($otp) - lea 1($otp),$otp - dec %r10 - jnz .Loop_dec_byte - - xor %eax,%eax -.Loop_dec_pad: - mov %al,($otp) - lea 1($otp),$otp - dec $len - jnz .Loop_dec_pad - -.Ldone_dec: - mov $otp,%rax - RET -.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad -___ -} - -# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, -# CONTEXT *context,DISPATCHER_CONTEXT *disp) -if ($win64) { -$rec="%rcx"; -$frame="%rdx"; -$context="%r8"; -$disp="%r9"; - -$code.=<<___; -.extern __imp_RtlVirtualUnwind -.type se_handler,\@abi-omnipotent -.align 16 -se_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<.Lprologue - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=.Lepilogue - jae .Lcommon_seh_tail - - lea 48(%rax),%rax - - mov -8(%rax),%rbx - mov -16(%rax),%rbp - mov -24(%rax),%r12 - mov -32(%rax),%r13 - mov -40(%rax),%r14 - mov -48(%rax),%r15 - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore context->R12 - mov %r13,224($context) # restore context->R13 - mov %r14,232($context) # restore context->R14 - mov %r15,240($context) # restore context->R14 - - jmp 
.Lcommon_seh_tail -.size se_handler,.-se_handler - -.type avx_handler,\@abi-omnipotent -.align 16 -avx_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<prologue label - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=epilogue label - jae .Lcommon_seh_tail - - mov 208($context),%rax # pull context->R11 - - lea 0x50(%rax),%rsi - lea 0xf8(%rax),%rax - lea 512($context),%rdi # &context.Xmm6 - mov \$20,%ecx - .long 0xa548f3fc # cld; rep movsq - -.Lcommon_seh_tail: - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rax,152($context) # restore context->Rsp - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi - - mov 40($disp),%rdi # disp->ContextRecord - mov $context,%rsi # context - mov \$154,%ecx # sizeof(CONTEXT) - .long 0xa548f3fc # cld; rep movsq - - mov $disp,%rsi - xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER - mov 8(%rsi),%rdx # arg2, disp->ImageBase - mov 0(%rsi),%r8 # arg3, disp->ControlPc - mov 16(%rsi),%r9 # arg4, disp->FunctionEntry - mov 40(%rsi),%r10 # disp->ContextRecord - lea 56(%rsi),%r11 # &disp->HandlerData - lea 24(%rsi),%r12 # &disp->EstablisherFrame - mov %r10,32(%rsp) # arg5 - mov %r11,40(%rsp) # arg6 - mov %r12,48(%rsp) # arg7 - mov %rcx,56(%rsp) # arg8, (NULL) - call *__imp_RtlVirtualUnwind(%rip) - - mov \$1,%eax # ExceptionContinueSearch - add \$64,%rsp - popfq - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - pop %rdi - pop %rsi - RET -.size avx_handler,.-avx_handler - -.section .pdata -.align 4 - .rva .LSEH_begin_poly1305_init_x86_64 - .rva .LSEH_end_poly1305_init_x86_64 - .rva .LSEH_info_poly1305_init_x86_64 - - .rva .LSEH_begin_poly1305_blocks_x86_64 - .rva .LSEH_end_poly1305_blocks_x86_64 - .rva .LSEH_info_poly1305_blocks_x86_64 - - .rva .LSEH_begin_poly1305_emit_x86_64 - .rva .LSEH_end_poly1305_emit_x86_64 - .rva .LSEH_info_poly1305_emit_x86_64 -___ -$code.=<<___ if ($avx); - .rva .LSEH_begin_poly1305_blocks_avx - .rva .Lbase2_64_avx - .rva .LSEH_info_poly1305_blocks_avx_1 - - .rva .Lbase2_64_avx - .rva .Leven_avx - .rva .LSEH_info_poly1305_blocks_avx_2 - - .rva .Leven_avx - .rva .LSEH_end_poly1305_blocks_avx - .rva .LSEH_info_poly1305_blocks_avx_3 - - .rva .LSEH_begin_poly1305_emit_avx - .rva .LSEH_end_poly1305_emit_avx - .rva .LSEH_info_poly1305_emit_avx -___ -$code.=<<___ if ($avx>1); - .rva .LSEH_begin_poly1305_blocks_avx2 - .rva .Lbase2_64_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_1 - - .rva .Lbase2_64_avx2 - .rva .Leven_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_2 - - .rva .Leven_avx2 - .rva .LSEH_end_poly1305_blocks_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_3 -___ -$code.=<<___ if ($avx>2); - .rva .LSEH_begin_poly1305_blocks_avx512 - .rva .LSEH_end_poly1305_blocks_avx512 - .rva .LSEH_info_poly1305_blocks_avx512 -___ -$code.=<<___; -.section .xdata -.align 8 -.LSEH_info_poly1305_init_x86_64: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64 - -.LSEH_info_poly1305_blocks_x86_64: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_body,.Lblocks_epilogue - 
-.LSEH_info_poly1305_emit_x86_64: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 -___ -$code.=<<___ if ($avx); -.LSEH_info_poly1305_blocks_avx_1: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx_2: - .byte 9,0,0,0 - .rva se_handler - .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx_3: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_emit_avx: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx -___ -$code.=<<___ if ($avx>1); -.LSEH_info_poly1305_blocks_avx2_1: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx2_2: - .byte 9,0,0,0 - .rva se_handler - .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx2_3: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] -___ -$code.=<<___ if ($avx>2); -.LSEH_info_poly1305_blocks_avx512: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] -___ -} - -open SELF,$0; -while(<SELF>) { - next if (/^#!/); - last if (!s/^#/\/\// and !/^$/); - print; -} -close SELF; - -foreach (split('\n',$code)) { - s/\`([^\`]*)\`/eval($1)/ge; - s/%r([a-z]+)#d/%e$1/g; - s/%r([0-9]+)#d/%r$1d/g; - s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; - - if ($kernel) { - s/(^\.type.*),[0-9]+$/\1/; - s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; - next if /^\.cfi.*/; - } - - print $_,"\n"; -} -close STDOUT; diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c deleted file mode 100644 index 1dfb8af48a3c..000000000000 --- a/arch/x86/crypto/poly1305_glue.c +++ /dev/null @@ -1,290 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 
- */ - -#include <crypto/algapi.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/poly1305.h> -#include <crypto/internal/simd.h> -#include <linux/crypto.h> -#include <linux/jump_label.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/sizes.h> -#include <asm/intel-family.h> -#include <asm/simd.h> - -asmlinkage void poly1305_init_x86_64(void *ctx, - const u8 key[POLY1305_BLOCK_SIZE]); -asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, - const size_t len, const u32 padbit); -asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len, - const u32 padbit); -asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len, - const u32 padbit); -asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, - const size_t len, const u32 padbit); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); - -struct poly1305_arch_internal { - union { - struct { - u32 h[5]; - u32 is_base2_26; - }; - u64 hs[3]; - }; - u64 r[2]; - u64 pad; - struct { u32 r2, r1, r4, r3; } rn[9]; -}; - -/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit - * the unfortunate situation of using AVX and then having to go back to scalar - * -- because the user is silly and has called the update function from two - * separate contexts -- then we need to convert back to the original base before - * proceeding. It is possible to reason that the initial reduction below is - * sufficient given the implementation invariants. However, for an avoidance of - * doubt and because this is not performance critical, we do the full reduction - * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py - */ -static void convert_to_base2_64(void *ctx) -{ - struct poly1305_arch_internal *state = ctx; - u32 cy; - - if (!state->is_base2_26) - return; - - cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; - cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; - cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; - cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; - state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; - state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); - state->hs[2] = state->h[4] >> 24; -#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) - cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); - state->hs[2] &= 3; - state->hs[0] += cy; - state->hs[1] += (cy = ULT(state->hs[0], cy)); - state->hs[2] += ULT(state->hs[1], cy); -#undef ULT - state->is_base2_26 = 0; -} - -static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE]) -{ - poly1305_init_x86_64(ctx, key); -} - -static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, - const u32 padbit) -{ - struct poly1305_arch_internal *state = ctx; - - /* SIMD disables preemption, so relax after processing each page. 
*/ - BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || - SZ_4K % POLY1305_BLOCK_SIZE); - - if (!static_branch_likely(&poly1305_use_avx) || - (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || - !crypto_simd_usable()) { - convert_to_base2_64(ctx); - poly1305_blocks_x86_64(ctx, inp, len, padbit); - return; - } - - do { - const size_t bytes = min_t(size_t, len, SZ_4K); - - kernel_fpu_begin(); - if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) - poly1305_blocks_avx512(ctx, inp, bytes, padbit); - else if (static_branch_likely(&poly1305_use_avx2)) - poly1305_blocks_avx2(ctx, inp, bytes, padbit); - else - poly1305_blocks_avx(ctx, inp, bytes, padbit); - kernel_fpu_end(); - - len -= bytes; - inp += bytes; - } while (len); -} - -static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], - const u32 nonce[4]) -{ - if (!static_branch_likely(&poly1305_use_avx)) - poly1305_emit_x86_64(ctx, mac, nonce); - else - poly1305_emit_avx(ctx, mac, nonce); -} - -void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) -{ - poly1305_simd_init(&dctx->h, key); - dctx->s[0] = get_unaligned_le32(&key[16]); - dctx->s[1] = get_unaligned_le32(&key[20]); - dctx->s[2] = get_unaligned_le32(&key[24]); - dctx->s[3] = get_unaligned_le32(&key[28]); - dctx->buflen = 0; - dctx->sset = true; -} -EXPORT_SYMBOL(poly1305_init_arch); - -static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, - const u8 *inp, unsigned int len) -{ - unsigned int acc = 0; - if (unlikely(!dctx->sset)) { - if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) { - poly1305_simd_init(&dctx->h, inp); - inp += POLY1305_BLOCK_SIZE; - len -= POLY1305_BLOCK_SIZE; - acc += POLY1305_BLOCK_SIZE; - dctx->rset = 1; - } - if (len >= POLY1305_BLOCK_SIZE) { - dctx->s[0] = get_unaligned_le32(&inp[0]); - dctx->s[1] = get_unaligned_le32(&inp[4]); - dctx->s[2] = get_unaligned_le32(&inp[8]); - dctx->s[3] = get_unaligned_le32(&inp[12]); - acc += POLY1305_BLOCK_SIZE; - dctx->sset = true; - } - } - return acc; -} - -void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, - unsigned int srclen) -{ - unsigned int bytes, used; - - if (unlikely(dctx->buflen)) { - bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); - memcpy(dctx->buf + dctx->buflen, src, bytes); - src += bytes; - srclen -= bytes; - dctx->buflen += bytes; - - if (dctx->buflen == POLY1305_BLOCK_SIZE) { - if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE))) - poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1); - dctx->buflen = 0; - } - } - - if (likely(srclen >= POLY1305_BLOCK_SIZE)) { - bytes = round_down(srclen, POLY1305_BLOCK_SIZE); - srclen -= bytes; - used = crypto_poly1305_setdctxkey(dctx, src, bytes); - if (likely(bytes - used)) - poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1); - src += bytes; - } - - if (unlikely(srclen)) { - dctx->buflen = srclen; - memcpy(dctx->buf, src, srclen); - } -} -EXPORT_SYMBOL(poly1305_update_arch); - -void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) -{ - if (unlikely(dctx->buflen)) { - dctx->buf[dctx->buflen++] = 1; - memset(dctx->buf + dctx->buflen, 0, - POLY1305_BLOCK_SIZE - dctx->buflen); - poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); - } - - poly1305_simd_emit(&dctx->h, dst, dctx->s); - memzero_explicit(dctx, sizeof(*dctx)); -} -EXPORT_SYMBOL(poly1305_final_arch); - -static int crypto_poly1305_init(struct shash_desc *desc) -{ - struct poly1305_desc_ctx *dctx = 
shash_desc_ctx(desc); - - *dctx = (struct poly1305_desc_ctx){}; - return 0; -} - -static int crypto_poly1305_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - poly1305_update_arch(dctx, src, srclen); - return 0; -} - -static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - if (unlikely(!dctx->sset)) - return -ENOKEY; - - poly1305_final_arch(dctx, dst); - return 0; -} - -static struct shash_alg alg = { - .digestsize = POLY1305_DIGEST_SIZE, - .init = crypto_poly1305_init, - .update = crypto_poly1305_update, - .final = crypto_poly1305_final, - .descsize = sizeof(struct poly1305_desc_ctx), - .base = { - .cra_name = "poly1305", - .cra_driver_name = "poly1305-simd", - .cra_priority = 300, - .cra_blocksize = POLY1305_BLOCK_SIZE, - .cra_module = THIS_MODULE, - }, -}; - -static int __init poly1305_simd_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_AVX) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) - static_branch_enable(&poly1305_use_avx); - if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) - static_branch_enable(&poly1305_use_avx2); - if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && - /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ - boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X) - static_branch_enable(&poly1305_use_avx512); - return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0; -} - -static void __exit poly1305_simd_mod_exit(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_HASH)) - crypto_unregister_shash(&alg); -} - -module_init(poly1305_simd_mod_init); -module_exit(poly1305_simd_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); -MODULE_DESCRIPTION("Poly1305 authenticator"); -MODULE_ALIAS_CRYPTO("poly1305"); -MODULE_ALIAS_CRYPTO("poly1305-simd"); diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c index 8fa58b0f3cb3..6b466867f91a 100644 --- a/arch/x86/crypto/polyval-clmulni_glue.c +++ b/arch/x86/crypto/polyval-clmulni_glue.c @@ -16,16 +16,15 @@ * operations. 
*/ -#include <crypto/algapi.h> +#include <asm/cpu_device_id.h> +#include <asm/fpu/api.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/polyval.h> -#include <linux/crypto.h> -#include <linux/init.h> +#include <crypto/utils.h> +#include <linux/errno.h> #include <linux/kernel.h> #include <linux/module.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> +#include <linux/string.h> #define POLYVAL_ALIGN 16 #define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN) @@ -42,7 +41,6 @@ struct polyval_tfm_ctx { struct polyval_desc_ctx { u8 buffer[POLYVAL_BLOCK_SIZE]; - u32 bytes; }; asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys, @@ -57,25 +55,16 @@ static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm) static void internal_polyval_update(const struct polyval_tfm_ctx *keys, const u8 *in, size_t nblocks, u8 *accumulator) { - if (likely(crypto_simd_usable())) { - kernel_fpu_begin(); - clmul_polyval_update(keys, in, nblocks, accumulator); - kernel_fpu_end(); - } else { - polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in, - nblocks, accumulator); - } + kernel_fpu_begin(); + clmul_polyval_update(keys, in, nblocks, accumulator); + kernel_fpu_end(); } static void internal_polyval_mul(u8 *op1, const u8 *op2) { - if (likely(crypto_simd_usable())) { - kernel_fpu_begin(); - clmul_polyval_mul(op1, op2); - kernel_fpu_end(); - } else { - polyval_mul_non4k(op1, op2); - } + kernel_fpu_begin(); + clmul_polyval_mul(op1, op2); + kernel_fpu_end(); } static int polyval_x86_setkey(struct crypto_shash *tfm, @@ -112,49 +101,27 @@ static int polyval_x86_update(struct shash_desc *desc, { struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); - u8 *pos; unsigned int nblocks; - unsigned int n; - - if (dctx->bytes) { - n = min(srclen, dctx->bytes); - pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes; - - dctx->bytes -= n; - srclen -= n; - - while (n--) - *pos++ ^= *src++; - if (!dctx->bytes) - internal_polyval_mul(dctx->buffer, - tctx->key_powers[NUM_KEY_POWERS-1]); - } - - while (srclen >= POLYVAL_BLOCK_SIZE) { + do { /* Allow rescheduling every 4K bytes. 
*/ nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; internal_polyval_update(tctx, src, nblocks, dctx->buffer); srclen -= nblocks * POLYVAL_BLOCK_SIZE; src += nblocks * POLYVAL_BLOCK_SIZE; - } + } while (srclen >= POLYVAL_BLOCK_SIZE); - if (srclen) { - dctx->bytes = POLYVAL_BLOCK_SIZE - srclen; - pos = dctx->buffer; - while (srclen--) - *pos++ ^= *src++; - } - - return 0; + return srclen; } -static int polyval_x86_final(struct shash_desc *desc, u8 *dst) +static int polyval_x86_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *dst) { struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); - if (dctx->bytes) { + if (len) { + crypto_xor(dctx->buffer, src, len); internal_polyval_mul(dctx->buffer, tctx->key_powers[NUM_KEY_POWERS-1]); } @@ -168,13 +135,14 @@ static struct shash_alg polyval_alg = { .digestsize = POLYVAL_DIGEST_SIZE, .init = polyval_x86_init, .update = polyval_x86_update, - .final = polyval_x86_final, + .finup = polyval_x86_finup, .setkey = polyval_x86_setkey, .descsize = sizeof(struct polyval_desc_ctx), .base = { .cra_name = "polyval", .cra_driver_name = "polyval-clmulni", .cra_priority = 200, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = POLYVAL_BLOCK_SIZE, .cra_ctxsize = POLYVAL_CTX_SIZE, .cra_module = THIS_MODULE, diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 97e283621851..84e47f7f6188 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -9,6 +9,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/frame.h> #include "glue_helper-asm-avx.S" @@ -656,7 +657,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx) RET; SYM_FUNC_END(__serpent_dec_blk8_avx) -SYM_FUNC_START(serpent_ecb_enc_8way_avx) +SYM_TYPED_FUNC_START(serpent_ecb_enc_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -674,7 +675,7 @@ SYM_FUNC_START(serpent_ecb_enc_8way_avx) RET; SYM_FUNC_END(serpent_ecb_enc_8way_avx) -SYM_FUNC_START(serpent_ecb_dec_8way_avx) +SYM_TYPED_FUNC_START(serpent_ecb_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -692,7 +693,7 @@ SYM_FUNC_START(serpent_ecb_dec_8way_avx) RET; SYM_FUNC_END(serpent_ecb_dec_8way_avx) -SYM_FUNC_START(serpent_cbc_dec_8way_avx) +SYM_TYPED_FUNC_START(serpent_cbc_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 347e97f4b713..f5f2121b7956 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -10,7 +10,6 @@ #include <linux/crypto.h> #include <linux/err.h> #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <crypto/serpent.h> #include "serpent-avx.h" @@ -65,10 +64,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg serpent_algs[] = { { - .base.cra_name = "__ecb(serpent)", - .base.cra_driver_name = "__ecb-serpent-avx2", + .base.cra_name = "ecb(serpent)", + .base.cra_driver_name = "ecb-serpent-avx2", .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -78,10 +76,9 @@ static struct skcipher_alg serpent_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(serpent)", - .base.cra_driver_name = "__cbc-serpent-avx2", + .base.cra_name = "cbc(serpent)", + .base.cra_driver_name 
= "cbc-serpent-avx2", .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -94,8 +91,6 @@ static struct skcipher_alg serpent_algs[] = { }, }; -static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; - static int __init serpent_avx2_init(void) { const char *feature_name; @@ -110,15 +105,13 @@ static int __init serpent_avx2_init(void) return -ENODEV; } - return simd_register_skciphers_compat(serpent_algs, - ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + return crypto_register_skciphers(serpent_algs, + ARRAY_SIZE(serpent_algs)); } static void __exit serpent_avx2_fini(void) { - simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs)); } module_init(serpent_avx2_init); diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 6c248e1ea4ef..e640abc1cb8a 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -13,7 +13,6 @@ #include <linux/crypto.h> #include <linux/err.h> #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <crypto/serpent.h> #include "serpent-avx.h" @@ -71,10 +70,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg serpent_algs[] = { { - .base.cra_name = "__ecb(serpent)", - .base.cra_driver_name = "__ecb-serpent-avx", + .base.cra_name = "ecb(serpent)", + .base.cra_driver_name = "ecb-serpent-avx", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -84,10 +82,9 @@ static struct skcipher_alg serpent_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(serpent)", - .base.cra_driver_name = "__cbc-serpent-avx", + .base.cra_name = "cbc(serpent)", + .base.cra_driver_name = "cbc-serpent-avx", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -100,8 +97,6 @@ static struct skcipher_alg serpent_algs[] = { }, }; -static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; - static int __init serpent_init(void) { const char *feature_name; @@ -112,15 +107,13 @@ static int __init serpent_init(void) return -ENODEV; } - return simd_register_skciphers_compat(serpent_algs, - ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + return crypto_register_skciphers(serpent_algs, + ARRAY_SIZE(serpent_algs)); } static void __exit serpent_exit(void) { - simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs)); } module_init(serpent_init); diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index d78f37e9b2cf..80ee17ec21b4 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -18,7 +18,6 @@ #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/b128ops.h> -#include <crypto/internal/simd.h> #include <crypto/serpent.h> #include "serpent-sse2.h" @@ -74,10 +73,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg serpent_algs[] = { { - .base.cra_name = "__ecb(serpent)", - 
.base.cra_driver_name = "__ecb-serpent-sse2", + .base.cra_name = "ecb(serpent)", + .base.cra_driver_name = "ecb-serpent-sse2", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -87,10 +85,9 @@ static struct skcipher_alg serpent_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(serpent)", - .base.cra_driver_name = "__cbc-serpent-sse2", + .base.cra_name = "cbc(serpent)", + .base.cra_driver_name = "cbc-serpent-sse2", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -103,8 +100,6 @@ static struct skcipher_alg serpent_algs[] = { }, }; -static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; - static int __init serpent_sse2_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2)) { @@ -112,15 +107,13 @@ static int __init serpent_sse2_init(void) return -ENODEV; } - return simd_register_skciphers_compat(serpent_algs, - ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + return crypto_register_skciphers(serpent_algs, + ARRAY_SIZE(serpent_algs)); } static void __exit serpent_sse2_exit(void) { - simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs)); } module_init(serpent_sse2_init); diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index ab8bc54f254d..0a912bfc86c5 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -16,21 +16,17 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <asm/cpu_device_id.h> +#include <asm/simd.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/types.h> #include <crypto/sha1.h> #include <crypto/sha1_base.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/module.h> static const struct x86_cpu_id module_cpu_ids[] = { -#ifdef CONFIG_AS_SHA1_NI X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL), -#endif X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), @@ -38,14 +34,10 @@ static const struct x86_cpu_id module_cpu_ids[] = { }; MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); -static int sha1_update(struct shash_desc *desc, const u8 *data, - unsigned int len, sha1_block_fn *sha1_xform) +static inline int sha1_update(struct shash_desc *desc, const u8 *data, + unsigned int len, sha1_block_fn *sha1_xform) { - struct sha1_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable() || - (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) - return crypto_sha1_update(desc, data, len); + int remain; /* * Make sure struct sha1_state begins directly with the SHA1 @@ -54,22 +46,18 @@ static int sha1_update(struct shash_desc *desc, const u8 *data, BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0); kernel_fpu_begin(); - sha1_base_do_update(desc, data, len, sha1_xform); + remain = sha1_base_do_update_blocks(desc, data, len, sha1_xform); kernel_fpu_end(); - return 0; + return remain; } -static int sha1_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, sha1_block_fn *sha1_xform) +static inline 
int sha1_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out, + sha1_block_fn *sha1_xform) { - if (!crypto_simd_usable()) - return crypto_sha1_finup(desc, data, len, out); - kernel_fpu_begin(); - if (len) - sha1_base_do_update(desc, data, len, sha1_xform); - sha1_base_do_finalize(desc, sha1_xform); + sha1_base_do_finup(desc, data, len, sha1_xform); kernel_fpu_end(); return sha1_base_finish(desc, out); @@ -90,23 +78,17 @@ static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, return sha1_finup(desc, data, len, out, sha1_transform_ssse3); } -/* Add padding and return the message digest. */ -static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) -{ - return sha1_ssse3_finup(desc, NULL, 0, out); -} - static struct shash_alg sha1_ssse3_alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = sha1_ssse3_update, - .final = sha1_ssse3_final, .finup = sha1_ssse3_finup, - .descsize = sizeof(struct sha1_state), + .descsize = SHA1_STATE_SIZE, .base = { .cra_name = "sha1", .cra_driver_name = "sha1-ssse3", .cra_priority = 150, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -140,22 +122,17 @@ static int sha1_avx_finup(struct shash_desc *desc, const u8 *data, return sha1_finup(desc, data, len, out, sha1_transform_avx); } -static int sha1_avx_final(struct shash_desc *desc, u8 *out) -{ - return sha1_avx_finup(desc, NULL, 0, out); -} - static struct shash_alg sha1_avx_alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = sha1_avx_update, - .final = sha1_avx_final, .finup = sha1_avx_finup, - .descsize = sizeof(struct sha1_state), + .descsize = SHA1_STATE_SIZE, .base = { .cra_name = "sha1", .cra_driver_name = "sha1-avx", .cra_priority = 160, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -200,8 +177,8 @@ static bool avx2_usable(void) return false; } -static void sha1_apply_transform_avx2(struct sha1_state *state, - const u8 *data, int blocks) +static inline void sha1_apply_transform_avx2(struct sha1_state *state, + const u8 *data, int blocks) { /* Select the optimal transform based on data block size */ if (blocks >= SHA1_AVX2_BLOCK_OPTSIZE) @@ -222,22 +199,17 @@ static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data, return sha1_finup(desc, data, len, out, sha1_apply_transform_avx2); } -static int sha1_avx2_final(struct shash_desc *desc, u8 *out) -{ - return sha1_avx2_finup(desc, NULL, 0, out); -} - static struct shash_alg sha1_avx2_alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = sha1_avx2_update, - .final = sha1_avx2_final, .finup = sha1_avx2_finup, - .descsize = sizeof(struct sha1_state), + .descsize = SHA1_STATE_SIZE, .base = { .cra_name = "sha1", .cra_driver_name = "sha1-avx2", .cra_priority = 170, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -256,7 +228,6 @@ static void unregister_sha1_avx2(void) crypto_unregister_shash(&sha1_avx2_alg); } -#ifdef CONFIG_AS_SHA1_NI asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data, int rounds); @@ -272,22 +243,17 @@ static int sha1_ni_finup(struct shash_desc *desc, const u8 *data, return sha1_finup(desc, data, len, out, sha1_ni_transform); } -static int sha1_ni_final(struct shash_desc *desc, u8 *out) -{ - return sha1_ni_finup(desc, NULL, 0, out); -} - static struct shash_alg sha1_ni_alg = { .digestsize = SHA1_DIGEST_SIZE, .init = 
sha1_base_init, .update = sha1_ni_update, - .final = sha1_ni_final, .finup = sha1_ni_finup, - .descsize = sizeof(struct sha1_state), + .descsize = SHA1_STATE_SIZE, .base = { .cra_name = "sha1", .cra_driver_name = "sha1-ni", .cra_priority = 250, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -306,11 +272,6 @@ static void unregister_sha1_ni(void) crypto_unregister_shash(&sha1_ni_alg); } -#else -static inline int register_sha1_ni(void) { return 0; } -static inline void unregister_sha1_ni(void) { } -#endif - static int __init sha1_ssse3_mod_init(void) { if (!x86_match_cpu(module_cpu_ids)) @@ -360,6 +321,4 @@ MODULE_ALIAS_CRYPTO("sha1"); MODULE_ALIAS_CRYPTO("sha1-ssse3"); MODULE_ALIAS_CRYPTO("sha1-avx"); MODULE_ALIAS_CRYPTO("sha1-avx2"); -#ifdef CONFIG_AS_SHA1_NI MODULE_ALIAS_CRYPTO("sha1-ni"); -#endif diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S deleted file mode 100644 index 53de72bdd851..000000000000 --- a/arch/x86/crypto/sha256-avx-asm.S +++ /dev/null @@ -1,499 +0,0 @@ -######################################################################## -# Implement fast SHA-256 with AVX1 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford <james.guilford@intel.com> -# Kirk Yap <kirk.s.yap@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-256 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. 
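The macros below are a straight vectorization of the FIPS 180-4 SHA-256 recurrence. For reference, one message-schedule step and one round in scalar C; names are illustrative except K256, the constant table the code indexes through TBL, and ch/maj are written the way the assembly computes them:

#define ROR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))

	/* message schedule: one new dword from four older ones */
	u32 s0 = ROR32(w[i-15], 7) ^ ROR32(w[i-15], 18) ^ (w[i-15] >> 3);
	u32 s1 = ROR32(w[i-2], 17) ^ ROR32(w[i-2], 19) ^ (w[i-2] >> 10);
	w[i] = w[i-16] + s0 + w[i-7] + s1;

	/* one round of the compression function */
	u32 S1  = ROR32(e, 6) ^ ROR32(e, 11) ^ ROR32(e, 25);
	u32 ch  = ((f ^ g) & e) ^ g;		/* == (e&f) ^ (~e&g) */
	u32 t1  = h + S1 + ch + K256[i] + w[i];
	u32 S0  = ROR32(a, 2) ^ ROR32(a, 13) ^ ROR32(a, 22);
	u32 maj = ((a | c) & b) | (a & c);	/* == standard Maj(a,b,c) */
	u32 t2  = S0 + maj;
	h = g; g = f; f = e; e = d + t1;
	d = c; c = b; b = a; a = t1 + t2;
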
-# -######################################################################## -# This code schedules 1 block at a time, with 4 lanes per block -######################################################################## - -#include <linux/linkage.h> -#include <linux/cfi_types.h> - -## assume buffers not aligned -#define VMOVDQ vmovdqu - -################################ Define Macros - -# addm [mem], reg -# Add reg to mem using reg-mem add and store -.macro addm p1 p2 - add \p1, \p2 - mov \p2, \p1 -.endm - - -.macro MY_ROR p1 p2 - shld $(32-(\p1)), \p2, \p2 -.endm - -################################ - -# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask -# Load xmm with mem and byte swap each dword -.macro COPY_XMM_AND_BSWAP p1 p2 p3 - VMOVDQ \p2, \p1 - vpshufb \p3, \p1, \p1 -.endm - -################################ - -X0 = %xmm4 -X1 = %xmm5 -X2 = %xmm6 -X3 = %xmm7 - -XTMP0 = %xmm0 -XTMP1 = %xmm1 -XTMP2 = %xmm2 -XTMP3 = %xmm3 -XTMP4 = %xmm8 -XFER = %xmm9 -XTMP5 = %xmm11 - -SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA -SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00 -BYTE_FLIP_MASK = %xmm13 - -NUM_BLKS = %rdx # 3rd arg -INP = %rsi # 2nd arg -CTX = %rdi # 1st arg - -SRND = %rsi # clobbers INP -c = %ecx -d = %r8d -e = %edx -TBL = %r12 -a = %eax -b = %ebx - -f = %r9d -g = %r10d -h = %r11d - -y0 = %r13d -y1 = %r14d -y2 = %r15d - - -_INP_END_SIZE = 8 -_INP_SIZE = 8 -_XFER_SIZE = 16 -_XMM_SAVE_SIZE = 0 - -_INP_END = 0 -_INP = _INP_END + _INP_END_SIZE -_XFER = _INP + _INP_SIZE -_XMM_SAVE = _XFER + _XFER_SIZE -STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE - -# rotate_Xs -# Rotate values of symbols X0...X3 -.macro rotate_Xs -X_ = X0 -X0 = X1 -X1 = X2 -X2 = X3 -X3 = X_ -.endm - -# ROTATE_ARGS -# Rotate values of symbols a...h -.macro ROTATE_ARGS -TMP_ = h -h = g -g = f -f = e -e = d -d = c -c = b -b = a -a = TMP_ -.endm - -.macro FOUR_ROUNDS_AND_SCHED - ## compute s0 four at a time and s1 two at a time - ## compute W[-16] + W[-7] 4 at a time - - mov e, y0 # y0 = e - MY_ROR (25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] - MY_ROR (22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16] - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - ## compute s0 - vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y0, y2 # y2 = S1 + CH - add _XFER(%rsp), y2 # y2 = k + w + S1 + CH - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpsrld $7, XTMP1, XTMP2 - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - vpslld $(32-7), XTMP1, XTMP3 - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - mov e, y0 # y0 = e - mov a, y1 # y1 = a - MY_ROR (25-11), y0 # y0 = e >> (25-11) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - MY_ROR (22-13), y1 # y1 = a >> (22-13) - vpsrld $18, XTMP1, XTMP2 # - xor a, y1 # y1 = a ^ (a >> (22-13) - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e 
>> (25-6)) - xor g, y2 # y2 = f^g - vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - vpslld $(32-18), XTMP1, XTMP1 - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - vpxor XTMP1, XTMP3, XTMP3 # - add y0, y2 # y2 = S1 + CH - add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - ## compute low s1 - vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - mov e, y0 # y0 = e - mov a, y1 # y1 = a - MY_ROR (25-11), y0 # y0 = e >> (25-11) - xor e, y0 # y0 = e ^ (e >> (25-11)) - MY_ROR (22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} - xor g, y2 # y2 = f^g - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA} - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - vpxor XTMP3, XTMP2, XTMP2 # - add y0, y2 # y2 = S1 + CH - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - ## compute high s1 - vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - mov e, y0 # y0 = e - MY_ROR (25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - MY_ROR (22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC} - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - vpxor XTMP3, XTMP2, XTMP2 - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y0, y2 # y2 = S1 + CH - add (3*4 + _XFER)(%rsp), y2 # y2 = k + 
w + S1 + CH - vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - rotate_Xs -.endm - -## input is [rsp + _XFER + %1 * 4] -.macro DO_ROUND round - mov e, y0 # y0 = e - MY_ROR (25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - xor e, y0 # y0 = e ^ (e >> (25-11)) - MY_ROR (22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = f^g - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - and e, y2 # y2 = (f^g)&e - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - add y0, y2 # y2 = S1 + CH - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - offset = \round * 4 + _XFER # - add offset(%rsp), y2 # y2 = k + w + S1 + CH - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS -.endm - -######################################################################## -## void sha256_transform_avx(state sha256_state *state, const u8 *data, int blocks) -## arg 1 : pointer to state -## arg 2 : pointer to input data -## arg 3 : Num blocks -######################################################################## -.text -SYM_TYPED_FUNC_START(sha256_transform_avx) - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbp - movq %rsp, %rbp - - subq $STACK_SIZE, %rsp # allocate stack space - and $~15, %rsp # align stack pointer - - shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash - add INP, NUM_BLKS # pointer to end of data - mov NUM_BLKS, _INP_END(%rsp) - - ## load initial digest - mov 4*0(CTX), a - mov 4*1(CTX), b - mov 4*2(CTX), c - mov 4*3(CTX), d - mov 4*4(CTX), e - mov 4*5(CTX), f - mov 4*6(CTX), g - mov 4*7(CTX), h - - vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - vmovdqa _SHUF_00BA(%rip), SHUF_00BA - vmovdqa _SHUF_DC00(%rip), SHUF_DC00 -.Lloop0: - lea K256(%rip), TBL - - ## byte swap first 16 dwords - COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK - - mov INP, _INP(%rsp) - - ## schedule 48 input dwords, by doing 3 rounds of 16 each - mov $3, SRND -.align 16 -.Lloop1: - vpaddd (TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddd 1*16(TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddd 2*16(TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddd 3*16(TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - add $4*16, TBL - FOUR_ROUNDS_AND_SCHED - - sub $1, SRND - jne .Lloop1 - - mov $2, SRND -.Lloop2: - vpaddd (TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 
2 - DO_ROUND 3 - - vpaddd 1*16(TBL), X1, XFER - vmovdqa XFER, _XFER(%rsp) - add $2*16, TBL - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - vmovdqa X2, X0 - vmovdqa X3, X1 - - sub $1, SRND - jne .Lloop2 - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - mov _INP(%rsp), INP - add $64, INP - cmp _INP_END(%rsp), INP - jne .Lloop0 - -.Ldone_hash: - - mov %rbp, %rsp - popq %rbp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - RET -SYM_FUNC_END(sha256_transform_avx) - -.section .rodata.cst256.K256, "aM", @progbits, 256 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 -.align 16 -# shuffle xBxA -> 00BA -_SHUF_00BA: - .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 - -.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 -.align 16 -# shuffle xDxC -> DC00 -_SHUF_DC00: - .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S deleted file mode 100644 index 9918212faf91..000000000000 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ /dev/null @@ -1,773 +0,0 @@ -######################################################################## -# Implement fast SHA-256 with AVX2 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford <james.guilford@intel.com> -# Kirk Yap <kirk.s.yap@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-256 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. -# -######################################################################## -# This code schedules 2 blocks at a time, with 4 lanes per block -######################################################################## - -#include <linux/linkage.h> -#include <linux/cfi_types.h> - -## assume buffers not aligned -#define VMOVDQ vmovdqu - -################################ Define Macros - -# addm [mem], reg -# Add reg to mem using reg-mem add and store -.macro addm p1 p2 - add \p1, \p2 - mov \p2, \p1 -.endm - -################################ - -X0 = %ymm4 -X1 = %ymm5 -X2 = %ymm6 -X3 = %ymm7 - -# XMM versions of above -XWORD0 = %xmm4 -XWORD1 = %xmm5 -XWORD2 = %xmm6 -XWORD3 = %xmm7 - -XTMP0 = %ymm0 -XTMP1 = %ymm1 -XTMP2 = %ymm2 -XTMP3 = %ymm3 -XTMP4 = %ymm8 -XFER = %ymm9 -XTMP5 = %ymm11 - -SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA -SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00 -BYTE_FLIP_MASK = %ymm13 - -X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK - -NUM_BLKS = %rdx # 3rd arg -INP = %rsi # 2nd arg -CTX = %rdi # 1st arg -c = %ecx -d = %r8d -e = %edx # clobbers NUM_BLKS -y3 = %esi # clobbers INP - -SRND = CTX # SRND is same register as CTX - -a = %eax -b = %ebx -f = %r9d -g = %r10d -h = %r11d -old_h = %r11d - -T1 = %r12d -y0 = %r13d -y1 = %r14d -y2 = %r15d - - -_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round -_XMM_SAVE_SIZE = 0 -_INP_END_SIZE = 8 -_INP_SIZE = 8 -_CTX_SIZE = 8 - -_XFER = 0 -_XMM_SAVE = _XFER + _XFER_SIZE -_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE -_INP = _INP_END + _INP_END_SIZE -_CTX = _INP + _INP_SIZE -STACK_SIZE = _CTX + _CTX_SIZE - -# rotate_Xs -# Rotate values of symbols X0...X3 -.macro rotate_Xs - X_ = X0 - X0 = X1 - X1 = X2 - X2 = X3 - X3 = X_ -.endm - -# ROTATE_ARGS -# Rotate values of symbols a...h -.macro ROTATE_ARGS - old_h = h - TMP_ = h - h = g - g = f - f = e - e = d - d = c - c = b - b = a - a = TMP_ -.endm - -.macro FOUR_ROUNDS_AND_SCHED disp -################################### RND N + 0 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - - addl \disp(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] - mov f, y2 # y2 = f # CH - rorx $13, a, T1 # T1 = a >> 13 # S0B - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - xor g, y2 # y2 = f^g # CH - vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - - and e, y2 # y2 = (f^g)&e # CH - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $22, a, y1 # y1 = a >> 22 # S0A - add h, d # d = k + w + h + d # -- - - and b, y3 # y3 = (a|c)&b # MAJA - vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - vpsrld $7, XTMP1, XTMP2 - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - - add y0, y2 # y2 
= S1 + CH # -- - vpslld $(32-7), XTMP1, XTMP3 - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 - - vpsrld $18, XTMP1, XTMP2 - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - - ROTATE_ARGS - -################################### RND N + 1 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - offset = \disp + 1*4 - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - - vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 - mov f, y2 # y2 = f # CH - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - xor g, y2 # y2 = f^g # CH - - - rorx $6, e, y1 # y1 = (e >> 6) # S1 - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $22, a, y1 # y1 = a >> 22 # S0A - and e, y2 # y2 = (f^g)&e # CH - add h, d # d = k + w + h + d # -- - - vpslld $(32-18), XTMP1, XTMP1 - and b, y3 # y3 = (a|c)&b # MAJA - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - - vpxor XTMP1, XTMP3, XTMP3 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 - vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} - - - ROTATE_ARGS - -################################### RND N + 2 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - offset = \disp + 2*4 - addl offset(%rsp, SRND), h # h = k + w + h # -- - - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} - rorx $11, e, y1 # y1 = e >> 11 # S1B - or c, y3 # y3 = a|c # MAJA - mov f, y2 # y2 = f # CH - xor g, y2 # y2 = f^g # CH - - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} - and e, y2 # y2 = (f^g)&e # CH - - rorx $6, e, y1 # y1 = (e >> 6) # S1 - vpxor XTMP3, XTMP2, XTMP2 - add h, d # d = k + w + h + d # -- - and b, y3 # y3 = (a|c)&b # MAJA - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $22, a, y1 # y1 = a >> 22 # S0A - vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a ,T1 # T1 = (a >> 2) # S0 - vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} - - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1,h # h = k + w + h + S0 # -- - add y2,d # d = k + w + h + d + S1 + CH = d + t1 # -- - add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - - add y3,h # h = t1 + S0 + MAJ # -- - - - ROTATE_ARGS - -################################### RND N + 
3 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - offset = \disp + 3*4 - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - - vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} - mov f, y2 # y2 = f # CH - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - xor g, y2 # y2 = f^g # CH - - - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add h, d # d = k + w + h + d # -- - and b, y3 # y3 = (a|c)&b # MAJA - - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - vpxor XTMP3, XTMP2, XTMP2 - rorx $22, a, y1 # y1 = a >> 22 # S0A - add y0, y2 # y2 = S1 + CH # -- - - vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - rorx $2, a, T1 # T1 = (a >> 2) # S0 - vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} - - vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - - add y1, h # h = k + w + h + S0 # -- - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - ROTATE_ARGS - rotate_Xs -.endm - -.macro DO_4ROUNDS disp -################################### RND N + 0 ########################### - - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - addl \disp(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - ROTATE_ARGS - -################################### RND N + 1 ########################### - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - offset = 4*1 + \disp - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 
= a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - ROTATE_ARGS - -################################### RND N + 2 ############################## - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - offset = 4*2 + \disp - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - ROTATE_ARGS - -################################### RND N + 3 ########################### - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - offset = 4*3 + \disp - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - - add y3, h # h = t1 + S0 + MAJ # -- - - ROTATE_ARGS - -.endm - -######################################################################## -## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks) -## arg 1 : pointer to state -## arg 2 : pointer to input data -## arg 3 : Num blocks -######################################################################## -.text -SYM_TYPED_FUNC_START(sha256_transform_rorx) - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - push %rbp - mov %rsp, %rbp - - subq $STACK_SIZE, %rsp - and $-32, %rsp # align rsp to 32 byte boundary - - shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash - lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block - mov NUM_BLKS, _INP_END(%rsp) - - cmp NUM_BLKS, INP - je .Lonly_one_block - - ## load initial digest - mov (CTX), a - mov 4*1(CTX), b - mov 4*2(CTX), c 
- mov 4*3(CTX), d - mov 4*4(CTX), e - mov 4*5(CTX), f - mov 4*6(CTX), g - mov 4*7(CTX), h - - vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - vmovdqa _SHUF_00BA(%rip), SHUF_00BA - vmovdqa _SHUF_DC00(%rip), SHUF_DC00 - - mov CTX, _CTX(%rsp) - -.Lloop0: - ## Load first 16 dwords from two blocks - VMOVDQ 0*32(INP),XTMP0 - VMOVDQ 1*32(INP),XTMP1 - VMOVDQ 2*32(INP),XTMP2 - VMOVDQ 3*32(INP),XTMP3 - - ## byte swap data - vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0 - vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1 - vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2 - vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3 - - ## transpose data into high/low halves - vperm2i128 $0x20, XTMP2, XTMP0, X0 - vperm2i128 $0x31, XTMP2, XTMP0, X1 - vperm2i128 $0x20, XTMP3, XTMP1, X2 - vperm2i128 $0x31, XTMP3, XTMP1, X3 - -.Llast_block_enter: - add $64, INP - mov INP, _INP(%rsp) - - ## schedule 48 input dwords, by doing 3 rounds of 12 each - xor SRND, SRND - -.align 16 -.Lloop1: - leaq K256+0*32(%rip), INP ## reuse INP as scratch reg - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 0*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED _XFER + 0*32 - - leaq K256+1*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 1*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED _XFER + 1*32 - - leaq K256+2*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 2*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED _XFER + 2*32 - - leaq K256+3*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 3*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED _XFER + 3*32 - - add $4*32, SRND - cmp $3*4*32, SRND - jb .Lloop1 - -.Lloop2: - ## Do last 16 rounds with no scheduling - leaq K256+0*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 0*32+_XFER(%rsp, SRND) - DO_4ROUNDS _XFER + 0*32 - - leaq K256+1*32(%rip), INP - vpaddd (INP, SRND), X1, XFER - vmovdqa XFER, 1*32+_XFER(%rsp, SRND) - DO_4ROUNDS _XFER + 1*32 - add $2*32, SRND - - vmovdqa X2, X0 - vmovdqa X3, X1 - - cmp $4*4*32, SRND - jb .Lloop2 - - mov _CTX(%rsp), CTX - mov _INP(%rsp), INP - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - cmp _INP_END(%rsp), INP - ja .Ldone_hash - - #### Do second block using previously scheduled results - xor SRND, SRND -.align 16 -.Lloop3: - DO_4ROUNDS _XFER + 0*32 + 16 - DO_4ROUNDS _XFER + 1*32 + 16 - add $2*32, SRND - cmp $4*4*32, SRND - jb .Lloop3 - - mov _CTX(%rsp), CTX - mov _INP(%rsp), INP - add $64, INP - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - cmp _INP_END(%rsp), INP - jb .Lloop0 - ja .Ldone_hash - -.Ldo_last_block: - VMOVDQ 0*16(INP),XWORD0 - VMOVDQ 1*16(INP),XWORD1 - VMOVDQ 2*16(INP),XWORD2 - VMOVDQ 3*16(INP),XWORD3 - - vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0 - vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1 - vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2 - vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3 - - jmp .Llast_block_enter - -.Lonly_one_block: - - ## load initial digest - mov (4*0)(CTX),a - mov (4*1)(CTX),b - mov (4*2)(CTX),c - mov (4*3)(CTX),d - mov (4*4)(CTX),e - mov (4*5)(CTX),f - mov (4*6)(CTX),g - mov (4*7)(CTX),h - - vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - vmovdqa _SHUF_00BA(%rip), SHUF_00BA - vmovdqa _SHUF_DC00(%rip), SHUF_DC00 - - mov CTX, _CTX(%rsp) - jmp .Ldo_last_block - -.Ldone_hash: - - mov %rbp, %rsp - pop %rbp - - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - RET -SYM_FUNC_END(sha256_transform_rorx) 
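For reference, the scalar computation that the FOUR_ROUNDS_AND_SCHED and DO_4ROUNDS macros above interleave with the vector message-schedule math is the standard FIPS 180-4 SHA-256 compression step: each round forms two sigma sums plus CH and MAJ, folds in one round constant and one message word, and shifts the a..h working variables down by one position (which ROTATE_ARGS does by renaming registers instead of moving data). A minimal plain-C sketch of one 64-byte block, given only as an editor's illustration against the K256 table below — the helper names sha256_block_ref() and ror32() are invented for the sketch and are not part of this tree:

#include <stdint.h>
#include <string.h>

static inline uint32_t ror32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

/* One 64-byte block; K points at the 64 round constants (K256 below). */
static void sha256_block_ref(uint32_t state[8], const uint8_t data[64],
			     const uint32_t K[64])
{
	uint32_t W[64], v[8];
	int t;

	/* Big-endian load of the 16 message dwords (the pshufb byte flip). */
	for (t = 0; t < 16; t++)
		W[t] = ((uint32_t)data[4 * t] << 24) |
		       ((uint32_t)data[4 * t + 1] << 16) |
		       ((uint32_t)data[4 * t + 2] << 8) |
		       data[4 * t + 3];

	/* Message schedule: the s0/s1 work the vector units do above. */
	for (t = 16; t < 64; t++) {
		uint32_t s0 = ror32(W[t - 15], 7) ^ ror32(W[t - 15], 18) ^
			      (W[t - 15] >> 3);
		uint32_t s1 = ror32(W[t - 2], 17) ^ ror32(W[t - 2], 19) ^
			      (W[t - 2] >> 10);

		W[t] = W[t - 16] + s0 + W[t - 7] + s1;
	}

	memcpy(v, state, sizeof(v));

	/* v[0..7] = a..h */
	for (t = 0; t < 64; t++) {
		uint32_t S1 = ror32(v[4], 6) ^ ror32(v[4], 11) ^ ror32(v[4], 25);
		uint32_t S0 = ror32(v[0], 2) ^ ror32(v[0], 13) ^ ror32(v[0], 22);
		uint32_t ch = ((v[5] ^ v[6]) & v[4]) ^ v[6];	/* CH of e,f,g */
		uint32_t maj = ((v[0] | v[2]) & v[1]) | (v[0] & v[2]); /* MAJ of a,b,c */
		uint32_t t1 = v[7] + S1 + ch + K[t] + W[t];
		uint32_t t2 = S0 + maj;

		/* Shift a..h down one slot; ROTATE_ARGS does this by renaming. */
		memmove(&v[1], &v[0], 7 * sizeof(v[0]));
		v[4] += t1;		/* e = d + t1 */
		v[0] = t1 + t2;		/* a = t1 + t2 */
	}

	/* The addm tail: fold the working variables back into the digest. */
	for (t = 0; t < 8; t++)
		state[t] += v[t];
}

On top of this, the AVX2 routine gains its throughput by pushing two blocks at once through the 256-bit lanes (four dwords per block in each ymm half, per the file's header comment) and by using rorx, which rotates without writing the flags and therefore schedules more freely than ror.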
- -.section .rodata.cst512.K256, "aM", @progbits, 512 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 -.align 32 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 - -# shuffle xBxA -> 00BA -.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32 -.align 32 -_SHUF_00BA: - .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 - -# shuffle xDxC -> DC00 -.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32 -.align 32 -_SHUF_DC00: - .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S deleted file mode 100644 index 93264ee44543..000000000000 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ /dev/null @@ -1,513 +0,0 @@ -######################################################################## -# Implement fast SHA-256 with SSSE3 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford <james.guilford@intel.com> -# Kirk Yap <kirk.s.yap@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. 
-# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-256 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. -# -######################################################################## - -#include <linux/linkage.h> -#include <linux/cfi_types.h> - -## assume buffers not aligned -#define MOVDQ movdqu - -################################ Define Macros - -# addm [mem], reg -# Add reg to mem using reg-mem add and store -.macro addm p1 p2 - add \p1, \p2 - mov \p2, \p1 -.endm - -################################ - -# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask -# Load xmm with mem and byte swap each dword -.macro COPY_XMM_AND_BSWAP p1 p2 p3 - MOVDQ \p2, \p1 - pshufb \p3, \p1 -.endm - -################################ - -X0 = %xmm4 -X1 = %xmm5 -X2 = %xmm6 -X3 = %xmm7 - -XTMP0 = %xmm0 -XTMP1 = %xmm1 -XTMP2 = %xmm2 -XTMP3 = %xmm3 -XTMP4 = %xmm8 -XFER = %xmm9 - -SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA -SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00 -BYTE_FLIP_MASK = %xmm12 - -NUM_BLKS = %rdx # 3rd arg -INP = %rsi # 2nd arg -CTX = %rdi # 1st arg - -SRND = %rsi # clobbers INP -c = %ecx -d = %r8d -e = %edx -TBL = %r12 -a = %eax -b = %ebx - -f = %r9d -g = %r10d -h = %r11d - -y0 = %r13d -y1 = %r14d -y2 = %r15d - - - -_INP_END_SIZE = 8 -_INP_SIZE = 8 -_XFER_SIZE = 16 -_XMM_SAVE_SIZE = 0 - -_INP_END = 0 -_INP = _INP_END + _INP_END_SIZE -_XFER = _INP + _INP_SIZE -_XMM_SAVE = _XFER + _XFER_SIZE -STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE - -# rotate_Xs -# Rotate values of symbols X0...X3 -.macro rotate_Xs -X_ = X0 -X0 = X1 -X1 = X2 -X2 = X3 -X3 = X_ -.endm - -# ROTATE_ARGS -# Rotate values of symbols a...h -.macro ROTATE_ARGS -TMP_ = h -h = g -g = f -f = e -e = d -d = c -c = b -b = a -a = TMP_ -.endm - -.macro FOUR_ROUNDS_AND_SCHED - ## compute s0 four at a time and s1 two at a time - ## compute W[-16] + W[-7] 4 at a time - movdqa X3, XTMP0 - mov e, y0 # y0 = e - ror $(25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - palignr $4, X2, XTMP0 # XTMP0 = W[-7] - ror $(22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - movdqa X1, XTMP1 - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16] - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - ## compute s0 - palignr $4, X0, XTMP1 # XTMP1 = W[-15] - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - movdqa XTMP1, XTMP2 # XTMP2 = W[-15] - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ 
(a>>22) - add y0, y2 # y2 = S1 + CH - add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH - movdqa XTMP1, XTMP3 # XTMP3 = W[-15] - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pslld $(32-7), XTMP1 # - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - psrld $7, XTMP2 # - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - # - ROTATE_ARGS # - movdqa XTMP3, XTMP2 # XTMP2 = W[-15] - mov e, y0 # y0 = e - mov a, y1 # y1 = a - movdqa XTMP3, XTMP4 # XTMP4 = W[-15] - ror $(25-11), y0 # y0 = e >> (25-11) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - ror $(22-13), y1 # y1 = a >> (22-13) - pslld $(32-18), XTMP3 # - xor a, y1 # y1 = a ^ (a >> (22-13) - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = f^g - psrld $18, XTMP2 # - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - pxor XTMP3, XTMP1 - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - psrld $3, XTMP4 # XTMP4 = W[-15] >> 3 - add y0, y2 # y2 = S1 + CH - add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pxor XTMP4, XTMP1 # XTMP1 = s0 - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - ## compute low s1 - pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - - ROTATE_ARGS - movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA} - mov e, y0 # y0 = e - mov a, y1 # y1 = a - ror $(25-11), y0 # y0 = e >> (25-11) - movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA} - xor e, y0 # y0 = e ^ (e >> (25-11)) - ror $(22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} - xor g, y2 # y2 = f^g - psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - pxor XTMP3, XTMP2 - add y0, y2 # y2 = S1 + CH - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - ## compute high s1 - pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA} - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - # - ROTATE_ARGS # - movdqa XTMP2, XTMP3 # XTMP3 = 
W[-2] {DDCC} - mov e, y0 # y0 = e - ror $(25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - movdqa XTMP2, X0 # X0 = W[-2] {DDCC} - ror $(22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25 - and e, y2 # y2 = (f^g)&e - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - psrld $10, X0 # X0 = W[-2] >> 10 {DDCC} - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22 - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2 - xor g, y2 # y2 = CH = ((f^g)&e)^g - pxor XTMP3, XTMP2 # - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2 - add y0, y2 # y2 = S1 + CH - add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - pxor XTMP2, X0 # X0 = s1 {xDxC} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pshufb SHUF_DC00, X0 # X0 = s1 {DC00} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - - ROTATE_ARGS - rotate_Xs -.endm - -## input is [rsp + _XFER + %1 * 4] -.macro DO_ROUND round - mov e, y0 # y0 = e - ror $(25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - xor e, y0 # y0 = e ^ (e >> (25-11)) - ror $(22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = f^g - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - and e, y2 # y2 = (f^g)&e - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - add y0, y2 # y2 = S1 + CH - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - offset = \round * 4 + _XFER - add offset(%rsp), y2 # y2 = k + w + S1 + CH - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS -.endm - -######################################################################## -## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data, -## int blocks); -## arg 1 : pointer to state -## (struct sha256_state is assumed to begin with u32 state[8]) -## arg 2 : pointer to input data -## arg 3 : Num blocks -######################################################################## -.text -SYM_TYPED_FUNC_START(sha256_transform_ssse3) - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbp - mov %rsp, %rbp - - subq $STACK_SIZE, %rsp - and $~15, %rsp - - shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash - add INP, NUM_BLKS - mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data - - ## load initial digest - mov 4*0(CTX), a - mov 4*1(CTX), b - mov 4*2(CTX), c - mov 4*3(CTX), d - mov 4*4(CTX), e - mov 4*5(CTX), f - mov 4*6(CTX), g - mov 4*7(CTX), h - - movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - movdqa _SHUF_00BA(%rip), SHUF_00BA - movdqa _SHUF_DC00(%rip), SHUF_DC00 - -.Lloop0: 
- lea K256(%rip), TBL - - ## byte swap first 16 dwords - COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK - - mov INP, _INP(%rsp) - - ## schedule 48 input dwords, by doing 3 rounds of 16 each - mov $3, SRND -.align 16 -.Lloop1: - movdqa (TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - movdqa 1*16(TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - movdqa 2*16(TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - movdqa 3*16(TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - add $4*16, TBL - FOUR_ROUNDS_AND_SCHED - - sub $1, SRND - jne .Lloop1 - - mov $2, SRND -.Lloop2: - paddd (TBL), X0 - movdqa X0, _XFER(%rsp) - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - paddd 1*16(TBL), X1 - movdqa X1, _XFER(%rsp) - add $2*16, TBL - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - movdqa X2, X0 - movdqa X3, X1 - - sub $1, SRND - jne .Lloop2 - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - mov _INP(%rsp), INP - add $64, INP - cmp _INP_END(%rsp), INP - jne .Lloop0 - -.Ldone_hash: - - mov %rbp, %rsp - popq %rbp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - - RET -SYM_FUNC_END(sha256_transform_ssse3) - -.section .rodata.cst256.K256, "aM", @progbits, 256 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 -.align 16 -# shuffle xBxA -> 00BA -_SHUF_00BA: - .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 - -.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 -.align 16 -# shuffle xDxC -> DC00 -_SHUF_DC00: - .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S deleted file mode 100644 index 537b6dcd7ed8..000000000000 --- a/arch/x86/crypto/sha256_ni_asm.S +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Intel SHA Extensions optimized implementation of a SHA-256 update function - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2015 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Contact Information: - * Sean Gulley <sean.m.gulley@intel.com> - * Tim Chen <tim.c.chen@linux.intel.com> - * - * BSD LICENSE - * - * Copyright(c) 2015 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include <linux/linkage.h> -#include <linux/cfi_types.h> - -#define DIGEST_PTR %rdi /* 1st arg */ -#define DATA_PTR %rsi /* 2nd arg */ -#define NUM_BLKS %rdx /* 3rd arg */ - -#define SHA256CONSTANTS %rax - -#define MSG %xmm0 -#define STATE0 %xmm1 -#define STATE1 %xmm2 -#define MSGTMP0 %xmm3 -#define MSGTMP1 %xmm4 -#define MSGTMP2 %xmm5 -#define MSGTMP3 %xmm6 -#define MSGTMP4 %xmm7 - -#define SHUF_MASK %xmm8 - -#define ABEF_SAVE %xmm9 -#define CDGH_SAVE %xmm10 - -/* - * Intel SHA Extensions optimized implementation of a SHA-256 update function - * - * The function takes a pointer to the current hash values, a pointer to the - * input data, and a number of 64 byte blocks to process. Once all blocks have - * been processed, the digest pointer is updated with the resulting hash value. - * The function only processes complete blocks, there is no functionality to - * store partial blocks. All message padding and hash value initialization must - * be done outside the update function. - * - * The indented lines in the loop are instructions related to rounds processing. - * The non-indented lines are instructions related to the message schedule. 
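- * (Editor's illustration, not part of the original file: sha256rnds2
- * performs two rounds per invocation and expects the eight state words
- * split across two xmm registers as {A,B,E,F} and {C,D,G,H}, while the
- * digest in memory is laid out {A,B,C,D},{E,F,G,H}. Hence the
- * pshufd/palignr/pblendw shuffle on entry and the inverse permutation
- * before the final movdqu stores.)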
- * - * void sha256_ni_transform(uint32_t *digest, const void *data, - uint32_t numBlocks); - * digest : pointer to digest - * data: pointer to input data - * numBlocks: Number of blocks to process - */ - -.text -SYM_TYPED_FUNC_START(sha256_ni_transform) - - shl $6, NUM_BLKS /* convert to bytes */ - jz .Ldone_hash - add DATA_PTR, NUM_BLKS /* pointer to end of data */ - - /* - * load initial hash values - * Need to reorder these appropriately - * DCBA, HGFE -> ABEF, CDGH - */ - movdqu 0*16(DIGEST_PTR), STATE0 - movdqu 1*16(DIGEST_PTR), STATE1 - - pshufd $0xB1, STATE0, STATE0 /* CDAB */ - pshufd $0x1B, STATE1, STATE1 /* EFGH */ - movdqa STATE0, MSGTMP4 - palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ - - movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK - lea K256(%rip), SHA256CONSTANTS - -.Lloop0: - /* Save hash values for addition after rounds */ - movdqa STATE0, ABEF_SAVE - movdqa STATE1, CDGH_SAVE - - /* Rounds 0-3 */ - movdqu 0*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP0 - paddd 0*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Rounds 4-7 */ - movdqu 1*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP1 - paddd 1*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 8-11 */ - movdqu 2*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP2 - paddd 2*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 12-15 */ - movdqu 3*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP3 - paddd 3*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 16-19 */ - movdqa MSGTMP0, MSG - paddd 4*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 20-23 */ - movdqa MSGTMP1, MSG - paddd 5*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 24-27 */ - movdqa MSGTMP2, MSG - paddd 6*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 28-31 */ - movdqa MSGTMP3, MSG - paddd 7*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 32-35 */ - movdqa MSGTMP0, MSG - paddd 8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 36-39 */ 
- movdqa MSGTMP1, MSG - paddd 9*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 40-43 */ - movdqa MSGTMP2, MSG - paddd 10*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 44-47 */ - movdqa MSGTMP3, MSG - paddd 11*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 48-51 */ - movdqa MSGTMP0, MSG - paddd 12*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 52-55 */ - movdqa MSGTMP1, MSG - paddd 13*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Rounds 56-59 */ - movdqa MSGTMP2, MSG - paddd 14*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Rounds 60-63 */ - movdqa MSGTMP3, MSG - paddd 15*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Add current hash values with previously saved */ - paddd ABEF_SAVE, STATE0 - paddd CDGH_SAVE, STATE1 - - /* Increment data pointer and loop if more to process */ - add $64, DATA_PTR - cmp NUM_BLKS, DATA_PTR - jne .Lloop0 - - /* Write hash values back in the correct order */ - pshufd $0x1B, STATE0, STATE0 /* FEBA */ - pshufd $0xB1, STATE1, STATE1 /* DCHG */ - movdqa STATE0, MSGTMP4 - pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, MSGTMP4, STATE1 /* HGFE */ - - movdqu STATE0, 0*16(DIGEST_PTR) - movdqu STATE1, 1*16(DIGEST_PTR) - -.Ldone_hash: - - RET -SYM_FUNC_END(sha256_ni_transform) - -.section .rodata.cst256.K256, "aM", @progbits, 256 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 
0x0c0d0e0f08090a0b0405060700010203 diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c deleted file mode 100644 index e04a43d9f7d5..000000000000 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Cryptographic API. - * - * Glue code for the SHA256 Secure Hash Algorithm assembler implementations - * using SSSE3, AVX, AVX2, and SHA-NI instructions. - * - * This file is based on sha256_generic.c - * - * Copyright (C) 2013 Intel Corporation. - * - * Author: - * Tim Chen <tim.c.chen@linux.intel.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/types.h> -#include <crypto/sha2.h> -#include <crypto/sha256_base.h> -#include <linux/string.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> - -asmlinkage void sha256_transform_ssse3(struct sha256_state *state, - const u8 *data, int blocks); - -static const struct x86_cpu_id module_cpu_ids[] = { -#ifdef CONFIG_AS_SHA256_NI - X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL), -#endif - X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), - X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), - X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); - -static int _sha256_update(struct shash_desc *desc, const u8 *data, - unsigned int len, sha256_block_fn *sha256_xform) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable() || - (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) - return crypto_sha256_update(desc, data, len); - - /* - * Make sure struct sha256_state begins directly with the SHA256 - * 256-bit internal state, as this is what the asm functions expect. 
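- * (Editor's note, not part of the original file: the asm transforms are
- * declared as taking struct sha256_state *, but they only read and write
- * the leading u32 state[8]; the BUILD_BUG_ON below turns any future
- * reordering of the struct into a compile-time error rather than silent
- * digest corruption.)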
- */ - BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0); - - kernel_fpu_begin(); - sha256_base_do_update(desc, data, len, sha256_xform); - kernel_fpu_end(); - - return 0; -} - -static int sha256_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, sha256_block_fn *sha256_xform) -{ - if (!crypto_simd_usable()) - return crypto_sha256_finup(desc, data, len, out); - - kernel_fpu_begin(); - if (len) - sha256_base_do_update(desc, data, len, sha256_xform); - sha256_base_do_finalize(desc, sha256_xform); - kernel_fpu_end(); - - return sha256_base_finish(desc, out); -} - -static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return _sha256_update(desc, data, len, sha256_transform_ssse3); -} - -static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_finup(desc, data, len, out, sha256_transform_ssse3); -} - -/* Add padding and return the message digest. */ -static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) -{ - return sha256_ssse3_finup(desc, NULL, 0, out); -} - -static int sha256_ssse3_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_base_init(desc) ?: - sha256_ssse3_finup(desc, data, len, out); -} - -static struct shash_alg sha256_ssse3_algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = sha256_ssse3_update, - .final = sha256_ssse3_final, - .finup = sha256_ssse3_finup, - .digest = sha256_ssse3_digest, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha256", - .cra_driver_name = "sha256-ssse3", - .cra_priority = 150, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = sha256_ssse3_update, - .final = sha256_ssse3_final, - .finup = sha256_ssse3_finup, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha224", - .cra_driver_name = "sha224-ssse3", - .cra_priority = 150, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int register_sha256_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - return crypto_register_shashes(sha256_ssse3_algs, - ARRAY_SIZE(sha256_ssse3_algs)); - return 0; -} - -static void unregister_sha256_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - crypto_unregister_shashes(sha256_ssse3_algs, - ARRAY_SIZE(sha256_ssse3_algs)); -} - -asmlinkage void sha256_transform_avx(struct sha256_state *state, - const u8 *data, int blocks); - -static int sha256_avx_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return _sha256_update(desc, data, len, sha256_transform_avx); -} - -static int sha256_avx_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_finup(desc, data, len, out, sha256_transform_avx); -} - -static int sha256_avx_final(struct shash_desc *desc, u8 *out) -{ - return sha256_avx_finup(desc, NULL, 0, out); -} - -static int sha256_avx_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_base_init(desc) ?: - sha256_avx_finup(desc, data, len, out); -} - -static struct shash_alg sha256_avx_algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = sha256_avx_update, - .final = sha256_avx_final, - .finup = sha256_avx_finup, - .digest = sha256_avx_digest, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha256", - 
.cra_driver_name = "sha256-avx", - .cra_priority = 160, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = sha256_avx_update, - .final = sha256_avx_final, - .finup = sha256_avx_finup, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha224", - .cra_driver_name = "sha224-avx", - .cra_priority = 160, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static bool avx_usable(void) -{ - if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (boot_cpu_has(X86_FEATURE_AVX)) - pr_info("AVX detected but unusable.\n"); - return false; - } - - return true; -} - -static int register_sha256_avx(void) -{ - if (avx_usable()) - return crypto_register_shashes(sha256_avx_algs, - ARRAY_SIZE(sha256_avx_algs)); - return 0; -} - -static void unregister_sha256_avx(void) -{ - if (avx_usable()) - crypto_unregister_shashes(sha256_avx_algs, - ARRAY_SIZE(sha256_avx_algs)); -} - -asmlinkage void sha256_transform_rorx(struct sha256_state *state, - const u8 *data, int blocks); - -static int sha256_avx2_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return _sha256_update(desc, data, len, sha256_transform_rorx); -} - -static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_finup(desc, data, len, out, sha256_transform_rorx); -} - -static int sha256_avx2_final(struct shash_desc *desc, u8 *out) -{ - return sha256_avx2_finup(desc, NULL, 0, out); -} - -static int sha256_avx2_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_base_init(desc) ?: - sha256_avx2_finup(desc, data, len, out); -} - -static struct shash_alg sha256_avx2_algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = sha256_avx2_update, - .final = sha256_avx2_final, - .finup = sha256_avx2_finup, - .digest = sha256_avx2_digest, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha256", - .cra_driver_name = "sha256-avx2", - .cra_priority = 170, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = sha256_avx2_update, - .final = sha256_avx2_final, - .finup = sha256_avx2_finup, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha224", - .cra_driver_name = "sha224-avx2", - .cra_priority = 170, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static bool avx2_usable(void) -{ - if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_BMI2)) - return true; - - return false; -} - -static int register_sha256_avx2(void) -{ - if (avx2_usable()) - return crypto_register_shashes(sha256_avx2_algs, - ARRAY_SIZE(sha256_avx2_algs)); - return 0; -} - -static void unregister_sha256_avx2(void) -{ - if (avx2_usable()) - crypto_unregister_shashes(sha256_avx2_algs, - ARRAY_SIZE(sha256_avx2_algs)); -} - -#ifdef CONFIG_AS_SHA256_NI -asmlinkage void sha256_ni_transform(struct sha256_state *digest, - const u8 *data, int rounds); - -static int sha256_ni_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return _sha256_update(desc, data, len, sha256_ni_transform); -} - -static int sha256_ni_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_finup(desc, data, len, out, sha256_ni_transform); -} - -static 
int sha256_ni_final(struct shash_desc *desc, u8 *out) -{ - return sha256_ni_finup(desc, NULL, 0, out); -} - -static int sha256_ni_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_base_init(desc) ?: - sha256_ni_finup(desc, data, len, out); -} - -static struct shash_alg sha256_ni_algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = sha256_ni_update, - .final = sha256_ni_final, - .finup = sha256_ni_finup, - .digest = sha256_ni_digest, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha256", - .cra_driver_name = "sha256-ni", - .cra_priority = 250, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = sha256_ni_update, - .final = sha256_ni_final, - .finup = sha256_ni_finup, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha224", - .cra_driver_name = "sha224-ni", - .cra_priority = 250, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int register_sha256_ni(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) - return crypto_register_shashes(sha256_ni_algs, - ARRAY_SIZE(sha256_ni_algs)); - return 0; -} - -static void unregister_sha256_ni(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) - crypto_unregister_shashes(sha256_ni_algs, - ARRAY_SIZE(sha256_ni_algs)); -} - -#else -static inline int register_sha256_ni(void) { return 0; } -static inline void unregister_sha256_ni(void) { } -#endif - -static int __init sha256_ssse3_mod_init(void) -{ - if (!x86_match_cpu(module_cpu_ids)) - return -ENODEV; - - if (register_sha256_ssse3()) - goto fail; - - if (register_sha256_avx()) { - unregister_sha256_ssse3(); - goto fail; - } - - if (register_sha256_avx2()) { - unregister_sha256_avx(); - unregister_sha256_ssse3(); - goto fail; - } - - if (register_sha256_ni()) { - unregister_sha256_avx2(); - unregister_sha256_avx(); - unregister_sha256_ssse3(); - goto fail; - } - - return 0; -fail: - return -ENODEV; -} - -static void __exit sha256_ssse3_mod_fini(void) -{ - unregister_sha256_ni(); - unregister_sha256_avx2(); - unregister_sha256_avx(); - unregister_sha256_ssse3(); -} - -module_init(sha256_ssse3_mod_init); -module_exit(sha256_ssse3_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); - -MODULE_ALIAS_CRYPTO("sha256"); -MODULE_ALIAS_CRYPTO("sha256-ssse3"); -MODULE_ALIAS_CRYPTO("sha256-avx"); -MODULE_ALIAS_CRYPTO("sha256-avx2"); -MODULE_ALIAS_CRYPTO("sha224"); -MODULE_ALIAS_CRYPTO("sha224-ssse3"); -MODULE_ALIAS_CRYPTO("sha224-avx"); -MODULE_ALIAS_CRYPTO("sha224-avx2"); -#ifdef CONFIG_AS_SHA256_NI -MODULE_ALIAS_CRYPTO("sha256-ni"); -MODULE_ALIAS_CRYPTO("sha224-ni"); -#endif diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index f08496cd6870..24973f42c43f 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -680,6 +680,7 @@ SYM_TYPED_FUNC_START(sha512_transform_rorx) pop %r12 pop %rbx + vzeroupper RET SYM_FUNC_END(sha512_transform_rorx) diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 6d3b85e53d0e..067684c54395 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -27,17 +27,13 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <asm/cpu_device_id.h> +#include <asm/simd.h> #include <crypto/internal/hash.h> -#include 
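The one-line change to sha512-avx2-asm.S adds vzeroupper on the exit path. sha512_transform_rorx() dirties the upper halves of the YMM registers, and returning to SSE-only code with dirty upper state triggers costly AVX-to-SSE transition stalls on many Intel cores, so the upper lanes are zeroed before the return:

	pop	%r12
	pop	%rbx
	vzeroupper	/* clear upper YMM state before SSE code runs */
	RET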
<crypto/internal/simd.h> -#include <linux/init.h> +#include <linux/kernel.h> #include <linux/module.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/types.h> #include <crypto/sha2.h> #include <crypto/sha512_base.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> asmlinkage void sha512_transform_ssse3(struct sha512_state *state, const u8 *data, int blocks); @@ -45,11 +41,7 @@ asmlinkage void sha512_transform_ssse3(struct sha512_state *state, static int sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len, sha512_block_fn *sha512_xform) { - struct sha512_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable() || - (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) - return crypto_sha512_update(desc, data, len); + int remain; /* * Make sure struct sha512_state begins directly with the SHA512 @@ -58,22 +50,17 @@ static int sha512_update(struct shash_desc *desc, const u8 *data, BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0); kernel_fpu_begin(); - sha512_base_do_update(desc, data, len, sha512_xform); + remain = sha512_base_do_update_blocks(desc, data, len, sha512_xform); kernel_fpu_end(); - return 0; + return remain; } static int sha512_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out, sha512_block_fn *sha512_xform) { - if (!crypto_simd_usable()) - return crypto_sha512_finup(desc, data, len, out); - kernel_fpu_begin(); - if (len) - sha512_base_do_update(desc, data, len, sha512_xform); - sha512_base_do_finalize(desc, sha512_xform); + sha512_base_do_finup(desc, data, len, sha512_xform); kernel_fpu_end(); return sha512_base_finish(desc, out); @@ -91,23 +78,18 @@ static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, return sha512_finup(desc, data, len, out, sha512_transform_ssse3); } -/* Add padding and return the message digest. */ -static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) -{ - return sha512_ssse3_finup(desc, NULL, 0, out); -} - static struct shash_alg sha512_ssse3_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_ssse3_update, - .final = sha512_ssse3_final, .finup = sha512_ssse3_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .base = { .cra_name = "sha512", .cra_driver_name = "sha512-ssse3", .cra_priority = 150, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -115,13 +97,14 @@ static struct shash_alg sha512_ssse3_algs[] = { { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_ssse3_update, - .final = sha512_ssse3_final, .finup = sha512_ssse3_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .base = { .cra_name = "sha384", .cra_driver_name = "sha384-ssse3", .cra_priority = 150, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -167,23 +150,18 @@ static int sha512_avx_finup(struct shash_desc *desc, const u8 *data, return sha512_finup(desc, data, len, out, sha512_transform_avx); } -/* Add padding and return the message digest. 
*/ -static int sha512_avx_final(struct shash_desc *desc, u8 *out) -{ - return sha512_avx_finup(desc, NULL, 0, out); -} - static struct shash_alg sha512_avx_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_avx_update, - .final = sha512_avx_final, .finup = sha512_avx_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .base = { .cra_name = "sha512", .cra_driver_name = "sha512-avx", .cra_priority = 160, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -191,13 +169,14 @@ static struct shash_alg sha512_avx_algs[] = { { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_avx_update, - .final = sha512_avx_final, .finup = sha512_avx_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .base = { .cra_name = "sha384", .cra_driver_name = "sha384-avx", .cra_priority = 160, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -233,23 +212,18 @@ static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data, return sha512_finup(desc, data, len, out, sha512_transform_rorx); } -/* Add padding and return the message digest. */ -static int sha512_avx2_final(struct shash_desc *desc, u8 *out) -{ - return sha512_avx2_finup(desc, NULL, 0, out); -} - static struct shash_alg sha512_avx2_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_avx2_update, - .final = sha512_avx2_final, .finup = sha512_avx2_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .base = { .cra_name = "sha512", .cra_driver_name = "sha512-avx2", .cra_priority = 170, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -257,13 +231,14 @@ static struct shash_alg sha512_avx2_algs[] = { { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_avx2_update, - .final = sha512_avx2_final, .finup = sha512_avx2_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .base = { .cra_name = "sha384", .cra_driver_name = "sha384-avx2", .cra_priority = 170, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/x86/crypto/sm3_avx_glue.c b/arch/x86/crypto/sm3_avx_glue.c index 661b6f22ffcd..6e8c42b9dc8e 100644 --- a/arch/x86/crypto/sm3_avx_glue.c +++ b/arch/x86/crypto/sm3_avx_glue.c @@ -10,12 +10,11 @@ #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/types.h> #include <crypto/sm3.h> #include <crypto/sm3_base.h> -#include <asm/simd.h> +#include <linux/cpufeature.h> +#include <linux/kernel.h> +#include <linux/module.h> asmlinkage void sm3_transform_avx(struct sm3_state *state, const u8 *data, int nblocks); @@ -23,13 +22,7 @@ asmlinkage void sm3_transform_avx(struct sm3_state *state, static int sm3_avx_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - struct sm3_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable() || - (sctx->count % SM3_BLOCK_SIZE) + len < SM3_BLOCK_SIZE) { - sm3_update(sctx, data, len); - return 0; - } + int remain; /* * Make sure struct sm3_state begins directly with the SM3 @@ -38,45 +31,17 @@ static int 
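The deleted .final wrappers were pure boilerplate, each one a one-liner of this shape:

/* Add padding and return the message digest. */
static int sha512_avx_final(struct shash_desc *desc, u8 *out)
{
	return sha512_avx_finup(desc, NULL, 0, out);
}

With CRYPTO_AHASH_ALG_FINUP_MAX set, the core performs finalization through .finup itself (a final is just a finup with zero-length data), so every such wrapper across the sha512 and sm3 variants can go.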
sm3_avx_update(struct shash_desc *desc, const u8 *data, BUILD_BUG_ON(offsetof(struct sm3_state, state) != 0); kernel_fpu_begin(); - sm3_base_do_update(desc, data, len, sm3_transform_avx); + remain = sm3_base_do_update_blocks(desc, data, len, sm3_transform_avx); kernel_fpu_end(); - - return 0; + return remain; } static int sm3_avx_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!crypto_simd_usable()) { - struct sm3_state *sctx = shash_desc_ctx(desc); - - if (len) - sm3_update(sctx, data, len); - - sm3_final(sctx, out); - return 0; - } - kernel_fpu_begin(); - if (len) - sm3_base_do_update(desc, data, len, sm3_transform_avx); - sm3_base_do_finalize(desc, sm3_transform_avx); + sm3_base_do_finup(desc, data, len, sm3_transform_avx); kernel_fpu_end(); - - return sm3_base_finish(desc, out); -} - -static int sm3_avx_final(struct shash_desc *desc, u8 *out) -{ - if (!crypto_simd_usable()) { - sm3_final(shash_desc_ctx(desc), out); - return 0; - } - - kernel_fpu_begin(); - sm3_base_do_finalize(desc, sm3_transform_avx); - kernel_fpu_end(); - return sm3_base_finish(desc, out); } @@ -84,13 +49,14 @@ static struct shash_alg sm3_avx_alg = { .digestsize = SM3_DIGEST_SIZE, .init = sm3_base_init, .update = sm3_avx_update, - .final = sm3_avx_final, .finup = sm3_avx_finup, - .descsize = sizeof(struct sm3_state), + .descsize = SM3_STATE_SIZE, .base = { .cra_name = "sm3", .cra_driver_name = "sm3-avx", .cra_priority = 300, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SM3_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/x86/crypto/sm4_aesni_avx2_glue.c b/arch/x86/crypto/sm4_aesni_avx2_glue.c index 1148fd4cd57f..fec0ab7a63dd 100644 --- a/arch/x86/crypto/sm4_aesni_avx2_glue.c +++ b/arch/x86/crypto/sm4_aesni_avx2_glue.c @@ -8,11 +8,10 @@ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> */ +#include <asm/fpu/api.h> #include <linux/module.h> #include <linux/crypto.h> #include <linux/kernel.h> -#include <asm/simd.h> -#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/sm4.h> #include "sm4-avx.h" @@ -48,10 +47,9 @@ static int ctr_crypt(struct skcipher_request *req) static struct skcipher_alg sm4_aesni_avx2_skciphers[] = { { .base = { - .cra_name = "__ecb(sm4)", - .cra_driver_name = "__ecb-sm4-aesni-avx2", + .cra_name = "ecb(sm4)", + .cra_driver_name = "ecb-sm4-aesni-avx2", .cra_priority = 500, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, @@ -64,10 +62,9 @@ static struct skcipher_alg sm4_aesni_avx2_skciphers[] = { .decrypt = sm4_avx_ecb_decrypt, }, { .base = { - .cra_name = "__cbc(sm4)", - .cra_driver_name = "__cbc-sm4-aesni-avx2", + .cra_name = "cbc(sm4)", + .cra_driver_name = "cbc-sm4-aesni-avx2", .cra_priority = 500, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, @@ -81,10 +78,9 @@ static struct skcipher_alg sm4_aesni_avx2_skciphers[] = { .decrypt = cbc_decrypt, }, { .base = { - .cra_name = "__ctr(sm4)", - .cra_driver_name = "__ctr-sm4-aesni-avx2", + .cra_name = "ctr(sm4)", + .cra_driver_name = "ctr-sm4-aesni-avx2", .cra_priority = 500, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, @@ -100,9 +96,6 @@ static struct skcipher_alg sm4_aesni_avx2_skciphers[] = { } }; -static struct simd_skcipher_alg * 
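The sm4 changes remove the crypto_simd wrapper layer. Previously the real skciphers were registered as internal-only algorithms (CRYPTO_ALG_INTERNAL with a "__" name prefix, invisible to normal users) and simd_register_skciphers_compat() stacked a proxy on top that bounced requests to an asynchronous worker whenever the FPU was unavailable. Since skcipher requests are now only processed in task or softirq context, where kernel-mode FPU use is allowed, the proxy is dead weight and the algorithms register directly:

static int __init sm4_init(void)
{
	/* ... CPU feature checks unchanged ... */

	/* No simd_* wrapper anymore: register the algorithms as-is;
	 * their encrypt/decrypt paths use kernel_fpu_begin()/end(). */
	return crypto_register_skciphers(sm4_aesni_avx2_skciphers,
					 ARRAY_SIZE(sm4_aesni_avx2_skciphers));
}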
-simd_sm4_aesni_avx2_skciphers[ARRAY_SIZE(sm4_aesni_avx2_skciphers)]; - static int __init sm4_init(void) { const char *feature_name; @@ -121,16 +114,14 @@ static int __init sm4_init(void) return -ENODEV; } - return simd_register_skciphers_compat(sm4_aesni_avx2_skciphers, - ARRAY_SIZE(sm4_aesni_avx2_skciphers), - simd_sm4_aesni_avx2_skciphers); + return crypto_register_skciphers(sm4_aesni_avx2_skciphers, + ARRAY_SIZE(sm4_aesni_avx2_skciphers)); } static void __exit sm4_exit(void) { - simd_unregister_skciphers(sm4_aesni_avx2_skciphers, - ARRAY_SIZE(sm4_aesni_avx2_skciphers), - simd_sm4_aesni_avx2_skciphers); + crypto_unregister_skciphers(sm4_aesni_avx2_skciphers, + ARRAY_SIZE(sm4_aesni_avx2_skciphers)); } module_init(sm4_init); diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c index 85b4ca78b47b..72867fc49ce8 100644 --- a/arch/x86/crypto/sm4_aesni_avx_glue.c +++ b/arch/x86/crypto/sm4_aesni_avx_glue.c @@ -8,11 +8,10 @@ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> */ +#include <asm/fpu/api.h> #include <linux/module.h> #include <linux/crypto.h> #include <linux/kernel.h> -#include <asm/simd.h> -#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/sm4.h> #include "sm4-avx.h" @@ -263,10 +262,9 @@ static int ctr_crypt(struct skcipher_request *req) static struct skcipher_alg sm4_aesni_avx_skciphers[] = { { .base = { - .cra_name = "__ecb(sm4)", - .cra_driver_name = "__ecb-sm4-aesni-avx", + .cra_name = "ecb(sm4)", + .cra_driver_name = "ecb-sm4-aesni-avx", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, @@ -279,10 +277,9 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = { .decrypt = sm4_avx_ecb_decrypt, }, { .base = { - .cra_name = "__cbc(sm4)", - .cra_driver_name = "__cbc-sm4-aesni-avx", + .cra_name = "cbc(sm4)", + .cra_driver_name = "cbc-sm4-aesni-avx", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, @@ -296,10 +293,9 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = { .decrypt = cbc_decrypt, }, { .base = { - .cra_name = "__ctr(sm4)", - .cra_driver_name = "__ctr-sm4-aesni-avx", + .cra_name = "ctr(sm4)", + .cra_driver_name = "ctr-sm4-aesni-avx", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, @@ -315,9 +311,6 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = { } }; -static struct simd_skcipher_alg * -simd_sm4_aesni_avx_skciphers[ARRAY_SIZE(sm4_aesni_avx_skciphers)]; - static int __init sm4_init(void) { const char *feature_name; @@ -335,16 +328,14 @@ static int __init sm4_init(void) return -ENODEV; } - return simd_register_skciphers_compat(sm4_aesni_avx_skciphers, - ARRAY_SIZE(sm4_aesni_avx_skciphers), - simd_sm4_aesni_avx_skciphers); + return crypto_register_skciphers(sm4_aesni_avx_skciphers, + ARRAY_SIZE(sm4_aesni_avx_skciphers)); } static void __exit sm4_exit(void) { - simd_unregister_skciphers(sm4_aesni_avx_skciphers, - ARRAY_SIZE(sm4_aesni_avx_skciphers), - simd_sm4_aesni_avx_skciphers); + crypto_unregister_skciphers(sm4_aesni_avx_skciphers, + ARRAY_SIZE(sm4_aesni_avx_skciphers)); } module_init(sm4_init); diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S index d2288bf38a8a..071e90e7f0d8 100644 --- 
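Both sm4 init functions keep their hardware gating; the hunks show only its tail (the feature_name declaration and an -ENODEV exit). Reconstructed from that context rather than quoted from the unchanged lines, the gate plausibly follows the standard pattern below; treat it as a sketch:

static int __init sm4_init(void)
{
	const char *feature_name;

	if (!boot_cpu_has(X86_FEATURE_AVX) || !boot_cpu_has(X86_FEATURE_AES))
		return -ENODEV;	/* ISA bits the assembly requires */

	/* The OS must also save/restore YMM state via XSAVE. */
	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
			       &feature_name)) {
		pr_info("CPU feature '%s' is not supported.\n", feature_name);
		return -ENODEV;
	}

	return crypto_register_skciphers(sm4_aesni_avx_skciphers,
					 ARRAY_SIZE(sm4_aesni_avx_skciphers));
}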
a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -6,6 +6,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> .file "twofish-x86_64-asm-3way.S" .text @@ -220,7 +221,7 @@ rorq $32, RAB2; \ outunpack3(mov, RIO, 2, RAB, 2); -SYM_FUNC_START(__twofish_enc_blk_3way) +SYM_TYPED_FUNC_START(__twofish_enc_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -269,7 +270,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way) RET; SYM_FUNC_END(__twofish_enc_blk_3way) -SYM_FUNC_START(twofish_dec_blk_3way) +SYM_TYPED_FUNC_START(twofish_dec_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index 775af290cd19..e08b4ba07b93 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -8,6 +8,7 @@ .text #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/asm-offsets.h> #define a_offset 0 @@ -202,7 +203,7 @@ xor %r8d, d ## D;\ ror $1, d ## D; -SYM_FUNC_START(twofish_enc_blk) +SYM_TYPED_FUNC_START(twofish_enc_blk) pushq R1 /* %rdi contains the ctx address */ @@ -255,7 +256,7 @@ SYM_FUNC_START(twofish_enc_blk) RET SYM_FUNC_END(twofish_enc_blk) -SYM_FUNC_START(twofish_dec_blk) +SYM_TYPED_FUNC_START(twofish_dec_blk) pushq R1 /* %rdi contains the ctx address */ diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 3eb3440b477a..9e20db013750 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -13,7 +13,6 @@ #include <linux/crypto.h> #include <linux/err.h> #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <crypto/twofish.h> #include "twofish.h" @@ -74,10 +73,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg twofish_algs[] = { { - .base.cra_name = "__ecb(twofish)", - .base.cra_driver_name = "__ecb-twofish-avx", + .base.cra_name = "ecb(twofish)", + .base.cra_driver_name = "ecb-twofish-avx", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = TF_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct twofish_ctx), .base.cra_module = THIS_MODULE, @@ -87,10 +85,9 @@ static struct skcipher_alg twofish_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(twofish)", - .base.cra_driver_name = "__cbc-twofish-avx", + .base.cra_name = "cbc(twofish)", + .base.cra_driver_name = "cbc-twofish-avx", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = TF_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct twofish_ctx), .base.cra_module = THIS_MODULE, @@ -103,8 +100,6 @@ static struct skcipher_alg twofish_algs[] = { }, }; -static struct simd_skcipher_alg *twofish_simd_algs[ARRAY_SIZE(twofish_algs)]; - static int __init twofish_init(void) { const char *feature_name; @@ -114,15 +109,13 @@ static int __init twofish_init(void) return -ENODEV; } - return simd_register_skciphers_compat(twofish_algs, - ARRAY_SIZE(twofish_algs), - twofish_simd_algs); + return crypto_register_skciphers(twofish_algs, + ARRAY_SIZE(twofish_algs)); } static void __exit twofish_exit(void) { - simd_unregister_skciphers(twofish_algs, ARRAY_SIZE(twofish_algs), - twofish_simd_algs); + crypto_unregister_skciphers(twofish_algs, ARRAY_SIZE(twofish_algs)); } module_init(twofish_init); diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c index 0614beece279..4c67184dc573 100644 --- a/arch/x86/crypto/twofish_glue.c +++ 
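Converting the twofish assembly entry points to SYM_TYPED_FUNC_START matters for kernels built with clang kCFI: these functions are called indirectly through C function pointers, and a kCFI call site verifies a type hash emitted just before the callee. SYM_TYPED_FUNC_START (from <linux/cfi_types.h>) emits that hash for an assembly symbol, derived from the C prototype declared in the glue code; a plain SYM_FUNC_START target would fail the check at the first indirect call. The shape of the change:

#include <linux/cfi_types.h>

SYM_TYPED_FUNC_START(twofish_enc_blk)	/* was SYM_FUNC_START */
	/* ... function body unchanged ... */
	RET
SYM_FUNC_END(twofish_enc_blk)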
b/arch/x86/crypto/twofish_glue.c @@ -68,7 +68,6 @@ static struct crypto_alg alg = { .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = TF_BLOCK_SIZE, .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, .cra_module = THIS_MODULE, .cra_u = { .cipher = { diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 90454cf18e0d..1a1ecfa7f72a 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -5,6 +5,7 @@ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> */ +#include <asm/cpu_device_id.h> #include <crypto/algapi.h> #include <crypto/twofish.h> #include <linux/crypto.h> @@ -107,10 +108,10 @@ static bool is_blacklisted_cpu(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return false; - if (boot_cpu_data.x86 == 0x06 && - (boot_cpu_data.x86_model == 0x1c || - boot_cpu_data.x86_model == 0x26 || - boot_cpu_data.x86_model == 0x36)) { + switch (boot_cpu_data.x86_vfm) { + case INTEL_ATOM_BONNELL: + case INTEL_ATOM_BONNELL_MID: + case INTEL_ATOM_SALTWELL: /* * On Atom, twofish-3way is slower than original assembler * implementation. Twofish-3way trades off some performance in |
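(The twofish_glue.c hunk merely drops an explicit .cra_alignmask = 0, which is already the value of zeroed static storage.) The more substantial change is the twofish-3way blacklist: open-coded family/model comparisons become a switch on boot_cpu_data.x86_vfm, which packs vendor, family and model into one integer so CPUs can be named symbolically; INTEL_ATOM_BONNELL, INTEL_ATOM_BONNELL_MID and INTEL_ATOM_SALTWELL correspond to the old family-6 models 0x1c, 0x26 and 0x36. Condensed, the converted check reads:

#include <asm/cpu_device_id.h>	/* VFM encodings like INTEL_ATOM_BONNELL */

static bool is_blacklisted_cpu(void)	/* condensed sketch */
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return false;

	switch (boot_cpu_data.x86_vfm) {
	case INTEL_ATOM_BONNELL:	/* was family 6, model 0x1c */
	case INTEL_ATOM_BONNELL_MID:	/* was family 6, model 0x26 */
	case INTEL_ATOM_SALTWELL:	/* was family 6, model 0x36 */
		/* On these Atoms, twofish-3way is slower than the
		 * original one-way assembler implementation. */
		return true;
	}
	return false;
}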