aboutsummaryrefslogtreecommitdiffstats
path: root/arch/arm64/crypto
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/crypto')
-rw-r--r--arch/arm64/crypto/.gitignore2
-rw-r--r--arch/arm64/crypto/Kconfig287
-rw-r--r--arch/arm64/crypto/Makefile51
-rw-r--r--arch/arm64/crypto/aes-ce-ccm-core.S46
-rw-r--r--arch/arm64/crypto/aes-ce-ccm-glue.c228
-rw-r--r--arch/arm64/crypto/aes-ce-core.S84
-rw-r--r--arch/arm64/crypto/aes-ce-glue.c (renamed from arch/arm64/crypto/aes-ce-cipher.c)140
-rw-r--r--arch/arm64/crypto/aes-ce-setkey.h1
-rw-r--r--arch/arm64/crypto/aes-ce.S92
-rw-r--r--arch/arm64/crypto/aes-cipher-core.S52
-rw-r--r--arch/arm64/crypto/aes-cipher-glue.c16
-rw-r--r--arch/arm64/crypto/aes-ctr-fallback.h53
-rw-r--r--arch/arm64/crypto/aes-glue.c747
-rw-r--r--arch/arm64/crypto/aes-modes.S1002
-rw-r--r--arch/arm64/crypto/aes-neon.S152
-rw-r--r--arch/arm64/crypto/aes-neonbs-core.S393
-rw-r--r--arch/arm64/crypto/aes-neonbs-glue.c349
-rw-r--r--arch/arm64/crypto/chacha-neon-core.S (renamed from arch/arm64/crypto/chacha20-neon-core.S)439
-rw-r--r--arch/arm64/crypto/chacha-neon-glue.c243
-rw-r--r--arch/arm64/crypto/chacha20-neon-glue.c127
-rw-r--r--arch/arm64/crypto/crc32-ce-core.S266
-rw-r--r--arch/arm64/crypto/crc32-ce-glue.c242
-rw-r--r--arch/arm64/crypto/crct10dif-ce-core.S651
-rw-r--r--arch/arm64/crypto/crct10dif-ce-glue.c93
-rw-r--r--arch/arm64/crypto/ghash-ce-core.S614
-rw-r--r--arch/arm64/crypto/ghash-ce-glue.c456
-rw-r--r--arch/arm64/crypto/nh-neon-core.S103
-rw-r--r--arch/arm64/crypto/nhpoly1305-neon-glue.c78
-rw-r--r--arch/arm64/crypto/poly1305-armv8.pl913
-rw-r--r--arch/arm64/crypto/poly1305-glue.c231
-rw-r--r--arch/arm64/crypto/polyval-ce-core.S361
-rw-r--r--arch/arm64/crypto/polyval-ce-glue.c191
-rw-r--r--arch/arm64/crypto/sha1-ce-core.S40
-rw-r--r--arch/arm64/crypto/sha1-ce-glue.c76
-rw-r--r--arch/arm64/crypto/sha2-ce-core.S20
-rw-r--r--arch/arm64/crypto/sha2-ce-glue.c98
-rw-r--r--arch/arm64/crypto/sha256-core.S_shipped2061
-rw-r--r--arch/arm64/crypto/sha256-glue.c115
-rw-r--r--arch/arm64/crypto/sha3-ce-core.S212
-rw-r--r--arch/arm64/crypto/sha3-ce-glue.c166
-rw-r--r--arch/arm64/crypto/sha512-armv8.pl14
-rw-r--r--arch/arm64/crypto/sha512-ce-core.S206
-rw-r--r--arch/arm64/crypto/sha512-ce-glue.c121
-rw-r--r--arch/arm64/crypto/sha512-core.S_shipped1085
-rw-r--r--arch/arm64/crypto/sha512-glue.c28
-rw-r--r--arch/arm64/crypto/sm3-ce-core.S139
-rw-r--r--arch/arm64/crypto/sm3-ce-glue.c101
-rw-r--r--arch/arm64/crypto/sm4-ce-cipher-core.S36
-rw-r--r--arch/arm64/crypto/sm4-ce-cipher-glue.c82
-rw-r--r--arch/arm64/crypto/sm4-ce-core.S660
-rw-r--r--arch/arm64/crypto/sm4-ce-glue.c372
-rw-r--r--arch/arm64/crypto/sm4-neon-core.S487
-rw-r--r--arch/arm64/crypto/sm4-neon-glue.c442
53 files changed, 8982 insertions, 6282 deletions
diff --git a/arch/arm64/crypto/.gitignore b/arch/arm64/crypto/.gitignore
index 879df8781ed5..fcf2d731e6c1 100644
--- a/arch/arm64/crypto/.gitignore
+++ b/arch/arm64/crypto/.gitignore
@@ -1,2 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
sha256-core.S
sha512-core.S
+poly1305-core.S
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 7ca54a76f6b9..8bd80508a710 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -1,97 +1,282 @@
+# SPDX-License-Identifier: GPL-2.0
-menuconfig ARM64_CRYPTO
- bool "ARM64 Accelerated Cryptographic Algorithms"
- depends on ARM64
+menu "Accelerated Cryptographic Algorithms for CPU (arm64)"
+
+config CRYPTO_GHASH_ARM64_CE
+ tristate "Hash functions: GHASH (ARMv8 Crypto Extensions)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_HASH
+ select CRYPTO_GF128MUL
+ select CRYPTO_LIB_AES
+ select CRYPTO_AEAD
help
- Say Y here to choose from a selection of cryptographic algorithms
- implemented using ARM64 specific CPU features or instructions.
+ GCM GHASH function (NIST SP800-38D)
-if ARM64_CRYPTO
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
-config CRYPTO_SHA256_ARM64
- tristate "SHA-224/SHA-256 digest algorithm for arm64"
- select CRYPTO_HASH
+config CRYPTO_NHPOLY1305_NEON
+ tristate "Hash functions: NHPoly1305 (NEON)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_NHPOLY1305
+ help
+ NHPoly1305 hash function (Adiantum)
-config CRYPTO_SHA512_ARM64
- tristate "SHA-384/SHA-512 digest algorithm for arm64"
+ Architecture: arm64 using:
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_POLY1305_NEON
+ tristate "Hash functions: Poly1305 (NEON)"
+ depends on KERNEL_MODE_NEON
select CRYPTO_HASH
+ select CRYPTO_ARCH_HAVE_LIB_POLY1305
+ help
+ Poly1305 authenticator algorithm (RFC7539)
+
+ Architecture: arm64 using:
+ - NEON (Advanced SIMD) extensions
config CRYPTO_SHA1_ARM64_CE
- tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
+ tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_SHA1
+ help
+ SHA-1 secure hash algorithm (FIPS 180)
+
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
+
+config CRYPTO_SHA256_ARM64
+ tristate "Hash functions: SHA-224 and SHA-256"
+ select CRYPTO_HASH
+ help
+ SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
+
+ Architecture: arm64
config CRYPTO_SHA2_ARM64_CE
- tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
+ tristate "Hash functions: SHA-224 and SHA-256 (ARMv8 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_SHA256_ARM64
+ help
+ SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-config CRYPTO_GHASH_ARM64_CE
- tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
+
+config CRYPTO_SHA512_ARM64
+ tristate "Hash functions: SHA-384 and SHA-512"
+ select CRYPTO_HASH
+ help
+ SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
+
+ Architecture: arm64
+
+config CRYPTO_SHA512_ARM64_CE
+ tristate "Hash functions: SHA-384 and SHA-512 (ARMv8 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
- select CRYPTO_GF128MUL
- select CRYPTO_AES
- select CRYPTO_AES_ARM64
+ select CRYPTO_SHA512_ARM64
+ help
+ SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-config CRYPTO_CRCT10DIF_ARM64_CE
- tristate "CRCT10DIF digest algorithm using PMULL instructions"
- depends on KERNEL_MODE_NEON && CRC_T10DIF
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
+
+config CRYPTO_SHA3_ARM64
+ tristate "Hash functions: SHA-3 (ARMv8.2 Crypto Extensions)"
+ depends on KERNEL_MODE_NEON
select CRYPTO_HASH
+ select CRYPTO_SHA3
+ help
+ SHA-3 secure hash algorithms (FIPS 202)
-config CRYPTO_CRC32_ARM64_CE
- tristate "CRC32 and CRC32C digest algorithms using ARMv8 extensions"
- depends on CRC32
+ Architecture: arm64 using:
+ - ARMv8.2 Crypto Extensions
+
+config CRYPTO_SM3_ARM64_CE
+ tristate "Hash functions: SM3 (ARMv8.2 Crypto Extensions)"
+ depends on KERNEL_MODE_NEON
select CRYPTO_HASH
+ select CRYPTO_SM3
+ help
+ SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012)
+
+ Architecture: arm64 using:
+ - ARMv8.2 Crypto Extensions
+
+config CRYPTO_POLYVAL_ARM64_CE
+ tristate "Hash functions: POLYVAL (ARMv8 Crypto Extensions)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_POLYVAL
+ help
+ POLYVAL hash function for HCTR2
+
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
config CRYPTO_AES_ARM64
- tristate "AES core cipher using scalar instructions"
+ tristate "Ciphers: AES, modes: ECB, CBC, CTR, CTS, XCTR, XTS"
select CRYPTO_AES
+ help
+ Block ciphers: AES cipher algorithms (FIPS-197)
+ Length-preserving ciphers: AES with ECB, CBC, CTR, CTS,
+ XCTR, and XTS modes
+ AEAD cipher: AES with CBC, ESSIV, and SHA-256
+ for fscrypt and dm-crypt
+
+ Architecture: arm64
config CRYPTO_AES_ARM64_CE
- tristate "AES core cipher using ARMv8 Crypto Extensions"
+ tristate "Ciphers: AES (ARMv8 Crypto Extensions)"
depends on ARM64 && KERNEL_MODE_NEON
select CRYPTO_ALGAPI
- select CRYPTO_AES_ARM64
+ select CRYPTO_LIB_AES
+ help
+ Block ciphers: AES cipher algorithms (FIPS-197)
-config CRYPTO_AES_ARM64_CE_CCM
- tristate "AES in CCM mode using ARMv8 Crypto Extensions"
- depends on ARM64 && KERNEL_MODE_NEON
- select CRYPTO_ALGAPI
- select CRYPTO_AES_ARM64_CE
- select CRYPTO_AES_ARM64
- select CRYPTO_AEAD
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
config CRYPTO_AES_ARM64_CE_BLK
- tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
+ tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (ARMv8 Crypto Extensions)"
depends on KERNEL_MODE_NEON
- select CRYPTO_BLKCIPHER
+ select CRYPTO_SKCIPHER
select CRYPTO_AES_ARM64_CE
- select CRYPTO_AES_ARM64
- select CRYPTO_SIMD
+ help
+ Length-preserving ciphers: AES cipher algorithms (FIPS-197)
+ with block cipher modes:
+ - ECB (Electronic Codebook) mode (NIST SP800-38A)
+ - CBC (Cipher Block Chaining) mode (NIST SP800-38A)
+ - CTR (Counter) mode (NIST SP800-38A)
+ - XTS (XOR Encrypt XOR with ciphertext stealing) mode (NIST SP800-38E
+ and IEEE 1619)
+
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
config CRYPTO_AES_ARM64_NEON_BLK
- tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
+ tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (NEON)"
depends on KERNEL_MODE_NEON
- select CRYPTO_BLKCIPHER
- select CRYPTO_AES_ARM64
- select CRYPTO_AES
- select CRYPTO_SIMD
+ select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_AES
+ help
+ Length-preserving ciphers: AES cipher algorithms (FIPS-197)
+ with block cipher modes:
+ - ECB (Electronic Codebook) mode (NIST SP800-38A)
+ - CBC (Cipher Block Chaining) mode (NIST SP800-38A)
+ - CTR (Counter) mode (NIST SP800-38A)
+ - XTS (XOR Encrypt XOR with ciphertext stealing) mode (NIST SP800-38E
+ and IEEE 1619)
+
+ Architecture: arm64 using:
+ - NEON (Advanced SIMD) extensions
config CRYPTO_CHACHA20_NEON
- tristate "NEON accelerated ChaCha20 symmetric cipher"
+ tristate "Ciphers: ChaCha (NEON)"
depends on KERNEL_MODE_NEON
- select CRYPTO_BLKCIPHER
- select CRYPTO_CHACHA20
+ select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_CHACHA_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
+ help
+ Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
+ stream cipher algorithms
+
+ Architecture: arm64 using:
+ - NEON (Advanced SIMD) extensions
config CRYPTO_AES_ARM64_BS
- tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
+ tristate "Ciphers: AES, modes: ECB/CBC/CTR/XCTR/XTS modes (bit-sliced NEON)"
depends on KERNEL_MODE_NEON
- select CRYPTO_BLKCIPHER
+ select CRYPTO_SKCIPHER
select CRYPTO_AES_ARM64_NEON_BLK
- select CRYPTO_AES_ARM64
- select CRYPTO_SIMD
+ select CRYPTO_LIB_AES
+ help
+ Length-preserving ciphers: AES cipher algorithms (FIPS-197)
+ with block cipher modes:
+ - ECB (Electronic Codebook) mode (NIST SP800-38A)
+ - CBC (Cipher Block Chaining) mode (NIST SP800-38A)
+ - CTR (Counter) mode (NIST SP800-38A)
+ - XCTR mode for HCTR2
+ - XTS (XOR Encrypt XOR with ciphertext stealing) mode (NIST SP800-38E
+ and IEEE 1619)
+
+ Architecture: arm64 using:
+ - bit-sliced algorithm
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_SM4_ARM64_CE
+ tristate "Ciphers: SM4 (ARMv8.2 Crypto Extensions)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_ALGAPI
+ select CRYPTO_SM4
+ help
+ Block ciphers: SM4 cipher algorithms (OSCCA GB/T 32907-2016)
+
+ Architecture: arm64 using:
+ - ARMv8.2 Crypto Extensions
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_SM4_ARM64_CE_BLK
+ tristate "Ciphers: SM4, modes: ECB/CBC/CFB/CTR (ARMv8 Crypto Extensions)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SM4
+ help
+ Length-preserving ciphers: SM4 cipher algorithms (OSCCA GB/T 32907-2016)
+ with block cipher modes:
+ - ECB (Electronic Codebook) mode (NIST SP800-38A)
+ - CBC (Cipher Block Chaining) mode (NIST SP800-38A)
+ - CFB (Cipher Feedback) mode (NIST SP800-38A)
+ - CTR (Counter) mode (NIST SP800-38A)
+
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_SM4_ARM64_NEON_BLK
+ tristate "Ciphers: SM4, modes: ECB/CBC/CFB/CTR (NEON)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SM4
+ help
+ Length-preserving ciphers: SM4 cipher algorithms (OSCCA GB/T 32907-2016)
+ with block cipher modes:
+ - ECB (Electronic Codebook) mode (NIST SP800-38A)
+ - CBC (Cipher Block Chaining) mode (NIST SP800-38A)
+ - CFB (Cipher Feedback) mode (NIST SP800-38A)
+ - CTR (Counter) mode (NIST SP800-38A)
+
+ Architecture: arm64 using:
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_AES_ARM64_CE_CCM
+ tristate "AEAD cipher: AES in CCM mode (ARMv8 Crypto Extensions)"
+ depends on ARM64 && KERNEL_MODE_NEON
+ select CRYPTO_ALGAPI
+ select CRYPTO_AES_ARM64_CE
+ select CRYPTO_AEAD
+ select CRYPTO_LIB_AES
+ help
+ AEAD cipher: AES cipher algorithms (FIPS-197) with
+ CCM (Counter with Cipher Block Chaining-Message Authentication Code)
+ authenticated encryption mode (NIST SP800-38C)
+
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_CRCT10DIF_ARM64_CE
+ tristate "CRCT10DIF (PMULL)"
+ depends on KERNEL_MODE_NEON && CRC_T10DIF
+ select CRYPTO_HASH
+ help
+ CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF)
+
+ Architecture: arm64 using
+ - PMULL (Polynomial Multiply Long) instructions
+
+endmenu
-endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index b5edc5918c28..24bb0c4610de 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -1,12 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# linux/arch/arm64/crypto/Makefile
#
# Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-#
obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o
sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
@@ -14,17 +11,35 @@ sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o
sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
+obj-$(CONFIG_CRYPTO_SHA512_ARM64_CE) += sha512-ce.o
+sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o
+
+obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o
+sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o
+
+obj-$(CONFIG_CRYPTO_SM3_ARM64_CE) += sm3-ce.o
+sm3-ce-y := sm3-ce-glue.o sm3-ce-core.o
+
+obj-$(CONFIG_CRYPTO_SM4_ARM64_CE) += sm4-ce-cipher.o
+sm4-ce-cipher-y := sm4-ce-cipher-glue.o sm4-ce-cipher-core.o
+
+obj-$(CONFIG_CRYPTO_SM4_ARM64_CE_BLK) += sm4-ce.o
+sm4-ce-y := sm4-ce-glue.o sm4-ce-core.o
+
+obj-$(CONFIG_CRYPTO_SM4_ARM64_NEON_BLK) += sm4-neon.o
+sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o
+
obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
+obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o
+polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o
+
obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
-obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
-crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
-
obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
-CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
+aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
@@ -41,8 +56,15 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
sha512-arm64-y := sha512-glue.o sha512-core.o
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
-chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
+
+obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
+poly1305-neon-y := poly1305-core.o poly1305-glue.o
+AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64
+
+obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
+nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
@@ -50,9 +72,6 @@ aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o
aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
-AFLAGS_aes-ce.o := -DINTERLEAVE=4
-AFLAGS_aes-neon.o := -DINTERLEAVE=4
-
CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS
$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
@@ -61,10 +80,10 @@ $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $(<) void $(@)
-$(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
+$(obj)/%-core.S: $(src)/%-armv8.pl
$(call cmd,perlasm)
-$(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl
+$(obj)/sha256-core.S: $(src)/sha512-armv8.pl
$(call cmd,perlasm)
-.PRECIOUS: $(obj)/sha256-core.S $(obj)/sha512-core.S
+clean-files += poly1305-core.S sha256-core.S sha512-core.S
diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
index e3a375c4cb83..b03f7f71f893 100644
--- a/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions
*
* Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/linkage.h>
@@ -15,22 +12,21 @@
.arch armv8-a+crypto
/*
- * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
- * u32 *macp, u8 const rk[], u32 rounds);
+ * u32 ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
+ * u32 macp, u8 const rk[], u32 rounds);
*/
-ENTRY(ce_aes_ccm_auth_data)
- ldr w8, [x3] /* leftover from prev round? */
+SYM_FUNC_START(ce_aes_ccm_auth_data)
ld1 {v0.16b}, [x0] /* load mac */
- cbz w8, 1f
- sub w8, w8, #16
+ cbz w3, 1f
+ sub w3, w3, #16
eor v1.16b, v1.16b, v1.16b
0: ldrb w7, [x1], #1 /* get 1 byte of input */
subs w2, w2, #1
- add w8, w8, #1
+ add w3, w3, #1
ins v1.b[0], w7
ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */
beq 8f /* out of input? */
- cbnz w8, 0b
+ cbnz w3, 0b
eor v0.16b, v0.16b, v1.16b
1: ld1 {v3.4s}, [x4] /* load first round key */
prfm pldl1strm, [x1]
@@ -65,7 +61,7 @@ ENTRY(ce_aes_ccm_auth_data)
beq 10f
adds w2, w2, #16
beq 10f
- mov w8, w2
+ mov w3, w2
7: ldrb w7, [x1], #1
umov w6, v0.b[0]
eor w6, w6, w7
@@ -74,22 +70,23 @@ ENTRY(ce_aes_ccm_auth_data)
beq 10f
ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */
b 7b
-8: mov w7, w8
- add w8, w8, #16
+8: cbz w3, 91f
+ mov w7, w3
+ add w3, w3, #16
9: ext v1.16b, v1.16b, v1.16b, #1
adds w7, w7, #1
bne 9b
- eor v0.16b, v0.16b, v1.16b
+91: eor v0.16b, v0.16b, v1.16b
st1 {v0.16b}, [x0]
-10: str w8, [x3]
+10: mov w0, w3
ret
-ENDPROC(ce_aes_ccm_auth_data)
+SYM_FUNC_END(ce_aes_ccm_auth_data)
/*
* void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[],
* u32 rounds);
*/
-ENTRY(ce_aes_ccm_final)
+SYM_FUNC_START(ce_aes_ccm_final)
ld1 {v3.4s}, [x2], #16 /* load first round key */
ld1 {v0.16b}, [x0] /* load mac */
cmp w3, #12 /* which key size? */
@@ -123,9 +120,10 @@ ENTRY(ce_aes_ccm_final)
eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */
st1 {v0.16b}, [x0] /* store result */
ret
-ENDPROC(ce_aes_ccm_final)
+SYM_FUNC_END(ce_aes_ccm_final)
.macro aes_ccm_do_crypt,enc
+ cbz x2, 5f
ldr x8, [x6, #8] /* load lower ctr */
ld1 {v0.16b}, [x5] /* load mac */
CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */
@@ -214,10 +212,10 @@ CPU_LE( rev x8, x8 )
* u8 const rk[], u32 rounds, u8 mac[],
* u8 ctr[]);
*/
-ENTRY(ce_aes_ccm_encrypt)
+SYM_FUNC_START(ce_aes_ccm_encrypt)
aes_ccm_do_crypt 1
-ENDPROC(ce_aes_ccm_encrypt)
+SYM_FUNC_END(ce_aes_ccm_encrypt)
-ENTRY(ce_aes_ccm_decrypt)
+SYM_FUNC_START(ce_aes_ccm_decrypt)
aes_ccm_do_crypt 0
-ENDPROC(ce_aes_ccm_decrypt)
+SYM_FUNC_END(ce_aes_ccm_decrypt)
diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
index a1254036f2b1..c4f14415f5f0 100644
--- a/arch/arm64/crypto/aes-ce-ccm-glue.c
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -1,15 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions
*
* Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <asm/neon.h>
-#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/aes.h>
#include <crypto/scatterwalk.h>
@@ -31,8 +27,8 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
return 6 + ctx->key_length / 4;
}
-asmlinkage void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
- u32 *macp, u32 const rk[], u32 rounds);
+asmlinkage u32 ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
+ u32 macp, u32 const rk[], u32 rounds);
asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
u32 const rk[], u32 rounds, u8 mac[],
@@ -45,20 +41,12 @@ asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[],
u32 rounds);
-asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
-
static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key,
unsigned int key_len)
{
struct crypto_aes_ctx *ctx = crypto_aead_ctx(tfm);
- int ret;
- ret = ce_aes_expandkey(ctx, in_key, key_len);
- if (!ret)
- return 0;
-
- tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
+ return ce_aes_expandkey(ctx, in_key, key_len);
}
static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
@@ -106,45 +94,7 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen)
return 0;
}
-static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
- u32 abytes, u32 *macp, bool use_neon)
-{
- if (likely(use_neon)) {
- ce_aes_ccm_auth_data(mac, in, abytes, macp, key->key_enc,
- num_rounds(key));
- } else {
- if (*macp > 0 && *macp < AES_BLOCK_SIZE) {
- int added = min(abytes, AES_BLOCK_SIZE - *macp);
-
- crypto_xor(&mac[*macp], in, added);
-
- *macp += added;
- in += added;
- abytes -= added;
- }
-
- while (abytes > AES_BLOCK_SIZE) {
- __aes_arm64_encrypt(key->key_enc, mac, mac,
- num_rounds(key));
- crypto_xor(mac, in, AES_BLOCK_SIZE);
-
- in += AES_BLOCK_SIZE;
- abytes -= AES_BLOCK_SIZE;
- }
-
- if (abytes > 0) {
- __aes_arm64_encrypt(key->key_enc, mac, mac,
- num_rounds(key));
- crypto_xor(mac, in, abytes);
- *macp = abytes;
- } else {
- *macp = 0;
- }
- }
-}
-
-static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[],
- bool use_neon)
+static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
@@ -163,7 +113,8 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[],
ltag.len = 6;
}
- ccm_update_mac(ctx, mac, (u8 *)&ltag, ltag.len, &macp, use_neon);
+ macp = ce_aes_ccm_auth_data(mac, (u8 *)&ltag, ltag.len, macp,
+ ctx->key_enc, num_rounds(ctx));
scatterwalk_start(&walk, req->src);
do {
@@ -174,8 +125,16 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[],
scatterwalk_start(&walk, sg_next(walk.sg));
n = scatterwalk_clamp(&walk, len);
}
+ n = min_t(u32, n, SZ_4K); /* yield NEON at least every 4k */
p = scatterwalk_map(&walk);
- ccm_update_mac(ctx, mac, p, n, &macp, use_neon);
+
+ macp = ce_aes_ccm_auth_data(mac, p, n, macp, ctx->key_enc,
+ num_rounds(ctx));
+
+ if (len / SZ_4K > (len - n) / SZ_4K) {
+ kernel_neon_end();
+ kernel_neon_begin();
+ }
len -= n;
scatterwalk_unmap(p);
@@ -184,56 +143,6 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[],
} while (len);
}
-static int ccm_crypt_fallback(struct skcipher_walk *walk, u8 mac[], u8 iv0[],
- struct crypto_aes_ctx *ctx, bool enc)
-{
- u8 buf[AES_BLOCK_SIZE];
- int err = 0;
-
- while (walk->nbytes) {
- int blocks = walk->nbytes / AES_BLOCK_SIZE;
- u32 tail = walk->nbytes % AES_BLOCK_SIZE;
- u8 *dst = walk->dst.virt.addr;
- u8 *src = walk->src.virt.addr;
- u32 nbytes = walk->nbytes;
-
- if (nbytes == walk->total && tail > 0) {
- blocks++;
- tail = 0;
- }
-
- do {
- u32 bsize = AES_BLOCK_SIZE;
-
- if (nbytes < AES_BLOCK_SIZE)
- bsize = nbytes;
-
- crypto_inc(walk->iv, AES_BLOCK_SIZE);
- __aes_arm64_encrypt(ctx->key_enc, buf, walk->iv,
- num_rounds(ctx));
- __aes_arm64_encrypt(ctx->key_enc, mac, mac,
- num_rounds(ctx));
- if (enc)
- crypto_xor(mac, src, bsize);
- crypto_xor_cpy(dst, src, buf, bsize);
- if (!enc)
- crypto_xor(mac, dst, bsize);
- dst += bsize;
- src += bsize;
- nbytes -= bsize;
- } while (--blocks);
-
- err = skcipher_walk_done(walk, tail);
- }
-
- if (!err) {
- __aes_arm64_encrypt(ctx->key_enc, buf, iv0, num_rounds(ctx));
- __aes_arm64_encrypt(ctx->key_enc, mac, mac, num_rounds(ctx));
- crypto_xor(mac, buf, AES_BLOCK_SIZE);
- }
- return err;
-}
-
static int ccm_encrypt(struct aead_request *req)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
@@ -242,48 +151,47 @@ static int ccm_encrypt(struct aead_request *req)
u8 __aligned(8) mac[AES_BLOCK_SIZE];
u8 buf[AES_BLOCK_SIZE];
u32 len = req->cryptlen;
- bool use_neon = may_use_simd();
int err;
err = ccm_init_mac(req, mac, len);
if (err)
return err;
- if (likely(use_neon))
- kernel_neon_begin();
+ /* preserve the original iv for the final round */
+ memcpy(buf, req->iv, AES_BLOCK_SIZE);
+
+ err = skcipher_walk_aead_encrypt(&walk, req, false);
+ if (unlikely(err))
+ return err;
+
+ kernel_neon_begin();
if (req->assoclen)
- ccm_calculate_auth_mac(req, mac, use_neon);
+ ccm_calculate_auth_mac(req, mac);
- /* preserve the original iv for the final round */
- memcpy(buf, req->iv, AES_BLOCK_SIZE);
+ do {
+ u32 tail = walk.nbytes % AES_BLOCK_SIZE;
- err = skcipher_walk_aead_encrypt(&walk, req, true);
+ if (walk.nbytes == walk.total)
+ tail = 0;
- if (likely(use_neon)) {
- while (walk.nbytes) {
- u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+ ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes - tail, ctx->key_enc,
+ num_rounds(ctx), mac, walk.iv);
- if (walk.nbytes == walk.total)
- tail = 0;
+ if (walk.nbytes == walk.total)
+ ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
- ce_aes_ccm_encrypt(walk.dst.virt.addr,
- walk.src.virt.addr,
- walk.nbytes - tail, ctx->key_enc,
- num_rounds(ctx), mac, walk.iv);
+ kernel_neon_end();
+ if (walk.nbytes) {
err = skcipher_walk_done(&walk, tail);
+ if (unlikely(err))
+ return err;
+ if (unlikely(walk.nbytes))
+ kernel_neon_begin();
}
- if (!err)
- ce_aes_ccm_final(mac, buf, ctx->key_enc,
- num_rounds(ctx));
-
- kernel_neon_end();
- } else {
- err = ccm_crypt_fallback(&walk, mac, buf, ctx, true);
- }
- if (err)
- return err;
+ } while (walk.nbytes);
/* copy authtag to end of dst */
scatterwalk_map_and_copy(mac, req->dst, req->assoclen + req->cryptlen,
@@ -301,49 +209,47 @@ static int ccm_decrypt(struct aead_request *req)
u8 __aligned(8) mac[AES_BLOCK_SIZE];
u8 buf[AES_BLOCK_SIZE];
u32 len = req->cryptlen - authsize;
- bool use_neon = may_use_simd();
int err;
err = ccm_init_mac(req, mac, len);
if (err)
return err;
- if (likely(use_neon))
- kernel_neon_begin();
-
- if (req->assoclen)
- ccm_calculate_auth_mac(req, mac, use_neon);
-
/* preserve the original iv for the final round */
memcpy(buf, req->iv, AES_BLOCK_SIZE);
- err = skcipher_walk_aead_decrypt(&walk, req, true);
+ err = skcipher_walk_aead_decrypt(&walk, req, false);
+ if (unlikely(err))
+ return err;
- if (likely(use_neon)) {
- while (walk.nbytes) {
- u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+ kernel_neon_begin();
- if (walk.nbytes == walk.total)
- tail = 0;
+ if (req->assoclen)
+ ccm_calculate_auth_mac(req, mac);
- ce_aes_ccm_decrypt(walk.dst.virt.addr,
- walk.src.virt.addr,
- walk.nbytes - tail, ctx->key_enc,
- num_rounds(ctx), mac, walk.iv);
+ do {
+ u32 tail = walk.nbytes % AES_BLOCK_SIZE;
- err = skcipher_walk_done(&walk, tail);
- }
- if (!err)
- ce_aes_ccm_final(mac, buf, ctx->key_enc,
- num_rounds(ctx));
+ if (walk.nbytes == walk.total)
+ tail = 0;
+
+ ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes - tail, ctx->key_enc,
+ num_rounds(ctx), mac, walk.iv);
+
+ if (walk.nbytes == walk.total)
+ ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
kernel_neon_end();
- } else {
- err = ccm_crypt_fallback(&walk, mac, buf, ctx, false);
- }
- if (err)
- return err;
+ if (walk.nbytes) {
+ err = skcipher_walk_done(&walk, tail);
+ if (unlikely(err))
+ return err;
+ if (unlikely(walk.nbytes))
+ kernel_neon_begin();
+ }
+ } while (walk.nbytes);
/* compare calculated auth tag with the stored one */
scatterwalk_map_and_copy(buf, req->src,
@@ -375,7 +281,7 @@ static struct aead_alg ccm_aes_alg = {
static int __init aes_mod_init(void)
{
- if (!(elf_hwcap & HWCAP_AES))
+ if (!cpu_have_named_feature(AES))
return -ENODEV;
return crypto_register_aead(&ccm_aes_alg);
}
diff --git a/arch/arm64/crypto/aes-ce-core.S b/arch/arm64/crypto/aes-ce-core.S
new file mode 100644
index 000000000000..e52e13eb8fdb
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-core.S
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .arch armv8-a+crypto
+
+SYM_FUNC_START(__aes_ce_encrypt)
+ sub w3, w3, #2
+ ld1 {v0.16b}, [x2]
+ ld1 {v1.4s}, [x0], #16
+ cmp w3, #10
+ bmi 0f
+ bne 3f
+ mov v3.16b, v1.16b
+ b 2f
+0: mov v2.16b, v1.16b
+ ld1 {v3.4s}, [x0], #16
+1: aese v0.16b, v2.16b
+ aesmc v0.16b, v0.16b
+2: ld1 {v1.4s}, [x0], #16
+ aese v0.16b, v3.16b
+ aesmc v0.16b, v0.16b
+3: ld1 {v2.4s}, [x0], #16
+ subs w3, w3, #3
+ aese v0.16b, v1.16b
+ aesmc v0.16b, v0.16b
+ ld1 {v3.4s}, [x0], #16
+ bpl 1b
+ aese v0.16b, v2.16b
+ eor v0.16b, v0.16b, v3.16b
+ st1 {v0.16b}, [x1]
+ ret
+SYM_FUNC_END(__aes_ce_encrypt)
+
+SYM_FUNC_START(__aes_ce_decrypt)
+ sub w3, w3, #2
+ ld1 {v0.16b}, [x2]
+ ld1 {v1.4s}, [x0], #16
+ cmp w3, #10
+ bmi 0f
+ bne 3f
+ mov v3.16b, v1.16b
+ b 2f
+0: mov v2.16b, v1.16b
+ ld1 {v3.4s}, [x0], #16
+1: aesd v0.16b, v2.16b
+ aesimc v0.16b, v0.16b
+2: ld1 {v1.4s}, [x0], #16
+ aesd v0.16b, v3.16b
+ aesimc v0.16b, v0.16b
+3: ld1 {v2.4s}, [x0], #16
+ subs w3, w3, #3
+ aesd v0.16b, v1.16b
+ aesimc v0.16b, v0.16b
+ ld1 {v3.4s}, [x0], #16
+ bpl 1b
+ aesd v0.16b, v2.16b
+ eor v0.16b, v0.16b, v3.16b
+ st1 {v0.16b}, [x1]
+ ret
+SYM_FUNC_END(__aes_ce_decrypt)
+
+/*
+ * __aes_ce_sub() - use the aese instruction to perform the AES sbox
+ * substitution on each byte in 'input'
+ */
+SYM_FUNC_START(__aes_ce_sub)
+ dup v1.4s, w0
+ movi v0.16b, #0
+ aese v0.16b, v1.16b
+ umov w0, v0.s[0]
+ ret
+SYM_FUNC_END(__aes_ce_sub)
+
+SYM_FUNC_START(__aes_ce_invert)
+ ld1 {v0.4s}, [x1]
+ aesimc v1.16b, v0.16b
+ st1 {v1.4s}, [x0]
+ ret
+SYM_FUNC_END(__aes_ce_invert)
diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-glue.c
index 6a75cd75ed11..56a5f6f0b0c1 100644
--- a/arch/arm64/crypto/aes-ce-cipher.c
+++ b/arch/arm64/crypto/aes-ce-glue.c
@@ -1,17 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions
*
* Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/aes.h>
+#include <crypto/internal/simd.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/module.h>
@@ -22,13 +20,17 @@ MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
-asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
-asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
-
struct aes_block {
u8 b[AES_BLOCK_SIZE];
};
+asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+
+asmlinkage u32 __aes_ce_sub(u32 l);
+asmlinkage void __aes_ce_invert(struct aes_block *out,
+ const struct aes_block *in);
+
static int num_rounds(struct crypto_aes_ctx *ctx)
{
/*
@@ -44,123 +46,31 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
- struct aes_block *out = (struct aes_block *)dst;
- struct aes_block const *in = (struct aes_block *)src;
- void *dummy0;
- int dummy1;
- if (!may_use_simd()) {
- __aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
+ if (!crypto_simd_usable()) {
+ aes_encrypt(ctx, dst, src);
return;
}
kernel_neon_begin();
-
- __asm__(" ld1 {v0.16b}, %[in] ;"
- " ld1 {v1.4s}, [%[key]], #16 ;"
- " cmp %w[rounds], #10 ;"
- " bmi 0f ;"
- " bne 3f ;"
- " mov v3.16b, v1.16b ;"
- " b 2f ;"
- "0: mov v2.16b, v1.16b ;"
- " ld1 {v3.4s}, [%[key]], #16 ;"
- "1: aese v0.16b, v2.16b ;"
- " aesmc v0.16b, v0.16b ;"
- "2: ld1 {v1.4s}, [%[key]], #16 ;"
- " aese v0.16b, v3.16b ;"
- " aesmc v0.16b, v0.16b ;"
- "3: ld1 {v2.4s}, [%[key]], #16 ;"
- " subs %w[rounds], %w[rounds], #3 ;"
- " aese v0.16b, v1.16b ;"
- " aesmc v0.16b, v0.16b ;"
- " ld1 {v3.4s}, [%[key]], #16 ;"
- " bpl 1b ;"
- " aese v0.16b, v2.16b ;"
- " eor v0.16b, v0.16b, v3.16b ;"
- " st1 {v0.16b}, %[out] ;"
-
- : [out] "=Q"(*out),
- [key] "=r"(dummy0),
- [rounds] "=r"(dummy1)
- : [in] "Q"(*in),
- "1"(ctx->key_enc),
- "2"(num_rounds(ctx) - 2)
- : "cc");
-
+ __aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
kernel_neon_end();
}
static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
- struct aes_block *out = (struct aes_block *)dst;
- struct aes_block const *in = (struct aes_block *)src;
- void *dummy0;
- int dummy1;
- if (!may_use_simd()) {
- __aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
+ if (!crypto_simd_usable()) {
+ aes_decrypt(ctx, dst, src);
return;
}
kernel_neon_begin();
-
- __asm__(" ld1 {v0.16b}, %[in] ;"
- " ld1 {v1.4s}, [%[key]], #16 ;"
- " cmp %w[rounds], #10 ;"
- " bmi 0f ;"
- " bne 3f ;"
- " mov v3.16b, v1.16b ;"
- " b 2f ;"
- "0: mov v2.16b, v1.16b ;"
- " ld1 {v3.4s}, [%[key]], #16 ;"
- "1: aesd v0.16b, v2.16b ;"
- " aesimc v0.16b, v0.16b ;"
- "2: ld1 {v1.4s}, [%[key]], #16 ;"
- " aesd v0.16b, v3.16b ;"
- " aesimc v0.16b, v0.16b ;"
- "3: ld1 {v2.4s}, [%[key]], #16 ;"
- " subs %w[rounds], %w[rounds], #3 ;"
- " aesd v0.16b, v1.16b ;"
- " aesimc v0.16b, v0.16b ;"
- " ld1 {v3.4s}, [%[key]], #16 ;"
- " bpl 1b ;"
- " aesd v0.16b, v2.16b ;"
- " eor v0.16b, v0.16b, v3.16b ;"
- " st1 {v0.16b}, %[out] ;"
-
- : [out] "=Q"(*out),
- [key] "=r"(dummy0),
- [rounds] "=r"(dummy1)
- : [in] "Q"(*in),
- "1"(ctx->key_dec),
- "2"(num_rounds(ctx) - 2)
- : "cc");
-
+ __aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
kernel_neon_end();
}
-/*
- * aes_sub() - use the aese instruction to perform the AES sbox substitution
- * on each byte in 'input'
- */
-static u32 aes_sub(u32 input)
-{
- u32 ret;
-
- __asm__("dup v1.4s, %w[in] ;"
- "movi v0.16b, #0 ;"
- "aese v0.16b, v1.16b ;"
- "umov %w[out], v0.4s[0] ;"
-
- : [out] "=r"(ret)
- : [in] "r"(input)
- : "v0","v1");
-
- return ret;
-}
-
int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
unsigned int key_len)
{
@@ -189,7 +99,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
u32 *rki = ctx->key_enc + (i * kwords);
u32 *rko = rki + kwords;
- rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
+ rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
rko[1] = rko[0] ^ rki[1];
rko[2] = rko[1] ^ rki[2];
rko[3] = rko[2] ^ rki[3];
@@ -202,7 +112,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
} else if (key_len == AES_KEYSIZE_256) {
if (i >= 6)
break;
- rko[4] = aes_sub(rko[3]) ^ rki[4];
+ rko[4] = __aes_ce_sub(rko[3]) ^ rki[4];
rko[5] = rko[4] ^ rki[5];
rko[6] = rko[5] ^ rki[6];
rko[7] = rko[6] ^ rki[7];
@@ -221,13 +131,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
key_dec[0] = key_enc[j];
for (i = 1, j--; j > 0; i++, j--)
- __asm__("ld1 {v0.4s}, %[in] ;"
- "aesimc v1.16b, v0.16b ;"
- "st1 {v1.4s}, %[out] ;"
-
- : [out] "=Q"(key_dec[i])
- : [in] "Q"(key_enc[j])
- : "v0","v1");
+ __aes_ce_invert(key_dec + i, key_enc + j);
key_dec[i] = key_enc[0];
kernel_neon_end();
@@ -239,14 +143,8 @@ int ce_aes_setkey(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len)
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
- int ret;
-
- ret = ce_aes_expandkey(ctx, in_key, key_len);
- if (!ret)
- return 0;
- tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
+ return ce_aes_expandkey(ctx, in_key, key_len);
}
EXPORT_SYMBOL(ce_aes_setkey);
diff --git a/arch/arm64/crypto/aes-ce-setkey.h b/arch/arm64/crypto/aes-ce-setkey.h
index f08a6471d034..fd9ecf07d88c 100644
--- a/arch/arm64/crypto/aes-ce-setkey.h
+++ b/arch/arm64/crypto/aes-ce-setkey.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
int ce_aes_setkey(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len);
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
index 50330f5c3adc..1dc5bbbfeed2 100644
--- a/arch/arm64/crypto/aes-ce.S
+++ b/arch/arm64/crypto/aes-ce.S
@@ -1,22 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with
* Crypto Extensions
*
* Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
-#define AES_ENTRY(func) ENTRY(ce_ ## func)
-#define AES_ENDPROC(func) ENDPROC(ce_ ## func)
+#define AES_FUNC_START(func) SYM_FUNC_START(ce_ ## func)
+#define AES_FUNC_END(func) SYM_FUNC_END(ce_ ## func)
.arch armv8-a+crypto
+ xtsmask .req v16
+ cbciv .req v16
+ vctr .req v16
+
+ .macro xts_reload_mask, tmp
+ .endm
+
+ .macro xts_cts_skip_tw, reg, lbl
+ .endm
+
/* preload all round keys */
.macro load_round_keys, rounds, rk
cmp \rounds, #12
@@ -30,21 +37,24 @@
.endm
/* prepare for encryption with key in rk[] */
- .macro enc_prepare, rounds, rk, ignore
- load_round_keys \rounds, \rk
+ .macro enc_prepare, rounds, rk, temp
+ mov \temp, \rk
+ load_round_keys \rounds, \temp
.endm
/* prepare for encryption (again) but with new key in rk[] */
- .macro enc_switch_key, rounds, rk, ignore
- load_round_keys \rounds, \rk
+ .macro enc_switch_key, rounds, rk, temp
+ mov \temp, \rk
+ load_round_keys \rounds, \temp
.endm
/* prepare for decryption with key in rk[] */
- .macro dec_prepare, rounds, rk, ignore
- load_round_keys \rounds, \rk
+ .macro dec_prepare, rounds, rk, temp
+ mov \temp, \rk
+ load_round_keys \rounds, \temp
.endm
- .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3
+ .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3, i4
aes\de \i0\().16b, \k\().16b
aes\mc \i0\().16b, \i0\().16b
.ifnb \i1
@@ -55,27 +65,34 @@
aes\mc \i2\().16b, \i2\().16b
aes\de \i3\().16b, \k\().16b
aes\mc \i3\().16b, \i3\().16b
+ .ifnb \i4
+ aes\de \i4\().16b, \k\().16b
+ aes\mc \i4\().16b, \i4\().16b
+ .endif
.endif
.endif
.endm
- /* up to 4 interleaved encryption rounds with the same round key */
- .macro round_Nx, enc, k, i0, i1, i2, i3
+ /* up to 5 interleaved encryption rounds with the same round key */
+ .macro round_Nx, enc, k, i0, i1, i2, i3, i4
.ifc \enc, e
- do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3
+ do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3, \i4
.else
- do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3
+ do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3, \i4
.endif
.endm
- /* up to 4 interleaved final rounds */
- .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3
+ /* up to 5 interleaved final rounds */
+ .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3, i4
aes\de \i0\().16b, \k\().16b
.ifnb \i1
aes\de \i1\().16b, \k\().16b
.ifnb \i3
aes\de \i2\().16b, \k\().16b
aes\de \i3\().16b, \k\().16b
+ .ifnb \i4
+ aes\de \i4\().16b, \k\().16b
+ .endif
.endif
.endif
eor \i0\().16b, \i0\().16b, \k2\().16b
@@ -84,47 +101,52 @@
.ifnb \i3
eor \i2\().16b, \i2\().16b, \k2\().16b
eor \i3\().16b, \i3\().16b, \k2\().16b
+ .ifnb \i4
+ eor \i4\().16b, \i4\().16b, \k2\().16b
+ .endif
.endif
.endif
.endm
- /* up to 4 interleaved blocks */
- .macro do_block_Nx, enc, rounds, i0, i1, i2, i3
+ /* up to 5 interleaved blocks */
+ .macro do_block_Nx, enc, rounds, i0, i1, i2, i3, i4
cmp \rounds, #12
blo 2222f /* 128 bits */
beq 1111f /* 192 bits */
- round_Nx \enc, v17, \i0, \i1, \i2, \i3
- round_Nx \enc, v18, \i0, \i1, \i2, \i3
-1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3
- round_Nx \enc, v20, \i0, \i1, \i2, \i3
+ round_Nx \enc, v17, \i0, \i1, \i2, \i3, \i4
+ round_Nx \enc, v18, \i0, \i1, \i2, \i3, \i4
+1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3, \i4
+ round_Nx \enc, v20, \i0, \i1, \i2, \i3, \i4
2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
- round_Nx \enc, \key, \i0, \i1, \i2, \i3
+ round_Nx \enc, \key, \i0, \i1, \i2, \i3, \i4
.endr
- fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3
+ fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3, \i4
.endm
.macro encrypt_block, in, rounds, t0, t1, t2
do_block_Nx e, \rounds, \in
.endm
- .macro encrypt_block2x, i0, i1, rounds, t0, t1, t2
- do_block_Nx e, \rounds, \i0, \i1
- .endm
-
.macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
do_block_Nx e, \rounds, \i0, \i1, \i2, \i3
.endm
- .macro decrypt_block, in, rounds, t0, t1, t2
- do_block_Nx d, \rounds, \in
+ .macro encrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2
+ do_block_Nx e, \rounds, \i0, \i1, \i2, \i3, \i4
.endm
- .macro decrypt_block2x, i0, i1, rounds, t0, t1, t2
- do_block_Nx d, \rounds, \i0, \i1
+ .macro decrypt_block, in, rounds, t0, t1, t2
+ do_block_Nx d, \rounds, \in
.endm
.macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
do_block_Nx d, \rounds, \i0, \i1, \i2, \i3
.endm
+ .macro decrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2
+ do_block_Nx d, \rounds, \i0, \i1, \i2, \i3, \i4
+ .endm
+
+#define MAX_STRIDE 5
+
#include "aes-modes.S"
diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S
index 6d2445d603cc..c9d6955f8404 100644
--- a/arch/arm64/crypto/aes-cipher-core.S
+++ b/arch/arm64/crypto/aes-cipher-core.S
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Scalar AES core transform
*
* Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/linkage.h>
@@ -125,48 +122,11 @@ CPU_BE( rev w7, w7 )
ret
.endm
- .align L1_CACHE_SHIFT
- .type __aes_arm64_inverse_sbox, %object
-__aes_arm64_inverse_sbox:
- .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
- .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
- .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
- .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
- .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
- .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
- .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
- .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
- .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
- .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
- .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
- .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
- .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
- .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
- .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
- .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
- .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
- .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
- .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
- .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
- .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
- .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
- .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
- .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
- .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
- .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
- .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
- .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
- .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
- .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
- .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
- .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
- .size __aes_arm64_inverse_sbox, . - __aes_arm64_inverse_sbox
-
-ENTRY(__aes_arm64_encrypt)
+SYM_FUNC_START(__aes_arm64_encrypt)
do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2
-ENDPROC(__aes_arm64_encrypt)
+SYM_FUNC_END(__aes_arm64_encrypt)
.align 5
-ENTRY(__aes_arm64_decrypt)
- do_crypt iround, crypto_it_tab, __aes_arm64_inverse_sbox, 0
-ENDPROC(__aes_arm64_decrypt)
+SYM_FUNC_START(__aes_arm64_decrypt)
+ do_crypt iround, crypto_it_tab, crypto_aes_inv_sbox, 0
+SYM_FUNC_END(__aes_arm64_decrypt)
diff --git a/arch/arm64/crypto/aes-cipher-glue.c b/arch/arm64/crypto/aes-cipher-glue.c
index 7288e7cbebff..8caf6dfefce8 100644
--- a/arch/arm64/crypto/aes-cipher-glue.c
+++ b/arch/arm64/crypto/aes-cipher-glue.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Scalar AES core transform
*
* Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <crypto/aes.h>
@@ -13,12 +10,9 @@
#include <linux/module.h>
asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
-EXPORT_SYMBOL(__aes_arm64_encrypt);
-
asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
-EXPORT_SYMBOL(__aes_arm64_decrypt);
-static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+static void aes_arm64_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
int rounds = 6 + ctx->key_length / 4;
@@ -26,7 +20,7 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
__aes_arm64_encrypt(ctx->key_enc, out, in, rounds);
}
-static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+static void aes_arm64_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
int rounds = 6 + ctx->key_length / 4;
@@ -46,8 +40,8 @@ static struct crypto_alg aes_alg = {
.cra_cipher.cia_min_keysize = AES_MIN_KEY_SIZE,
.cra_cipher.cia_max_keysize = AES_MAX_KEY_SIZE,
.cra_cipher.cia_setkey = crypto_aes_set_key,
- .cra_cipher.cia_encrypt = aes_encrypt,
- .cra_cipher.cia_decrypt = aes_decrypt
+ .cra_cipher.cia_encrypt = aes_arm64_encrypt,
+ .cra_cipher.cia_decrypt = aes_arm64_decrypt
};
static int __init aes_init(void)
diff --git a/arch/arm64/crypto/aes-ctr-fallback.h b/arch/arm64/crypto/aes-ctr-fallback.h
deleted file mode 100644
index c9285717b6b5..000000000000
--- a/arch/arm64/crypto/aes-ctr-fallback.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Fallback for sync aes(ctr) in contexts where kernel mode NEON
- * is not allowed
- *
- * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <crypto/aes.h>
-#include <crypto/internal/skcipher.h>
-
-asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
-
-static inline int aes_ctr_encrypt_fallback(struct crypto_aes_ctx *ctx,
- struct skcipher_request *req)
-{
- struct skcipher_walk walk;
- u8 buf[AES_BLOCK_SIZE];
- int err;
-
- err = skcipher_walk_virt(&walk, req, true);
-
- while (walk.nbytes > 0) {
- u8 *dst = walk.dst.virt.addr;
- u8 *src = walk.src.virt.addr;
- int nbytes = walk.nbytes;
- int tail = 0;
-
- if (nbytes < walk.total) {
- nbytes = round_down(nbytes, AES_BLOCK_SIZE);
- tail = walk.nbytes % AES_BLOCK_SIZE;
- }
-
- do {
- int bsize = min(nbytes, AES_BLOCK_SIZE);
-
- __aes_arm64_encrypt(ctx->key_enc, buf, walk.iv,
- 6 + ctx->key_length / 4);
- crypto_xor_cpy(dst, src, buf, bsize);
- crypto_inc(walk.iv, AES_BLOCK_SIZE);
-
- dst += AES_BLOCK_SIZE;
- src += AES_BLOCK_SIZE;
- nbytes -= AES_BLOCK_SIZE;
- } while (nbytes > 0);
-
- err = skcipher_walk_done(&walk, tail);
- }
- return err;
-}
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
index 998ba519a026..162787c7aa86 100644
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -1,97 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES
*
* Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <asm/neon.h>
#include <asm/hwcap.h>
#include <asm/simd.h>
#include <crypto/aes.h>
+#include <crypto/ctr.h>
+#include <crypto/sha2.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
#include <crypto/xts.h>
#include "aes-ce-setkey.h"
-#include "aes-ctr-fallback.h"
#ifdef USE_V8_CRYPTO_EXTENSIONS
#define MODE "ce"
#define PRIO 300
-#define aes_setkey ce_aes_setkey
#define aes_expandkey ce_aes_expandkey
#define aes_ecb_encrypt ce_aes_ecb_encrypt
#define aes_ecb_decrypt ce_aes_ecb_decrypt
#define aes_cbc_encrypt ce_aes_cbc_encrypt
#define aes_cbc_decrypt ce_aes_cbc_decrypt
+#define aes_cbc_cts_encrypt ce_aes_cbc_cts_encrypt
+#define aes_cbc_cts_decrypt ce_aes_cbc_cts_decrypt
+#define aes_essiv_cbc_encrypt ce_aes_essiv_cbc_encrypt
+#define aes_essiv_cbc_decrypt ce_aes_essiv_cbc_decrypt
#define aes_ctr_encrypt ce_aes_ctr_encrypt
+#define aes_xctr_encrypt ce_aes_xctr_encrypt
#define aes_xts_encrypt ce_aes_xts_encrypt
#define aes_xts_decrypt ce_aes_xts_decrypt
#define aes_mac_update ce_aes_mac_update
-MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
+MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS/XCTR using ARMv8 Crypto Extensions");
#else
#define MODE "neon"
#define PRIO 200
-#define aes_setkey crypto_aes_set_key
-#define aes_expandkey crypto_aes_expand_key
#define aes_ecb_encrypt neon_aes_ecb_encrypt
#define aes_ecb_decrypt neon_aes_ecb_decrypt
#define aes_cbc_encrypt neon_aes_cbc_encrypt
#define aes_cbc_decrypt neon_aes_cbc_decrypt
+#define aes_cbc_cts_encrypt neon_aes_cbc_cts_encrypt
+#define aes_cbc_cts_decrypt neon_aes_cbc_cts_decrypt
+#define aes_essiv_cbc_encrypt neon_aes_essiv_cbc_encrypt
+#define aes_essiv_cbc_decrypt neon_aes_essiv_cbc_decrypt
#define aes_ctr_encrypt neon_aes_ctr_encrypt
+#define aes_xctr_encrypt neon_aes_xctr_encrypt
#define aes_xts_encrypt neon_aes_xts_encrypt
#define aes_xts_decrypt neon_aes_xts_decrypt
#define aes_mac_update neon_aes_mac_update
-MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
+MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS/XCTR using ARMv8 NEON");
+#endif
+#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS)
MODULE_ALIAS_CRYPTO("ecb(aes)");
MODULE_ALIAS_CRYPTO("cbc(aes)");
MODULE_ALIAS_CRYPTO("ctr(aes)");
MODULE_ALIAS_CRYPTO("xts(aes)");
+MODULE_ALIAS_CRYPTO("xctr(aes)");
+#endif
+MODULE_ALIAS_CRYPTO("cts(cbc(aes))");
+MODULE_ALIAS_CRYPTO("essiv(cbc(aes),sha256)");
MODULE_ALIAS_CRYPTO("cmac(aes)");
MODULE_ALIAS_CRYPTO("xcbc(aes)");
MODULE_ALIAS_CRYPTO("cbcmac(aes)");
-#endif
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
/* defined in aes-modes.S */
-asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
- int rounds, int blocks, int first);
-asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
- int rounds, int blocks, int first);
+asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[],
+ int rounds, int blocks);
+asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[],
+ int rounds, int blocks);
-asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
- int rounds, int blocks, u8 iv[], int first);
-asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
- int rounds, int blocks, u8 iv[], int first);
+asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
+ int rounds, int blocks, u8 iv[]);
+asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[],
+ int rounds, int blocks, u8 iv[]);
-asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
- int rounds, int blocks, u8 ctr[], int first);
+asmlinkage void aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
+ int rounds, int bytes, u8 const iv[]);
+asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
+ int rounds, int bytes, u8 const iv[]);
-asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[],
- int rounds, int blocks, u8 const rk2[], u8 iv[],
+asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
+ int rounds, int bytes, u8 ctr[]);
+
+asmlinkage void aes_xctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
+ int rounds, int bytes, u8 ctr[], int byte_ctr);
+
+asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[],
+ int rounds, int bytes, u32 const rk2[], u8 iv[],
int first);
-asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
- int rounds, int blocks, u8 const rk2[], u8 iv[],
+asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[],
+ int rounds, int bytes, u32 const rk2[], u8 iv[],
int first);
-asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds,
- int blocks, u8 dg[], int enc_before,
- int enc_after);
+asmlinkage void aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
+ int rounds, int blocks, u8 iv[],
+ u32 const rk2[]);
+asmlinkage void aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
+ int rounds, int blocks, u8 iv[],
+ u32 const rk2[]);
+
+asmlinkage int aes_mac_update(u8 const in[], u32 const rk[], int rounds,
+ int blocks, u8 dg[], int enc_before,
+ int enc_after);
struct crypto_aes_xts_ctx {
struct crypto_aes_ctx key1;
struct crypto_aes_ctx __aligned(8) key2;
};
+struct crypto_aes_essiv_cbc_ctx {
+ struct crypto_aes_ctx key1;
+ struct crypto_aes_ctx __aligned(8) key2;
+ struct crypto_shash *hash;
+};
+
struct mac_tfm_ctx {
struct crypto_aes_ctx key;
u8 __aligned(8) consts[];
@@ -105,11 +137,13 @@ struct mac_desc_ctx {
static int skcipher_aes_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
unsigned int key_len)
{
- return aes_setkey(crypto_skcipher_tfm(tfm), in_key, key_len);
+ struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return aes_expandkey(ctx, in_key, key_len);
}
-static int xts_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
- unsigned int key_len)
+static int __maybe_unused xts_set_key(struct crypto_skcipher *tfm,
+ const u8 *in_key, unsigned int key_len)
{
struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
int ret;
@@ -122,196 +156,539 @@ static int xts_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
if (!ret)
ret = aes_expandkey(&ctx->key2, &in_key[key_len / 2],
key_len / 2);
- if (!ret)
- return 0;
+ return ret;
+}
+
+static int __maybe_unused essiv_cbc_set_key(struct crypto_skcipher *tfm,
+ const u8 *in_key,
+ unsigned int key_len)
+{
+ struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+ u8 digest[SHA256_DIGEST_SIZE];
+ int ret;
+
+ ret = aes_expandkey(&ctx->key1, in_key, key_len);
+ if (ret)
+ return ret;
+
+ crypto_shash_tfm_digest(ctx->hash, in_key, key_len, digest);
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
+ return aes_expandkey(&ctx->key2, digest, sizeof(digest));
}
-static int ecb_encrypt(struct skcipher_request *req)
+static int __maybe_unused ecb_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err, first, rounds = 6 + ctx->key_length / 4;
+ int err, rounds = 6 + ctx->key_length / 4;
struct skcipher_walk walk;
unsigned int blocks;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_neon_begin();
- for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+ while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
+ kernel_neon_begin();
aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key_enc, rounds, blocks, first);
+ ctx->key_enc, rounds, blocks);
+ kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
- kernel_neon_end();
return err;
}
-static int ecb_decrypt(struct skcipher_request *req)
+static int __maybe_unused ecb_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err, first, rounds = 6 + ctx->key_length / 4;
+ int err, rounds = 6 + ctx->key_length / 4;
struct skcipher_walk walk;
unsigned int blocks;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_neon_begin();
- for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+ while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
+ kernel_neon_begin();
aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key_dec, rounds, blocks, first);
+ ctx->key_dec, rounds, blocks);
+ kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
- kernel_neon_end();
return err;
}
-static int cbc_encrypt(struct skcipher_request *req)
+static int cbc_encrypt_walk(struct skcipher_request *req,
+ struct skcipher_walk *walk)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err, first, rounds = 6 + ctx->key_length / 4;
- struct skcipher_walk walk;
+ int err = 0, rounds = 6 + ctx->key_length / 4;
unsigned int blocks;
- err = skcipher_walk_virt(&walk, req, true);
+ while ((blocks = (walk->nbytes / AES_BLOCK_SIZE))) {
+ kernel_neon_begin();
+ aes_cbc_encrypt(walk->dst.virt.addr, walk->src.virt.addr,
+ ctx->key_enc, rounds, blocks, walk->iv);
+ kernel_neon_end();
+ err = skcipher_walk_done(walk, walk->nbytes % AES_BLOCK_SIZE);
+ }
+ return err;
+}
- kernel_neon_begin();
- for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
- aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key_enc, rounds, blocks, walk.iv,
- first);
- err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
+static int __maybe_unused cbc_encrypt(struct skcipher_request *req)
+{
+ struct skcipher_walk walk;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+ if (err)
+ return err;
+ return cbc_encrypt_walk(req, &walk);
+}
+
+static int cbc_decrypt_walk(struct skcipher_request *req,
+ struct skcipher_walk *walk)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int err = 0, rounds = 6 + ctx->key_length / 4;
+ unsigned int blocks;
+
+ while ((blocks = (walk->nbytes / AES_BLOCK_SIZE))) {
+ kernel_neon_begin();
+ aes_cbc_decrypt(walk->dst.virt.addr, walk->src.virt.addr,
+ ctx->key_dec, rounds, blocks, walk->iv);
+ kernel_neon_end();
+ err = skcipher_walk_done(walk, walk->nbytes % AES_BLOCK_SIZE);
}
- kernel_neon_end();
return err;
}
-static int cbc_decrypt(struct skcipher_request *req)
+static int __maybe_unused cbc_decrypt(struct skcipher_request *req)
+{
+ struct skcipher_walk walk;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+ if (err)
+ return err;
+ return cbc_decrypt_walk(req, &walk);
+}
+
+static int cts_cbc_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err, first, rounds = 6 + ctx->key_length / 4;
+ int err, rounds = 6 + ctx->key_length / 4;
+ int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+ struct scatterlist *src = req->src, *dst = req->dst;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
struct skcipher_walk walk;
- unsigned int blocks;
- err = skcipher_walk_virt(&walk, req, true);
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
+ NULL, NULL);
- kernel_neon_begin();
- for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
- aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key_dec, rounds, blocks, walk.iv,
- first);
- err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
+ if (req->cryptlen <= AES_BLOCK_SIZE) {
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+ cbc_blocks = 1;
}
+
+ if (cbc_blocks > 0) {
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false) ?:
+ cbc_encrypt_walk(&subreq, &walk);
+ if (err)
+ return err;
+
+ if (req->cryptlen == AES_BLOCK_SIZE)
+ return 0;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst,
+ subreq.cryptlen);
+ }
+
+ /* handle ciphertext stealing */
+ skcipher_request_set_crypt(&subreq, src, dst,
+ req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
+
+ kernel_neon_begin();
+ aes_cbc_cts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key_enc, rounds, walk.nbytes, walk.iv);
kernel_neon_end();
- return err;
+
+ return skcipher_walk_done(&walk, 0);
}
-static int ctr_encrypt(struct skcipher_request *req)
+static int cts_cbc_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err, first, rounds = 6 + ctx->key_length / 4;
+ int err, rounds = 6 + ctx->key_length / 4;
+ int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+ struct scatterlist *src = req->src, *dst = req->dst;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
struct skcipher_walk walk;
- int blocks;
- err = skcipher_walk_virt(&walk, req, true);
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
+ NULL, NULL);
+
+ if (req->cryptlen <= AES_BLOCK_SIZE) {
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+ cbc_blocks = 1;
+ }
+
+ if (cbc_blocks > 0) {
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false) ?:
+ cbc_decrypt_walk(&subreq, &walk);
+ if (err)
+ return err;
+
+ if (req->cryptlen == AES_BLOCK_SIZE)
+ return 0;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst,
+ subreq.cryptlen);
+ }
+
+ /* handle ciphertext stealing */
+ skcipher_request_set_crypt(&subreq, src, dst,
+ req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
- first = 1;
kernel_neon_begin();
- while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
- aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key_enc, rounds, blocks, walk.iv,
- first);
+ aes_cbc_cts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key_dec, rounds, walk.nbytes, walk.iv);
+ kernel_neon_end();
+
+ return skcipher_walk_done(&walk, 0);
+}
+
+static int __maybe_unused essiv_cbc_init_tfm(struct crypto_skcipher *tfm)
+{
+ struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ ctx->hash = crypto_alloc_shash("sha256", 0, 0);
+
+ return PTR_ERR_OR_ZERO(ctx->hash);
+}
+
+static void __maybe_unused essiv_cbc_exit_tfm(struct crypto_skcipher *tfm)
+{
+ struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ crypto_free_shash(ctx->hash);
+}
+
+static int __maybe_unused essiv_cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int err, rounds = 6 + ctx->key1.key_length / 4;
+ struct skcipher_walk walk;
+ unsigned int blocks;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ blocks = walk.nbytes / AES_BLOCK_SIZE;
+ if (blocks) {
+ kernel_neon_begin();
+ aes_essiv_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key1.key_enc, rounds, blocks,
+ req->iv, ctx->key2.key_enc);
+ kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
- first = 0;
}
- if (walk.nbytes) {
- u8 __aligned(8) tail[AES_BLOCK_SIZE];
+ return err ?: cbc_encrypt_walk(req, &walk);
+}
+
+static int __maybe_unused essiv_cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int err, rounds = 6 + ctx->key1.key_length / 4;
+ struct skcipher_walk walk;
+ unsigned int blocks;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ blocks = walk.nbytes / AES_BLOCK_SIZE;
+ if (blocks) {
+ kernel_neon_begin();
+ aes_essiv_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key1.key_dec, rounds, blocks,
+ req->iv, ctx->key2.key_enc);
+ kernel_neon_end();
+ err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
+ }
+ return err ?: cbc_decrypt_walk(req, &walk);
+}
+
+static int __maybe_unused xctr_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int err, rounds = 6 + ctx->key_length / 4;
+ struct skcipher_walk walk;
+ unsigned int byte_ctr = 0;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while (walk.nbytes > 0) {
+ const u8 *src = walk.src.virt.addr;
unsigned int nbytes = walk.nbytes;
- u8 *tdst = walk.dst.virt.addr;
- u8 *tsrc = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ u8 buf[AES_BLOCK_SIZE];
/*
- * Tell aes_ctr_encrypt() to process a tail block.
+ * If given less than 16 bytes, we must copy the partial block
+ * into a temporary buffer of 16 bytes to avoid out of bounds
+ * reads and writes. Furthermore, this code is somewhat unusual
+ * in that it expects the end of the data to be at the end of
+ * the temporary buffer, rather than the start of the data at
+ * the start of the temporary buffer.
*/
- blocks = -1;
+ if (unlikely(nbytes < AES_BLOCK_SIZE))
+ src = dst = memcpy(buf + sizeof(buf) - nbytes,
+ src, nbytes);
+ else if (nbytes < walk.total)
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
- aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, rounds,
- blocks, walk.iv, first);
- crypto_xor_cpy(tdst, tsrc, tail, nbytes);
- err = skcipher_walk_done(&walk, 0);
+ kernel_neon_begin();
+ aes_xctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes,
+ walk.iv, byte_ctr);
+ kernel_neon_end();
+
+ if (unlikely(nbytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr,
+ buf + sizeof(buf) - nbytes, nbytes);
+ byte_ctr += nbytes;
+
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
- kernel_neon_end();
return err;
}
-static int ctr_encrypt_sync(struct skcipher_request *req)
+static int __maybe_unused ctr_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int err, rounds = 6 + ctx->key_length / 4;
+ struct skcipher_walk walk;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while (walk.nbytes > 0) {
+ const u8 *src = walk.src.virt.addr;
+ unsigned int nbytes = walk.nbytes;
+ u8 *dst = walk.dst.virt.addr;
+ u8 buf[AES_BLOCK_SIZE];
+
+ /*
+ * If given less than 16 bytes, we must copy the partial block
+ * into a temporary buffer of 16 bytes to avoid out of bounds
+ * reads and writes. Furthermore, this code is somewhat unusual
+ * in that it expects the end of the data to be at the end of
+ * the temporary buffer, rather than the start of the data at
+ * the start of the temporary buffer.
+ */
+ if (unlikely(nbytes < AES_BLOCK_SIZE))
+ src = dst = memcpy(buf + sizeof(buf) - nbytes,
+ src, nbytes);
+ else if (nbytes < walk.total)
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
+
+ kernel_neon_begin();
+ aes_ctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes,
+ walk.iv);
+ kernel_neon_end();
- if (!may_use_simd())
- return aes_ctr_encrypt_fallback(ctx, req);
+ if (unlikely(nbytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr,
+ buf + sizeof(buf) - nbytes, nbytes);
- return ctr_encrypt(req);
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ }
+
+ return err;
}
-static int xts_encrypt(struct skcipher_request *req)
+static int __maybe_unused xts_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, first, rounds = 6 + ctx->key1.key_length / 4;
+ int tail = req->cryptlen % AES_BLOCK_SIZE;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct scatterlist *src, *dst;
struct skcipher_walk walk;
- unsigned int blocks;
- err = skcipher_walk_virt(&walk, req, true);
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
- kernel_neon_begin();
- for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+ err = skcipher_walk_virt(&walk, req, false);
+
+ if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
+ int xts_blocks = DIV_ROUND_UP(req->cryptlen,
+ AES_BLOCK_SIZE) - 2;
+
+ skcipher_walk_abort(&walk);
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq,
+ skcipher_request_flags(req),
+ NULL, NULL);
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ xts_blocks * AES_BLOCK_SIZE,
+ req->iv);
+ req = &subreq;
+ err = skcipher_walk_virt(&walk, req, false);
+ } else {
+ tail = 0;
+ }
+
+ for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) {
+ int nbytes = walk.nbytes;
+
+ if (walk.nbytes < walk.total)
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
+
+ kernel_neon_begin();
aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key1.key_enc, rounds, blocks,
- (u8 *)ctx->key2.key_enc, walk.iv, first);
- err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
+ ctx->key1.key_enc, rounds, nbytes,
+ ctx->key2.key_enc, walk.iv, first);
+ kernel_neon_end();
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
+
+ if (err || likely(!tail))
+ return err;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
+
+ skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
+
+ kernel_neon_begin();
+ aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key1.key_enc, rounds, walk.nbytes,
+ ctx->key2.key_enc, walk.iv, first);
kernel_neon_end();
- return err;
+ return skcipher_walk_done(&walk, 0);
}
-static int xts_decrypt(struct skcipher_request *req)
+static int __maybe_unused xts_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, first, rounds = 6 + ctx->key1.key_length / 4;
+ int tail = req->cryptlen % AES_BLOCK_SIZE;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct scatterlist *src, *dst;
struct skcipher_walk walk;
- unsigned int blocks;
- err = skcipher_walk_virt(&walk, req, true);
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
- kernel_neon_begin();
- for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+ err = skcipher_walk_virt(&walk, req, false);
+
+ if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
+ int xts_blocks = DIV_ROUND_UP(req->cryptlen,
+ AES_BLOCK_SIZE) - 2;
+
+ skcipher_walk_abort(&walk);
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq,
+ skcipher_request_flags(req),
+ NULL, NULL);
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ xts_blocks * AES_BLOCK_SIZE,
+ req->iv);
+ req = &subreq;
+ err = skcipher_walk_virt(&walk, req, false);
+ } else {
+ tail = 0;
+ }
+
+ for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) {
+ int nbytes = walk.nbytes;
+
+ if (walk.nbytes < walk.total)
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
+
+ kernel_neon_begin();
aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key1.key_dec, rounds, blocks,
- (u8 *)ctx->key2.key_enc, walk.iv, first);
- err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
+ ctx->key1.key_dec, rounds, nbytes,
+ ctx->key2.key_enc, walk.iv, first);
+ kernel_neon_end();
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
+
+ if (err || likely(!tail))
+ return err;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
+
+ skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
+
+
+ kernel_neon_begin();
+ aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key1.key_dec, rounds, walk.nbytes,
+ ctx->key2.key_enc, walk.iv, first);
kernel_neon_end();
- return err;
+ return skcipher_walk_done(&walk, 0);
}
static struct skcipher_alg aes_algs[] = { {
+#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS)
.base = {
- .cra_name = "__ecb(aes)",
- .cra_driver_name = "__ecb-aes-" MODE,
+ .cra_name = "ecb(aes)",
+ .cra_driver_name = "ecb-aes-" MODE,
.cra_priority = PRIO,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
@@ -323,10 +700,9 @@ static struct skcipher_alg aes_algs[] = { {
.decrypt = ecb_decrypt,
}, {
.base = {
- .cra_name = "__cbc(aes)",
- .cra_driver_name = "__cbc-aes-" MODE,
+ .cra_name = "cbc(aes)",
+ .cra_driver_name = "cbc-aes-" MODE,
.cra_priority = PRIO,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
@@ -339,10 +715,9 @@ static struct skcipher_alg aes_algs[] = { {
.decrypt = cbc_decrypt,
}, {
.base = {
- .cra_name = "__ctr(aes)",
- .cra_driver_name = "__ctr-aes-" MODE,
+ .cra_name = "ctr(aes)",
+ .cra_driver_name = "ctr-aes-" MODE,
.cra_priority = PRIO,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
@@ -356,9 +731,9 @@ static struct skcipher_alg aes_algs[] = { {
.decrypt = ctr_encrypt,
}, {
.base = {
- .cra_name = "ctr(aes)",
- .cra_driver_name = "ctr-aes-" MODE,
- .cra_priority = PRIO - 1,
+ .cra_name = "xctr(aes)",
+ .cra_driver_name = "xctr-aes-" MODE,
+ .cra_priority = PRIO,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
@@ -368,14 +743,13 @@ static struct skcipher_alg aes_algs[] = { {
.ivsize = AES_BLOCK_SIZE,
.chunksize = AES_BLOCK_SIZE,
.setkey = skcipher_aes_setkey,
- .encrypt = ctr_encrypt_sync,
- .decrypt = ctr_encrypt_sync,
+ .encrypt = xctr_encrypt,
+ .decrypt = xctr_encrypt,
}, {
.base = {
- .cra_name = "__xts(aes)",
- .cra_driver_name = "__xts-aes-" MODE,
+ .cra_name = "xts(aes)",
+ .cra_driver_name = "xts-aes-" MODE,
.cra_priority = PRIO,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_xts_ctx),
.cra_module = THIS_MODULE,
@@ -383,22 +757,52 @@ static struct skcipher_alg aes_algs[] = { {
.min_keysize = 2 * AES_MIN_KEY_SIZE,
.max_keysize = 2 * AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
+ .walksize = 2 * AES_BLOCK_SIZE,
.setkey = xts_set_key,
.encrypt = xts_encrypt,
.decrypt = xts_decrypt,
+}, {
+#endif
+ .base = {
+ .cra_name = "cts(cbc(aes))",
+ .cra_driver_name = "cts-cbc-aes-" MODE,
+ .cra_priority = PRIO,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crypto_aes_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .walksize = 2 * AES_BLOCK_SIZE,
+ .setkey = skcipher_aes_setkey,
+ .encrypt = cts_cbc_encrypt,
+ .decrypt = cts_cbc_decrypt,
+}, {
+ .base = {
+ .cra_name = "essiv(cbc(aes),sha256)",
+ .cra_driver_name = "essiv-cbc-aes-sha256-" MODE,
+ .cra_priority = PRIO + 1,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crypto_aes_essiv_cbc_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = essiv_cbc_set_key,
+ .encrypt = essiv_cbc_encrypt,
+ .decrypt = essiv_cbc_decrypt,
+ .init = essiv_cbc_init_tfm,
+ .exit = essiv_cbc_exit_tfm,
} };
static int cbcmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
unsigned int key_len)
{
struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
- int err;
-
- err = aes_expandkey(&ctx->key, in_key, key_len);
- if (err)
- crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return err;
+ return aes_expandkey(&ctx->key, in_key, key_len);
}
static void cmac_gf128_mul_by_x(be128 *y, const be128 *x)
@@ -415,7 +819,6 @@ static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
{
struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
be128 *consts = (be128 *)ctx->consts;
- u8 *rk = (u8 *)ctx->key.key_enc;
int rounds = 6 + key_len / 4;
int err;
@@ -425,7 +828,8 @@ static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
/* encrypt the zero vector */
kernel_neon_begin();
- aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, rk, rounds, 1, 1);
+ aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, ctx->key.key_enc,
+ rounds, 1);
kernel_neon_end();
cmac_gf128_mul_by_x(consts, consts);
@@ -444,7 +848,6 @@ static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
};
struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
- u8 *rk = (u8 *)ctx->key.key_enc;
int rounds = 6 + key_len / 4;
u8 key[AES_BLOCK_SIZE];
int err;
@@ -454,8 +857,8 @@ static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
return err;
kernel_neon_begin();
- aes_ecb_encrypt(key, ks[0], rk, rounds, 1, 1);
- aes_ecb_encrypt(ctx->consts, ks[1], rk, rounds, 2, 0);
+ aes_ecb_encrypt(key, ks[0], ctx->key.key_enc, rounds, 1);
+ aes_ecb_encrypt(ctx->consts, ks[1], ctx->key.key_enc, rounds, 2);
kernel_neon_end();
return cbcmac_setkey(tfm, key, sizeof(key));
@@ -476,22 +879,28 @@ static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks,
{
int rounds = 6 + ctx->key_length / 4;
- if (may_use_simd()) {
- kernel_neon_begin();
- aes_mac_update(in, ctx->key_enc, rounds, blocks, dg, enc_before,
- enc_after);
- kernel_neon_end();
+ if (crypto_simd_usable()) {
+ int rem;
+
+ do {
+ kernel_neon_begin();
+ rem = aes_mac_update(in, ctx->key_enc, rounds, blocks,
+ dg, enc_before, enc_after);
+ kernel_neon_end();
+ in += (blocks - rem) * AES_BLOCK_SIZE;
+ blocks = rem;
+ enc_before = 0;
+ } while (blocks);
} else {
if (enc_before)
- __aes_arm64_encrypt(ctx->key_enc, dg, dg, rounds);
+ aes_encrypt(ctx, dg, dg);
while (blocks--) {
crypto_xor(dg, in, AES_BLOCK_SIZE);
in += AES_BLOCK_SIZE;
if (blocks || enc_after)
- __aes_arm64_encrypt(ctx->key_enc, dg, dg,
- rounds);
+ aes_encrypt(ctx, dg, dg);
}
}
}
@@ -541,7 +950,7 @@ static int cbcmac_final(struct shash_desc *desc, u8 *out)
struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
- mac_do_update(&tctx->key, NULL, 0, ctx->dg, 1, 0);
+ mac_do_update(&tctx->key, NULL, 0, ctx->dg, (ctx->len != 0), 0);
memcpy(out, ctx->dg, AES_BLOCK_SIZE);
@@ -570,7 +979,6 @@ static struct shash_alg mac_algs[] = { {
.base.cra_name = "cmac(aes)",
.base.cra_driver_name = "cmac-aes-" MODE,
.base.cra_priority = PRIO,
- .base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = AES_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct mac_tfm_ctx) +
2 * AES_BLOCK_SIZE,
@@ -586,7 +994,6 @@ static struct shash_alg mac_algs[] = { {
.base.cra_name = "xcbc(aes)",
.base.cra_driver_name = "xcbc-aes-" MODE,
.base.cra_priority = PRIO,
- .base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = AES_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct mac_tfm_ctx) +
2 * AES_BLOCK_SIZE,
@@ -602,7 +1009,6 @@ static struct shash_alg mac_algs[] = { {
.base.cra_name = "cbcmac(aes)",
.base.cra_driver_name = "cbcmac-aes-" MODE,
.base.cra_priority = PRIO,
- .base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct mac_tfm_ctx),
.base.cra_module = THIS_MODULE,
@@ -615,28 +1021,15 @@ static struct shash_alg mac_algs[] = { {
.descsize = sizeof(struct mac_desc_ctx),
} };
-static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
-
static void aes_exit(void)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
- if (aes_simd_algs[i])
- simd_skcipher_free(aes_simd_algs[i]);
-
crypto_unregister_shashes(mac_algs, ARRAY_SIZE(mac_algs));
crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
}
static int __init aes_init(void)
{
- struct simd_skcipher_alg *simd;
- const char *basename;
- const char *algname;
- const char *drvname;
int err;
- int i;
err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
if (err)
@@ -646,25 +1039,8 @@ static int __init aes_init(void)
if (err)
goto unregister_ciphers;
- for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
- if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
- continue;
-
- algname = aes_algs[i].base.cra_name + 2;
- drvname = aes_algs[i].base.cra_driver_name + 2;
- basename = aes_algs[i].base.cra_driver_name;
- simd = simd_skcipher_create_compat(algname, drvname, basename);
- err = PTR_ERR(simd);
- if (IS_ERR(simd))
- goto unregister_simds;
-
- aes_simd_algs[i] = simd;
- }
-
return 0;
-unregister_simds:
- aes_exit();
unregister_ciphers:
crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
return err;
@@ -676,5 +1052,8 @@ module_cpu_feature_match(AES, aes_init);
module_init(aes_init);
EXPORT_SYMBOL(neon_aes_ecb_encrypt);
EXPORT_SYMBOL(neon_aes_cbc_encrypt);
+EXPORT_SYMBOL(neon_aes_ctr_encrypt);
+EXPORT_SYMBOL(neon_aes_xts_encrypt);
+EXPORT_SYMBOL(neon_aes_xts_decrypt);
#endif
module_exit(aes_exit);
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 2674d43d1384..5abc834271f4 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
*
* Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
/* included by aes-ce.S and aes-neon.S */
@@ -13,127 +10,66 @@
.text
.align 4
-/*
- * There are several ways to instantiate this code:
- * - no interleave, all inline
- * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
- * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
- * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
- * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
- *
- * Macros imported by this code:
- * - enc_prepare - setup NEON registers for encryption
- * - dec_prepare - setup NEON registers for decryption
- * - enc_switch_key - change to new key after having prepared for encryption
- * - encrypt_block - encrypt a single block
- * - decrypt block - decrypt a single block
- * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
- * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
- * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
- * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
- */
-
-#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
-#define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp
-#define FRAME_POP ldp x29, x30, [sp],#16
+#ifndef MAX_STRIDE
+#define MAX_STRIDE 4
+#endif
-#if INTERLEAVE == 2
+#if MAX_STRIDE == 4
+#define ST4(x...) x
+#define ST5(x...)
+#else
+#define ST4(x...)
+#define ST5(x...) x
+#endif
-aes_encrypt_block2x:
- encrypt_block2x v0, v1, w3, x2, x6, w7
+SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
+ encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret
-ENDPROC(aes_encrypt_block2x)
+SYM_FUNC_END(aes_encrypt_block4x)
-aes_decrypt_block2x:
- decrypt_block2x v0, v1, w3, x2, x6, w7
+SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
+ decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret
-ENDPROC(aes_decrypt_block2x)
-
-#elif INTERLEAVE == 4
+SYM_FUNC_END(aes_decrypt_block4x)
-aes_encrypt_block4x:
- encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
+#if MAX_STRIDE == 5
+SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
+ encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
ret
-ENDPROC(aes_encrypt_block4x)
+SYM_FUNC_END(aes_encrypt_block5x)
-aes_decrypt_block4x:
- decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
+SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
+ decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
ret
-ENDPROC(aes_decrypt_block4x)
-
-#else
-#error INTERLEAVE should equal 2 or 4
-#endif
-
- .macro do_encrypt_block2x
- bl aes_encrypt_block2x
- .endm
-
- .macro do_decrypt_block2x
- bl aes_decrypt_block2x
- .endm
-
- .macro do_encrypt_block4x
- bl aes_encrypt_block4x
- .endm
-
- .macro do_decrypt_block4x
- bl aes_decrypt_block4x
- .endm
-
-#else
-#define FRAME_PUSH
-#define FRAME_POP
-
- .macro do_encrypt_block2x
- encrypt_block2x v0, v1, w3, x2, x6, w7
- .endm
-
- .macro do_decrypt_block2x
- decrypt_block2x v0, v1, w3, x2, x6, w7
- .endm
-
- .macro do_encrypt_block4x
- encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
- .endm
-
- .macro do_decrypt_block4x
- decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
- .endm
-
+SYM_FUNC_END(aes_decrypt_block5x)
#endif
/*
* aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, int first)
+ * int blocks)
* aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, int first)
+ * int blocks)
*/
-AES_ENTRY(aes_ecb_encrypt)
- FRAME_PUSH
- cbz w5, .LecbencloopNx
+AES_FUNC_START(aes_ecb_encrypt)
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
enc_prepare w3, x2, x5
.LecbencloopNx:
-#if INTERLEAVE >= 2
- subs w4, w4, #INTERLEAVE
+ subs w4, w4, #MAX_STRIDE
bmi .Lecbenc1x
-#if INTERLEAVE == 2
- ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
- do_encrypt_block2x
- st1 {v0.16b-v1.16b}, [x0], #32
-#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
- do_encrypt_block4x
+ST4( bl aes_encrypt_block4x )
+ST5( ld1 {v4.16b}, [x1], #16 )
+ST5( bl aes_encrypt_block5x )
st1 {v0.16b-v3.16b}, [x0], #64
-#endif
+ST5( st1 {v4.16b}, [x0], #16 )
b .LecbencloopNx
.Lecbenc1x:
- adds w4, w4, #INTERLEAVE
+ adds w4, w4, #MAX_STRIDE
beq .Lecbencout
-#endif
.Lecbencloop:
ld1 {v0.16b}, [x1], #16 /* get next pt block */
encrypt_block v0, w3, x2, x5, w6
@@ -141,35 +77,30 @@ AES_ENTRY(aes_ecb_encrypt)
subs w4, w4, #1
bne .Lecbencloop
.Lecbencout:
- FRAME_POP
+ ldp x29, x30, [sp], #16
ret
-AES_ENDPROC(aes_ecb_encrypt)
+AES_FUNC_END(aes_ecb_encrypt)
-AES_ENTRY(aes_ecb_decrypt)
- FRAME_PUSH
- cbz w5, .LecbdecloopNx
+AES_FUNC_START(aes_ecb_decrypt)
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
dec_prepare w3, x2, x5
.LecbdecloopNx:
-#if INTERLEAVE >= 2
- subs w4, w4, #INTERLEAVE
+ subs w4, w4, #MAX_STRIDE
bmi .Lecbdec1x
-#if INTERLEAVE == 2
- ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
- do_decrypt_block2x
- st1 {v0.16b-v1.16b}, [x0], #32
-#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
- do_decrypt_block4x
+ST4( bl aes_decrypt_block4x )
+ST5( ld1 {v4.16b}, [x1], #16 )
+ST5( bl aes_decrypt_block5x )
st1 {v0.16b-v3.16b}, [x0], #64
-#endif
+ST5( st1 {v4.16b}, [x0], #16 )
b .LecbdecloopNx
.Lecbdec1x:
- adds w4, w4, #INTERLEAVE
+ adds w4, w4, #MAX_STRIDE
beq .Lecbdecout
-#endif
.Lecbdecloop:
ld1 {v0.16b}, [x1], #16 /* get next ct block */
decrypt_block v0, w3, x2, x5, w6
@@ -177,327 +108,665 @@ AES_ENTRY(aes_ecb_decrypt)
subs w4, w4, #1
bne .Lecbdecloop
.Lecbdecout:
- FRAME_POP
+ ldp x29, x30, [sp], #16
ret
-AES_ENDPROC(aes_ecb_decrypt)
+AES_FUNC_END(aes_ecb_decrypt)
/*
* aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, u8 iv[], int first)
+ * int blocks, u8 iv[])
* aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, u8 iv[], int first)
+ * int blocks, u8 iv[])
+ * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
+ * int rounds, int blocks, u8 iv[],
+ * u32 const rk2[]);
+ * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
+ * int rounds, int blocks, u8 iv[],
+ * u32 const rk2[]);
*/
-AES_ENTRY(aes_cbc_encrypt)
- cbz w6, .Lcbcencloop
+AES_FUNC_START(aes_essiv_cbc_encrypt)
+ ld1 {v4.16b}, [x5] /* get iv */
- ld1 {v0.16b}, [x5] /* get iv */
+ mov w8, #14 /* AES-256: 14 rounds */
+ enc_prepare w8, x6, x7
+ encrypt_block v4, w8, x6, x7, w9
+ enc_switch_key w3, x2, x6
+ b .Lcbcencloop4x
+
+AES_FUNC_START(aes_cbc_encrypt)
+ ld1 {v4.16b}, [x5] /* get iv */
enc_prepare w3, x2, x6
-.Lcbcencloop:
- ld1 {v1.16b}, [x1], #16 /* get next pt block */
- eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */
+.Lcbcencloop4x:
+ subs w4, w4, #4
+ bmi .Lcbcenc1x
+ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
+ eor v0.16b, v0.16b, v4.16b /* ..and xor with iv */
encrypt_block v0, w3, x2, x6, w7
- st1 {v0.16b}, [x0], #16
+ eor v1.16b, v1.16b, v0.16b
+ encrypt_block v1, w3, x2, x6, w7
+ eor v2.16b, v2.16b, v1.16b
+ encrypt_block v2, w3, x2, x6, w7
+ eor v3.16b, v3.16b, v2.16b
+ encrypt_block v3, w3, x2, x6, w7
+ st1 {v0.16b-v3.16b}, [x0], #64
+ mov v4.16b, v3.16b
+ b .Lcbcencloop4x
+.Lcbcenc1x:
+ adds w4, w4, #4
+ beq .Lcbcencout
+.Lcbcencloop:
+ ld1 {v0.16b}, [x1], #16 /* get next pt block */
+ eor v4.16b, v4.16b, v0.16b /* ..and xor with iv */
+ encrypt_block v4, w3, x2, x6, w7
+ st1 {v4.16b}, [x0], #16
subs w4, w4, #1
bne .Lcbcencloop
- st1 {v0.16b}, [x5] /* return iv */
+.Lcbcencout:
+ st1 {v4.16b}, [x5] /* return iv */
ret
-AES_ENDPROC(aes_cbc_encrypt)
+AES_FUNC_END(aes_cbc_encrypt)
+AES_FUNC_END(aes_essiv_cbc_encrypt)
+
+AES_FUNC_START(aes_essiv_cbc_decrypt)
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ ld1 {cbciv.16b}, [x5] /* get iv */
-AES_ENTRY(aes_cbc_decrypt)
- FRAME_PUSH
- cbz w6, .LcbcdecloopNx
+ mov w8, #14 /* AES-256: 14 rounds */
+ enc_prepare w8, x6, x7
+ encrypt_block cbciv, w8, x6, x7, w9
+ b .Lessivcbcdecstart
- ld1 {v7.16b}, [x5] /* get iv */
+AES_FUNC_START(aes_cbc_decrypt)
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ ld1 {cbciv.16b}, [x5] /* get iv */
+.Lessivcbcdecstart:
dec_prepare w3, x2, x6
.LcbcdecloopNx:
-#if INTERLEAVE >= 2
- subs w4, w4, #INTERLEAVE
+ subs w4, w4, #MAX_STRIDE
bmi .Lcbcdec1x
-#if INTERLEAVE == 2
- ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
- mov v2.16b, v0.16b
- mov v3.16b, v1.16b
- do_decrypt_block2x
- eor v0.16b, v0.16b, v7.16b
- eor v1.16b, v1.16b, v2.16b
- mov v7.16b, v3.16b
- st1 {v0.16b-v1.16b}, [x0], #32
-#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
+#if MAX_STRIDE == 5
+ ld1 {v4.16b}, [x1], #16 /* get 1 ct block */
+ mov v5.16b, v0.16b
+ mov v6.16b, v1.16b
+ mov v7.16b, v2.16b
+ bl aes_decrypt_block5x
+ sub x1, x1, #32
+ eor v0.16b, v0.16b, cbciv.16b
+ eor v1.16b, v1.16b, v5.16b
+ ld1 {v5.16b}, [x1], #16 /* reload 1 ct block */
+ ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */
+ eor v2.16b, v2.16b, v6.16b
+ eor v3.16b, v3.16b, v7.16b
+ eor v4.16b, v4.16b, v5.16b
+#else
mov v4.16b, v0.16b
mov v5.16b, v1.16b
mov v6.16b, v2.16b
- do_decrypt_block4x
+ bl aes_decrypt_block4x
sub x1, x1, #16
- eor v0.16b, v0.16b, v7.16b
+ eor v0.16b, v0.16b, cbciv.16b
eor v1.16b, v1.16b, v4.16b
- ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
+ ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */
eor v2.16b, v2.16b, v5.16b
eor v3.16b, v3.16b, v6.16b
- st1 {v0.16b-v3.16b}, [x0], #64
#endif
+ st1 {v0.16b-v3.16b}, [x0], #64
+ST5( st1 {v4.16b}, [x0], #16 )
b .LcbcdecloopNx
.Lcbcdec1x:
- adds w4, w4, #INTERLEAVE
+ adds w4, w4, #MAX_STRIDE
beq .Lcbcdecout
-#endif
.Lcbcdecloop:
ld1 {v1.16b}, [x1], #16 /* get next ct block */
mov v0.16b, v1.16b /* ...and copy to v0 */
decrypt_block v0, w3, x2, x6, w7
- eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
- mov v7.16b, v1.16b /* ct is next iv */
+ eor v0.16b, v0.16b, cbciv.16b /* xor with iv => pt */
+ mov cbciv.16b, v1.16b /* ct is next iv */
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
bne .Lcbcdecloop
.Lcbcdecout:
- FRAME_POP
- st1 {v7.16b}, [x5] /* return iv */
+ st1 {cbciv.16b}, [x5] /* return iv */
+ ldp x29, x30, [sp], #16
ret
-AES_ENDPROC(aes_cbc_decrypt)
+AES_FUNC_END(aes_cbc_decrypt)
+AES_FUNC_END(aes_essiv_cbc_decrypt)
/*
- * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, u8 ctr[], int first)
+ * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
+ * int rounds, int bytes, u8 const iv[])
+ * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
+ * int rounds, int bytes, u8 const iv[])
*/
-AES_ENTRY(aes_ctr_encrypt)
- FRAME_PUSH
- cbz w6, .Lctrnotfirst /* 1st time around? */
+AES_FUNC_START(aes_cbc_cts_encrypt)
+ adr_l x8, .Lcts_permute_table
+ sub x4, x4, #16
+ add x9, x8, #32
+ add x8, x8, x4
+ sub x9, x9, x4
+ ld1 {v3.16b}, [x8]
+ ld1 {v4.16b}, [x9]
+
+ ld1 {v0.16b}, [x1], x4 /* overlapping loads */
+ ld1 {v1.16b}, [x1]
+
+ ld1 {v5.16b}, [x5] /* get iv */
enc_prepare w3, x2, x6
- ld1 {v4.16b}, [x5]
-
-.Lctrnotfirst:
- umov x8, v4.d[1] /* keep swabbed ctr in reg */
- rev x8, x8
-#if INTERLEAVE >= 2
- cmn w8, w4 /* 32 bit overflow? */
- bcs .Lctrloop
-.LctrloopNx:
- subs w4, w4, #INTERLEAVE
- bmi .Lctr1x
-#if INTERLEAVE == 2
- mov v0.8b, v4.8b
- mov v1.8b, v4.8b
- rev x7, x8
- add x8, x8, #1
- ins v0.d[1], x7
- rev x7, x8
- add x8, x8, #1
- ins v1.d[1], x7
- ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */
- do_encrypt_block2x
- eor v0.16b, v0.16b, v2.16b
- eor v1.16b, v1.16b, v3.16b
- st1 {v0.16b-v1.16b}, [x0], #32
-#else
- ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */
- dup v7.4s, w8
- mov v0.16b, v4.16b
- add v7.4s, v7.4s, v8.4s
- mov v1.16b, v4.16b
- rev32 v8.16b, v7.16b
- mov v2.16b, v4.16b
- mov v3.16b, v4.16b
- mov v1.s[3], v8.s[0]
- mov v2.s[3], v8.s[1]
- mov v3.s[3], v8.s[2]
- ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
- do_encrypt_block4x
- eor v0.16b, v5.16b, v0.16b
- ld1 {v5.16b}, [x1], #16 /* get 1 input block */
- eor v1.16b, v6.16b, v1.16b
- eor v2.16b, v7.16b, v2.16b
- eor v3.16b, v5.16b, v3.16b
- st1 {v0.16b-v3.16b}, [x0], #64
- add x8, x8, #INTERLEAVE
-#endif
- rev x7, x8
- ins v4.d[1], x7
- cbz w4, .Lctrout
- b .LctrloopNx
-.Lctr1x:
- adds w4, w4, #INTERLEAVE
- beq .Lctrout
-#endif
-.Lctrloop:
- mov v0.16b, v4.16b
+
+ eor v0.16b, v0.16b, v5.16b /* xor with iv */
+ tbl v1.16b, {v1.16b}, v4.16b
encrypt_block v0, w3, x2, x6, w7
- adds x8, x8, #1 /* increment BE ctr */
- rev x7, x8
- ins v4.d[1], x7
- bcs .Lctrcarry /* overflow? */
+ eor v1.16b, v1.16b, v0.16b
+ tbl v0.16b, {v0.16b}, v3.16b
+ encrypt_block v1, w3, x2, x6, w7
-.Lctrcarrydone:
- subs w4, w4, #1
- bmi .Lctrtailblock /* blocks <0 means tail block */
- ld1 {v3.16b}, [x1], #16
- eor v3.16b, v0.16b, v3.16b
- st1 {v3.16b}, [x0], #16
- bne .Lctrloop
-
-.Lctrout:
- st1 {v4.16b}, [x5] /* return next CTR value */
- FRAME_POP
+ add x4, x0, x4
+ st1 {v0.16b}, [x4] /* overlapping stores */
+ st1 {v1.16b}, [x0]
ret
+AES_FUNC_END(aes_cbc_cts_encrypt)
+
+AES_FUNC_START(aes_cbc_cts_decrypt)
+ adr_l x8, .Lcts_permute_table
+ sub x4, x4, #16
+ add x9, x8, #32
+ add x8, x8, x4
+ sub x9, x9, x4
+ ld1 {v3.16b}, [x8]
+ ld1 {v4.16b}, [x9]
-.Lctrtailblock:
+ ld1 {v0.16b}, [x1], x4 /* overlapping loads */
+ ld1 {v1.16b}, [x1]
+
+ ld1 {v5.16b}, [x5] /* get iv */
+ dec_prepare w3, x2, x6
+
+ decrypt_block v0, w3, x2, x6, w7
+ tbl v2.16b, {v0.16b}, v3.16b
+ eor v2.16b, v2.16b, v1.16b
+
+ tbx v0.16b, {v1.16b}, v4.16b
+ decrypt_block v0, w3, x2, x6, w7
+ eor v0.16b, v0.16b, v5.16b /* xor with iv */
+
+ add x4, x0, x4
+ st1 {v2.16b}, [x4] /* overlapping stores */
st1 {v0.16b}, [x0]
- FRAME_POP
ret
+AES_FUNC_END(aes_cbc_cts_decrypt)
+
+ .section ".rodata", "a"
+ .align 6
+.Lcts_permute_table:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .previous
-.Lctrcarry:
- umov x7, v4.d[0] /* load upper word of ctr */
- rev x7, x7 /* ... to handle the carry */
- add x7, x7, #1
- rev x7, x7
- ins v4.d[0], x7
- b .Lctrcarrydone
-AES_ENDPROC(aes_ctr_encrypt)
- .ltorg
+ /*
+ * This macro generates the code for CTR and XCTR mode.
+ */
+.macro ctr_encrypt xctr
+ // Arguments
+ OUT .req x0
+ IN .req x1
+ KEY .req x2
+ ROUNDS_W .req w3
+ BYTES_W .req w4
+ IV .req x5
+ BYTE_CTR_W .req w6 // XCTR only
+ // Intermediate values
+ CTR_W .req w11 // XCTR only
+ CTR .req x11 // XCTR only
+ IV_PART .req x12
+ BLOCKS .req x13
+ BLOCKS_W .req w13
+
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ enc_prepare ROUNDS_W, KEY, IV_PART
+ ld1 {vctr.16b}, [IV]
+ /*
+ * Keep 64 bits of the IV in a register. For CTR mode this lets us
+ * easily increment the IV. For XCTR mode this lets us efficiently XOR
+ * the 64-bit counter with the IV.
+ */
+ .if \xctr
+ umov IV_PART, vctr.d[0]
+ lsr CTR_W, BYTE_CTR_W, #4
+ .else
+ umov IV_PART, vctr.d[1]
+ rev IV_PART, IV_PART
+ .endif
+
+.LctrloopNx\xctr:
+ add BLOCKS_W, BYTES_W, #15
+ sub BYTES_W, BYTES_W, #MAX_STRIDE << 4
+ lsr BLOCKS_W, BLOCKS_W, #4
+ mov w8, #MAX_STRIDE
+ cmp BLOCKS_W, w8
+ csel BLOCKS_W, BLOCKS_W, w8, lt
/*
+ * Set up the counter values in v0-v{MAX_STRIDE-1}.
+ *
+ * If we are encrypting less than MAX_STRIDE blocks, the tail block
+ * handling code expects the last keystream block to be in
+ * v{MAX_STRIDE-1}. For example: if encrypting two blocks with
+ * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
+ */
+ .if \xctr
+ add CTR, CTR, BLOCKS
+ .else
+ adds IV_PART, IV_PART, BLOCKS
+ .endif
+ mov v0.16b, vctr.16b
+ mov v1.16b, vctr.16b
+ mov v2.16b, vctr.16b
+ mov v3.16b, vctr.16b
+ST5( mov v4.16b, vctr.16b )
+ .if \xctr
+ sub x6, CTR, #MAX_STRIDE - 1
+ sub x7, CTR, #MAX_STRIDE - 2
+ sub x8, CTR, #MAX_STRIDE - 3
+ sub x9, CTR, #MAX_STRIDE - 4
+ST5( sub x10, CTR, #MAX_STRIDE - 5 )
+ eor x6, x6, IV_PART
+ eor x7, x7, IV_PART
+ eor x8, x8, IV_PART
+ eor x9, x9, IV_PART
+ST5( eor x10, x10, IV_PART )
+ mov v0.d[0], x6
+ mov v1.d[0], x7
+ mov v2.d[0], x8
+ mov v3.d[0], x9
+ST5( mov v4.d[0], x10 )
+ .else
+ bcs 0f
+ .subsection 1
+ /*
+ * This subsection handles carries.
+ *
+ * Conditional branching here is allowed with respect to time
+ * invariance since the branches are dependent on the IV instead
+ * of the plaintext or key. This code is rarely executed in
+ * practice anyway.
+ */
+
+ /* Apply carry to outgoing counter. */
+0: umov x8, vctr.d[0]
+ rev x8, x8
+ add x8, x8, #1
+ rev x8, x8
+ ins vctr.d[0], x8
+
+ /*
+ * Apply carry to counter blocks if needed.
+ *
+ * Since the carry flag was set, we know 0 <= IV_PART <
+ * MAX_STRIDE. Using the value of IV_PART we can determine how
+ * many counter blocks need to be updated.
+ */
+ cbz IV_PART, 2f
+ adr x16, 1f
+ sub x16, x16, IV_PART, lsl #3
+ br x16
+ bti c
+ mov v0.d[0], vctr.d[0]
+ bti c
+ mov v1.d[0], vctr.d[0]
+ bti c
+ mov v2.d[0], vctr.d[0]
+ bti c
+ mov v3.d[0], vctr.d[0]
+ST5( bti c )
+ST5( mov v4.d[0], vctr.d[0] )
+1: b 2f
+ .previous
+
+2: rev x7, IV_PART
+ ins vctr.d[1], x7
+ sub x7, IV_PART, #MAX_STRIDE - 1
+ sub x8, IV_PART, #MAX_STRIDE - 2
+ sub x9, IV_PART, #MAX_STRIDE - 3
+ rev x7, x7
+ rev x8, x8
+ mov v1.d[1], x7
+ rev x9, x9
+ST5( sub x10, IV_PART, #MAX_STRIDE - 4 )
+ mov v2.d[1], x8
+ST5( rev x10, x10 )
+ mov v3.d[1], x9
+ST5( mov v4.d[1], x10 )
+ .endif
+
+ /*
+ * If there are at least MAX_STRIDE blocks left, XOR the data with
+ * keystream and store. Otherwise jump to tail handling.
+ */
+ tbnz BYTES_W, #31, .Lctrtail\xctr
+ ld1 {v5.16b-v7.16b}, [IN], #48
+ST4( bl aes_encrypt_block4x )
+ST5( bl aes_encrypt_block5x )
+ eor v0.16b, v5.16b, v0.16b
+ST4( ld1 {v5.16b}, [IN], #16 )
+ eor v1.16b, v6.16b, v1.16b
+ST5( ld1 {v5.16b-v6.16b}, [IN], #32 )
+ eor v2.16b, v7.16b, v2.16b
+ eor v3.16b, v5.16b, v3.16b
+ST5( eor v4.16b, v6.16b, v4.16b )
+ st1 {v0.16b-v3.16b}, [OUT], #64
+ST5( st1 {v4.16b}, [OUT], #16 )
+ cbz BYTES_W, .Lctrout\xctr
+ b .LctrloopNx\xctr
+
+.Lctrout\xctr:
+ .if !\xctr
+ st1 {vctr.16b}, [IV] /* return next CTR value */
+ .endif
+ ldp x29, x30, [sp], #16
+ ret
+
+.Lctrtail\xctr:
+ /*
+ * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
+ *
+ * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
+ * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
+ * v4 should have the next two counter blocks.
+ *
+ * This allows us to store the ciphertext by writing to overlapping
+ * regions of memory. Any invalid ciphertext blocks get overwritten by
+ * correctly computed blocks. This approach greatly simplifies the
+ * logic for storing the ciphertext.
+ */
+ mov x16, #16
+ ands w7, BYTES_W, #0xf
+ csel x13, x7, x16, ne
+
+ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4))
+ST5( csel x14, x16, xzr, gt )
+ cmp BYTES_W, #48 - (MAX_STRIDE << 4)
+ csel x15, x16, xzr, gt
+ cmp BYTES_W, #32 - (MAX_STRIDE << 4)
+ csel x16, x16, xzr, gt
+ cmp BYTES_W, #16 - (MAX_STRIDE << 4)
+
+ adr_l x9, .Lcts_permute_table
+ add x9, x9, x13
+ ble .Lctrtail1x\xctr
+
+ST5( ld1 {v5.16b}, [IN], x14 )
+ ld1 {v6.16b}, [IN], x15
+ ld1 {v7.16b}, [IN], x16
+
+ST4( bl aes_encrypt_block4x )
+ST5( bl aes_encrypt_block5x )
+
+ ld1 {v8.16b}, [IN], x13
+ ld1 {v9.16b}, [IN]
+ ld1 {v10.16b}, [x9]
+
+ST4( eor v6.16b, v6.16b, v0.16b )
+ST4( eor v7.16b, v7.16b, v1.16b )
+ST4( tbl v3.16b, {v3.16b}, v10.16b )
+ST4( eor v8.16b, v8.16b, v2.16b )
+ST4( eor v9.16b, v9.16b, v3.16b )
+
+ST5( eor v5.16b, v5.16b, v0.16b )
+ST5( eor v6.16b, v6.16b, v1.16b )
+ST5( tbl v4.16b, {v4.16b}, v10.16b )
+ST5( eor v7.16b, v7.16b, v2.16b )
+ST5( eor v8.16b, v8.16b, v3.16b )
+ST5( eor v9.16b, v9.16b, v4.16b )
+
+ST5( st1 {v5.16b}, [OUT], x14 )
+ st1 {v6.16b}, [OUT], x15
+ st1 {v7.16b}, [OUT], x16
+ add x13, x13, OUT
+ st1 {v9.16b}, [x13] // overlapping stores
+ st1 {v8.16b}, [OUT]
+ b .Lctrout\xctr
+
+.Lctrtail1x\xctr:
+ /*
+ * Handle <= 16 bytes of plaintext
+ *
+ * This code always reads and writes 16 bytes. To avoid out of bounds
+ * accesses, XCTR and CTR modes must use a temporary buffer when
+ * encrypting/decrypting less than 16 bytes.
+ *
+ * This code is unusual in that it loads the input and stores the output
+ * relative to the end of the buffers rather than relative to the start.
+ * This causes unusual behaviour when encrypting/decrypting less than 16
+ * bytes; the end of the data is expected to be at the end of the
+ * temporary buffer rather than the start of the data being at the start
+ * of the temporary buffer.
+ */
+ sub x8, x7, #16
+ csel x7, x7, x8, eq
+ add IN, IN, x7
+ add OUT, OUT, x7
+ ld1 {v5.16b}, [IN]
+ ld1 {v6.16b}, [OUT]
+ST5( mov v3.16b, v4.16b )
+ encrypt_block v3, ROUNDS_W, KEY, x8, w7
+ ld1 {v10.16b-v11.16b}, [x9]
+ tbl v3.16b, {v3.16b}, v10.16b
+ sshr v11.16b, v11.16b, #7
+ eor v5.16b, v5.16b, v3.16b
+ bif v5.16b, v6.16b, v11.16b
+ st1 {v5.16b}, [OUT]
+ b .Lctrout\xctr
+
+ // Arguments
+ .unreq OUT
+ .unreq IN
+ .unreq KEY
+ .unreq ROUNDS_W
+ .unreq BYTES_W
+ .unreq IV
+ .unreq BYTE_CTR_W // XCTR only
+ // Intermediate values
+ .unreq CTR_W // XCTR only
+ .unreq CTR // XCTR only
+ .unreq IV_PART
+ .unreq BLOCKS
+ .unreq BLOCKS_W
+.endm
+
+ /*
+ * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int bytes, u8 ctr[])
+ *
+ * The input and output buffers must always be at least 16 bytes even if
+ * encrypting/decrypting less than 16 bytes. Otherwise out of bounds
+ * accesses will occur. The data to be encrypted/decrypted is expected
+ * to be at the end of this 16-byte temporary buffer rather than the
+ * start.
+ */
+
+AES_FUNC_START(aes_ctr_encrypt)
+ ctr_encrypt 0
+AES_FUNC_END(aes_ctr_encrypt)
+
+ /*
+ * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int bytes, u8 const iv[], int byte_ctr)
+ *
+ * The input and output buffers must always be at least 16 bytes even if
+ * encrypting/decrypting less than 16 bytes. Otherwise out of bounds
+ * accesses will occur. The data to be encrypted/decrypted is expected
+ * to be at the end of this 16-byte temporary buffer rather than the
+ * start.
+ */
+
+AES_FUNC_START(aes_xctr_encrypt)
+ ctr_encrypt 1
+AES_FUNC_END(aes_xctr_encrypt)
+
+
+ /*
+ * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
+ * int bytes, u8 const rk2[], u8 iv[], int first)
* aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
- * int blocks, u8 const rk2[], u8 iv[], int first)
- * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
- * int blocks, u8 const rk2[], u8 iv[], int first)
+ * int bytes, u8 const rk2[], u8 iv[], int first)
*/
- .macro next_tweak, out, in, const, tmp
+ .macro next_tweak, out, in, tmp
sshr \tmp\().2d, \in\().2d, #63
- and \tmp\().16b, \tmp\().16b, \const\().16b
+ and \tmp\().16b, \tmp\().16b, xtsmask.16b
add \out\().2d, \in\().2d, \in\().2d
ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
eor \out\().16b, \out\().16b, \tmp\().16b
.endm
-.Lxts_mul_x:
-CPU_LE( .quad 1, 0x87 )
-CPU_BE( .quad 0x87, 1 )
+ .macro xts_load_mask, tmp
+ movi xtsmask.2s, #0x1
+ movi \tmp\().2s, #0x87
+ uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s
+ .endm
-AES_ENTRY(aes_xts_encrypt)
- FRAME_PUSH
- cbz w7, .LxtsencloopNx
+AES_FUNC_START(aes_xts_encrypt)
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
ld1 {v4.16b}, [x6]
- enc_prepare w3, x5, x6
- encrypt_block v4, w3, x5, x6, w7 /* first tweak */
- enc_switch_key w3, x2, x6
- ldr q7, .Lxts_mul_x
+ xts_load_mask v8
+ cbz w7, .Lxtsencnotfirst
+
+ enc_prepare w3, x5, x8
+ xts_cts_skip_tw w7, .LxtsencNx
+ encrypt_block v4, w3, x5, x8, w7 /* first tweak */
+ enc_switch_key w3, x2, x8
b .LxtsencNx
+.Lxtsencnotfirst:
+ enc_prepare w3, x2, x8
.LxtsencloopNx:
- ldr q7, .Lxts_mul_x
- next_tweak v4, v4, v7, v8
+ next_tweak v4, v4, v8
.LxtsencNx:
-#if INTERLEAVE >= 2
- subs w4, w4, #INTERLEAVE
+ subs w4, w4, #64
bmi .Lxtsenc1x
-#if INTERLEAVE == 2
- ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
- next_tweak v5, v4, v7, v8
- eor v0.16b, v0.16b, v4.16b
- eor v1.16b, v1.16b, v5.16b
- do_encrypt_block2x
- eor v0.16b, v0.16b, v4.16b
- eor v1.16b, v1.16b, v5.16b
- st1 {v0.16b-v1.16b}, [x0], #32
- cbz w4, .LxtsencoutNx
- next_tweak v4, v5, v7, v8
- b .LxtsencNx
-.LxtsencoutNx:
- mov v4.16b, v5.16b
- b .Lxtsencout
-#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
- next_tweak v5, v4, v7, v8
+ next_tweak v5, v4, v8
eor v0.16b, v0.16b, v4.16b
- next_tweak v6, v5, v7, v8
+ next_tweak v6, v5, v8
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
- next_tweak v7, v6, v7, v8
+ next_tweak v7, v6, v8
eor v3.16b, v3.16b, v7.16b
- do_encrypt_block4x
+ bl aes_encrypt_block4x
eor v3.16b, v3.16b, v7.16b
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v7.16b
- cbz w4, .Lxtsencout
+ cbz w4, .Lxtsencret
+ xts_reload_mask v8
b .LxtsencloopNx
-#endif
.Lxtsenc1x:
- adds w4, w4, #INTERLEAVE
+ adds w4, w4, #64
beq .Lxtsencout
-#endif
+ subs w4, w4, #16
+ bmi .LxtsencctsNx
.Lxtsencloop:
- ld1 {v1.16b}, [x1], #16
- eor v0.16b, v1.16b, v4.16b
- encrypt_block v0, w3, x2, x6, w7
+ ld1 {v0.16b}, [x1], #16
+.Lxtsencctsout:
+ eor v0.16b, v0.16b, v4.16b
+ encrypt_block v0, w3, x2, x8, w7
eor v0.16b, v0.16b, v4.16b
+ cbz w4, .Lxtsencout
+ subs w4, w4, #16
+ next_tweak v4, v4, v8
+ bmi .Lxtsenccts
st1 {v0.16b}, [x0], #16
- subs w4, w4, #1
- beq .Lxtsencout
- next_tweak v4, v4, v7, v8
b .Lxtsencloop
.Lxtsencout:
- FRAME_POP
+ st1 {v0.16b}, [x0]
+.Lxtsencret:
+ st1 {v4.16b}, [x6]
+ ldp x29, x30, [sp], #16
ret
-AES_ENDPROC(aes_xts_encrypt)
-
-AES_ENTRY(aes_xts_decrypt)
- FRAME_PUSH
- cbz w7, .LxtsdecloopNx
+.LxtsencctsNx:
+ mov v0.16b, v3.16b
+ sub x0, x0, #16
+.Lxtsenccts:
+ adr_l x8, .Lcts_permute_table
+
+ add x1, x1, w4, sxtw /* rewind input pointer */
+ add w4, w4, #16 /* # bytes in final block */
+ add x9, x8, #32
+ add x8, x8, x4
+ sub x9, x9, x4
+ add x4, x0, x4 /* output address of final block */
+
+ ld1 {v1.16b}, [x1] /* load final block */
+ ld1 {v2.16b}, [x8]
+ ld1 {v3.16b}, [x9]
+
+ tbl v2.16b, {v0.16b}, v2.16b
+ tbx v0.16b, {v1.16b}, v3.16b
+ st1 {v2.16b}, [x4] /* overlapping stores */
+ mov w4, wzr
+ b .Lxtsencctsout
+AES_FUNC_END(aes_xts_encrypt)
+
+AES_FUNC_START(aes_xts_decrypt)
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ /* subtract 16 bytes if we are doing CTS */
+ sub w8, w4, #0x10
+ tst w4, #0xf
+ csel w4, w4, w8, eq
ld1 {v4.16b}, [x6]
- enc_prepare w3, x5, x6
- encrypt_block v4, w3, x5, x6, w7 /* first tweak */
- dec_prepare w3, x2, x6
- ldr q7, .Lxts_mul_x
+ xts_load_mask v8
+ xts_cts_skip_tw w7, .Lxtsdecskiptw
+ cbz w7, .Lxtsdecnotfirst
+
+ enc_prepare w3, x5, x8
+ encrypt_block v4, w3, x5, x8, w7 /* first tweak */
+.Lxtsdecskiptw:
+ dec_prepare w3, x2, x8
b .LxtsdecNx
+.Lxtsdecnotfirst:
+ dec_prepare w3, x2, x8
.LxtsdecloopNx:
- ldr q7, .Lxts_mul_x
- next_tweak v4, v4, v7, v8
+ next_tweak v4, v4, v8
.LxtsdecNx:
-#if INTERLEAVE >= 2
- subs w4, w4, #INTERLEAVE
+ subs w4, w4, #64
bmi .Lxtsdec1x
-#if INTERLEAVE == 2
- ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
- next_tweak v5, v4, v7, v8
- eor v0.16b, v0.16b, v4.16b
- eor v1.16b, v1.16b, v5.16b
- do_decrypt_block2x
- eor v0.16b, v0.16b, v4.16b
- eor v1.16b, v1.16b, v5.16b
- st1 {v0.16b-v1.16b}, [x0], #32
- cbz w4, .LxtsdecoutNx
- next_tweak v4, v5, v7, v8
- b .LxtsdecNx
-.LxtsdecoutNx:
- mov v4.16b, v5.16b
- b .Lxtsdecout
-#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
- next_tweak v5, v4, v7, v8
+ next_tweak v5, v4, v8
eor v0.16b, v0.16b, v4.16b
- next_tweak v6, v5, v7, v8
+ next_tweak v6, v5, v8
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
- next_tweak v7, v6, v7, v8
+ next_tweak v7, v6, v8
eor v3.16b, v3.16b, v7.16b
- do_decrypt_block4x
+ bl aes_decrypt_block4x
eor v3.16b, v3.16b, v7.16b
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
@@ -505,36 +774,88 @@ AES_ENTRY(aes_xts_decrypt)
st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v7.16b
cbz w4, .Lxtsdecout
+ xts_reload_mask v8
b .LxtsdecloopNx
-#endif
.Lxtsdec1x:
- adds w4, w4, #INTERLEAVE
+ adds w4, w4, #64
beq .Lxtsdecout
-#endif
+ subs w4, w4, #16
.Lxtsdecloop:
- ld1 {v1.16b}, [x1], #16
- eor v0.16b, v1.16b, v4.16b
- decrypt_block v0, w3, x2, x6, w7
+ ld1 {v0.16b}, [x1], #16
+ bmi .Lxtsdeccts
+.Lxtsdecctsout:
+ eor v0.16b, v0.16b, v4.16b
+ decrypt_block v0, w3, x2, x8, w7
eor v0.16b, v0.16b, v4.16b
st1 {v0.16b}, [x0], #16
- subs w4, w4, #1
- beq .Lxtsdecout
- next_tweak v4, v4, v7, v8
+ cbz w4, .Lxtsdecout
+ subs w4, w4, #16
+ next_tweak v4, v4, v8
b .Lxtsdecloop
.Lxtsdecout:
- FRAME_POP
+ st1 {v4.16b}, [x6]
+ ldp x29, x30, [sp], #16
ret
-AES_ENDPROC(aes_xts_decrypt)
+
+.Lxtsdeccts:
+ adr_l x8, .Lcts_permute_table
+
+ add x1, x1, w4, sxtw /* rewind input pointer */
+ add w4, w4, #16 /* # bytes in final block */
+ add x9, x8, #32
+ add x8, x8, x4
+ sub x9, x9, x4
+ add x4, x0, x4 /* output address of final block */
+
+ next_tweak v5, v4, v8
+
+ ld1 {v1.16b}, [x1] /* load final block */
+ ld1 {v2.16b}, [x8]
+ ld1 {v3.16b}, [x9]
+
+ eor v0.16b, v0.16b, v5.16b
+ decrypt_block v0, w3, x2, x8, w7
+ eor v0.16b, v0.16b, v5.16b
+
+ tbl v2.16b, {v0.16b}, v2.16b
+ tbx v0.16b, {v1.16b}, v3.16b
+
+ st1 {v2.16b}, [x4] /* overlapping stores */
+ mov w4, wzr
+ b .Lxtsdecctsout
+AES_FUNC_END(aes_xts_decrypt)
/*
* aes_mac_update(u8 const in[], u32 const rk[], int rounds,
* int blocks, u8 dg[], int enc_before, int enc_after)
*/
-AES_ENTRY(aes_mac_update)
+AES_FUNC_START(aes_mac_update)
ld1 {v0.16b}, [x4] /* get dg */
enc_prepare w2, x1, x7
- cbnz w5, .Lmacenc
+ cbz w5, .Lmacloop4x
+
+ encrypt_block v0, w2, x1, x7, w8
+.Lmacloop4x:
+ subs w3, w3, #4
+ bmi .Lmac1x
+ ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */
+ eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */
+ encrypt_block v0, w2, x1, x7, w8
+ eor v0.16b, v0.16b, v2.16b
+ encrypt_block v0, w2, x1, x7, w8
+ eor v0.16b, v0.16b, v3.16b
+ encrypt_block v0, w2, x1, x7, w8
+ eor v0.16b, v0.16b, v4.16b
+ cmp w3, wzr
+ csinv x5, x6, xzr, eq
+ cbz w5, .Lmacout
+ encrypt_block v0, w2, x1, x7, w8
+ st1 {v0.16b}, [x4] /* return dg */
+ cond_yield .Lmacout, x7, x8
+ b .Lmacloop4x
+.Lmac1x:
+ add w3, w3, #4
.Lmacloop:
cbz w3, .Lmacout
ld1 {v1.16b}, [x0], #16 /* get next pt block */
@@ -550,5 +871,6 @@ AES_ENTRY(aes_mac_update)
.Lmacout:
st1 {v0.16b}, [x4] /* return dg */
+ mov w0, w3
ret
-AES_ENDPROC(aes_mac_update)
+AES_FUNC_END(aes_mac_update)
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
index f1e3aa2732f9..9de7fbc797af 100644
--- a/arch/arm64/crypto/aes-neon.S
+++ b/arch/arm64/crypto/aes-neon.S
@@ -1,18 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
*
* Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
-#define AES_ENTRY(func) ENTRY(neon_ ## func)
-#define AES_ENDPROC(func) ENDPROC(neon_ ## func)
+#define AES_FUNC_START(func) SYM_FUNC_START(neon_ ## func)
+#define AES_FUNC_END(func) SYM_FUNC_END(neon_ ## func)
+
+ xtsmask .req v7
+ cbciv .req v7
+ vctr .req v4
+
+ .macro xts_reload_mask, tmp
+ xts_load_mask \tmp
+ .endm
+
+ /* special case for the neon-bs driver calling into this one for CTS */
+ .macro xts_cts_skip_tw, reg, lbl
+ tbnz \reg, #1, \lbl
+ .endm
/* multiply by polynomial 'x' in GF(2^8) */
.macro mul_by_x, out, in, temp, const
@@ -32,10 +42,10 @@
/* preload the entire Sbox */
.macro prepare, sbox, shiftrows, temp
- adr \temp, \sbox
movi v12.16b, #0x1b
- ldr q13, \shiftrows
- ldr q14, .Lror32by8
+ ldr_l q13, \shiftrows, \temp
+ ldr_l q14, .Lror32by8, \temp
+ adr_l \temp, \sbox
ld1 {v16.16b-v19.16b}, [\temp], #64
ld1 {v20.16b-v23.16b}, [\temp], #64
ld1 {v24.16b-v27.16b}, [\temp], #64
@@ -44,7 +54,7 @@
/* do preload for encryption */
.macro enc_prepare, ignore0, ignore1, temp
- prepare .LForward_Sbox, .LForward_ShiftRows, \temp
+ prepare crypto_aes_sbox, .LForward_ShiftRows, \temp
.endm
.macro enc_switch_key, ignore0, ignore1, temp
@@ -53,10 +63,10 @@
/* do preload for decryption */
.macro dec_prepare, ignore0, ignore1, temp
- prepare .LReverse_Sbox, .LReverse_ShiftRows, \temp
+ prepare crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp
.endm
- /* apply SubBytes transformation using the the preloaded Sbox */
+ /* apply SubBytes transformation using the preloaded Sbox */
.macro sub_bytes, in
sub v9.16b, \in\().16b, v15.16b
tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b
@@ -111,26 +121,9 @@
/*
* Interleaved versions: functionally equivalent to the
- * ones above, but applied to 2 or 4 AES states in parallel.
+ * ones above, but applied to AES states in parallel.
*/
- .macro sub_bytes_2x, in0, in1
- sub v8.16b, \in0\().16b, v15.16b
- tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
- sub v9.16b, \in1\().16b, v15.16b
- tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
- sub v10.16b, v8.16b, v15.16b
- tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
- sub v11.16b, v9.16b, v15.16b
- tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
- sub v8.16b, v10.16b, v15.16b
- tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b
- sub v9.16b, v11.16b, v15.16b
- tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b
- tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
- tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
- .endm
-
.macro sub_bytes_4x, in0, in1, in2, in3
sub v8.16b, \in0\().16b, v15.16b
tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
@@ -209,25 +202,6 @@
eor \in1\().16b, \in1\().16b, v11.16b
.endm
- .macro do_block_2x, enc, in0, in1, rounds, rk, rkp, i
- ld1 {v15.4s}, [\rk]
- add \rkp, \rk, #16
- mov \i, \rounds
-1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
- eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
- movi v15.16b, #0x40
- tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
- tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
- sub_bytes_2x \in0, \in1
- subs \i, \i, #1
- ld1 {v15.4s}, [\rkp], #16
- beq 2222f
- mix_columns_2x \in0, \in1, \enc
- b 1111b
-2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
- eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
- .endm
-
.macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
ld1 {v15.4s}, [\rk]
add \rkp, \rk, #16
@@ -254,14 +228,6 @@
eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */
.endm
- .macro encrypt_block2x, in0, in1, rounds, rk, rkp, i
- do_block_2x 1, \in0, \in1, \rounds, \rk, \rkp, \i
- .endm
-
- .macro decrypt_block2x, in0, in1, rounds, rk, rkp, i
- do_block_2x 0, \in0, \in1, \rounds, \rk, \rkp, \i
- .endm
-
.macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
.endm
@@ -272,76 +238,8 @@
#include "aes-modes.S"
- .text
- .align 6
-.LForward_Sbox:
- .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
- .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
- .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
- .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
- .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
- .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
- .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
- .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
- .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
- .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
- .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
- .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
- .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
- .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
- .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
- .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
- .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
- .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
- .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
- .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
- .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
- .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
- .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
- .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
- .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
- .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
- .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
- .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
- .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
- .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
- .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
- .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
-
-.LReverse_Sbox:
- .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
- .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
- .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
- .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
- .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
- .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
- .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
- .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
- .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
- .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
- .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
- .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
- .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
- .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
- .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
- .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
- .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
- .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
- .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
- .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
- .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
- .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
- .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
- .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
- .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
- .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
- .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
- .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
- .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
- .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
- .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
- .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
-
+ .section ".rodata", "a"
+ .align 4
.LForward_ShiftRows:
.octa 0x0b06010c07020d08030e09040f0a0500
diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S
index ca0472500433..d427f4556b6e 100644
--- a/arch/arm64/crypto/aes-neonbs-core.S
+++ b/arch/arm64/crypto/aes-neonbs-core.S
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Bit sliced AES using NEON instructions
*
* Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
/*
@@ -383,7 +380,7 @@ ISRM0: .octa 0x0306090c00070a0d01040b0e0205080f
/*
* void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
*/
-ENTRY(aesbs_convert_key)
+SYM_FUNC_START(aesbs_convert_key)
ld1 {v7.4s}, [x1], #16 // load round 0 key
ld1 {v17.4s}, [x1], #16 // load round 1 key
@@ -428,10 +425,10 @@ ENTRY(aesbs_convert_key)
eor v17.16b, v17.16b, v7.16b
str q17, [x0]
ret
-ENDPROC(aesbs_convert_key)
+SYM_FUNC_END(aesbs_convert_key)
.align 4
-aesbs_encrypt8:
+SYM_FUNC_START_LOCAL(aesbs_encrypt8)
ldr q9, [bskey], #16 // round 0 key
ldr q8, M0SR
ldr q24, SR
@@ -491,10 +488,10 @@ aesbs_encrypt8:
eor v2.16b, v2.16b, v12.16b
eor v5.16b, v5.16b, v12.16b
ret
-ENDPROC(aesbs_encrypt8)
+SYM_FUNC_END(aesbs_encrypt8)
.align 4
-aesbs_decrypt8:
+SYM_FUNC_START_LOCAL(aesbs_decrypt8)
lsl x9, rounds, #7
add bskey, bskey, x9
@@ -556,7 +553,7 @@ aesbs_decrypt8:
eor v3.16b, v3.16b, v12.16b
eor v5.16b, v5.16b, v12.16b
ret
-ENDPROC(aesbs_decrypt8)
+SYM_FUNC_END(aesbs_decrypt8)
/*
* aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
@@ -565,110 +562,122 @@ ENDPROC(aesbs_decrypt8)
* int blocks)
*/
.macro __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
- stp x29, x30, [sp, #-16]!
- mov x29, sp
+ frame_push 5
+
+ mov x19, x0
+ mov x20, x1
+ mov x21, x2
+ mov x22, x3
+ mov x23, x4
99: mov x5, #1
- lsl x5, x5, x4
- subs w4, w4, #8
- csel x4, x4, xzr, pl
+ lsl x5, x5, x23
+ subs w23, w23, #8
+ csel x23, x23, xzr, pl
csel x5, x5, xzr, mi
- ld1 {v0.16b}, [x1], #16
+ ld1 {v0.16b}, [x20], #16
tbnz x5, #1, 0f
- ld1 {v1.16b}, [x1], #16
+ ld1 {v1.16b}, [x20], #16
tbnz x5, #2, 0f
- ld1 {v2.16b}, [x1], #16
+ ld1 {v2.16b}, [x20], #16
tbnz x5, #3, 0f
- ld1 {v3.16b}, [x1], #16
+ ld1 {v3.16b}, [x20], #16
tbnz x5, #4, 0f
- ld1 {v4.16b}, [x1], #16
+ ld1 {v4.16b}, [x20], #16
tbnz x5, #5, 0f
- ld1 {v5.16b}, [x1], #16
+ ld1 {v5.16b}, [x20], #16
tbnz x5, #6, 0f
- ld1 {v6.16b}, [x1], #16
+ ld1 {v6.16b}, [x20], #16
tbnz x5, #7, 0f
- ld1 {v7.16b}, [x1], #16
+ ld1 {v7.16b}, [x20], #16
-0: mov bskey, x2
- mov rounds, x3
+0: mov bskey, x21
+ mov rounds, x22
bl \do8
- st1 {\o0\().16b}, [x0], #16
+ st1 {\o0\().16b}, [x19], #16
tbnz x5, #1, 1f
- st1 {\o1\().16b}, [x0], #16
+ st1 {\o1\().16b}, [x19], #16
tbnz x5, #2, 1f
- st1 {\o2\().16b}, [x0], #16
+ st1 {\o2\().16b}, [x19], #16
tbnz x5, #3, 1f
- st1 {\o3\().16b}, [x0], #16
+ st1 {\o3\().16b}, [x19], #16
tbnz x5, #4, 1f
- st1 {\o4\().16b}, [x0], #16
+ st1 {\o4\().16b}, [x19], #16
tbnz x5, #5, 1f
- st1 {\o5\().16b}, [x0], #16
+ st1 {\o5\().16b}, [x19], #16
tbnz x5, #6, 1f
- st1 {\o6\().16b}, [x0], #16
+ st1 {\o6\().16b}, [x19], #16
tbnz x5, #7, 1f
- st1 {\o7\().16b}, [x0], #16
+ st1 {\o7\().16b}, [x19], #16
- cbnz x4, 99b
+ cbz x23, 1f
+ b 99b
-1: ldp x29, x30, [sp], #16
+1: frame_pop
ret
.endm
.align 4
-ENTRY(aesbs_ecb_encrypt)
+SYM_FUNC_START(aesbs_ecb_encrypt)
__ecb_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
-ENDPROC(aesbs_ecb_encrypt)
+SYM_FUNC_END(aesbs_ecb_encrypt)
.align 4
-ENTRY(aesbs_ecb_decrypt)
+SYM_FUNC_START(aesbs_ecb_decrypt)
__ecb_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
-ENDPROC(aesbs_ecb_decrypt)
+SYM_FUNC_END(aesbs_ecb_decrypt)
/*
* aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[])
*/
.align 4
-ENTRY(aesbs_cbc_decrypt)
- stp x29, x30, [sp, #-16]!
- mov x29, sp
+SYM_FUNC_START(aesbs_cbc_decrypt)
+ frame_push 6
+
+ mov x19, x0
+ mov x20, x1
+ mov x21, x2
+ mov x22, x3
+ mov x23, x4
+ mov x24, x5
99: mov x6, #1
- lsl x6, x6, x4
- subs w4, w4, #8
- csel x4, x4, xzr, pl
+ lsl x6, x6, x23
+ subs w23, w23, #8
+ csel x23, x23, xzr, pl
csel x6, x6, xzr, mi
- ld1 {v0.16b}, [x1], #16
+ ld1 {v0.16b}, [x20], #16
mov v25.16b, v0.16b
tbnz x6, #1, 0f
- ld1 {v1.16b}, [x1], #16
+ ld1 {v1.16b}, [x20], #16
mov v26.16b, v1.16b
tbnz x6, #2, 0f
- ld1 {v2.16b}, [x1], #16
+ ld1 {v2.16b}, [x20], #16
mov v27.16b, v2.16b
tbnz x6, #3, 0f
- ld1 {v3.16b}, [x1], #16
+ ld1 {v3.16b}, [x20], #16
mov v28.16b, v3.16b
tbnz x6, #4, 0f
- ld1 {v4.16b}, [x1], #16
+ ld1 {v4.16b}, [x20], #16
mov v29.16b, v4.16b
tbnz x6, #5, 0f
- ld1 {v5.16b}, [x1], #16
+ ld1 {v5.16b}, [x20], #16
mov v30.16b, v5.16b
tbnz x6, #6, 0f
- ld1 {v6.16b}, [x1], #16
+ ld1 {v6.16b}, [x20], #16
mov v31.16b, v6.16b
tbnz x6, #7, 0f
- ld1 {v7.16b}, [x1]
+ ld1 {v7.16b}, [x20]
-0: mov bskey, x2
- mov rounds, x3
+0: mov bskey, x21
+ mov rounds, x22
bl aesbs_decrypt8
- ld1 {v24.16b}, [x5] // load IV
+ ld1 {v24.16b}, [x24] // load IV
eor v1.16b, v1.16b, v25.16b
eor v6.16b, v6.16b, v26.16b
@@ -679,36 +688,37 @@ ENTRY(aesbs_cbc_decrypt)
eor v3.16b, v3.16b, v30.16b
eor v5.16b, v5.16b, v31.16b
- st1 {v0.16b}, [x0], #16
+ st1 {v0.16b}, [x19], #16
mov v24.16b, v25.16b
tbnz x6, #1, 1f
- st1 {v1.16b}, [x0], #16
+ st1 {v1.16b}, [x19], #16
mov v24.16b, v26.16b
tbnz x6, #2, 1f
- st1 {v6.16b}, [x0], #16
+ st1 {v6.16b}, [x19], #16
mov v24.16b, v27.16b
tbnz x6, #3, 1f
- st1 {v4.16b}, [x0], #16
+ st1 {v4.16b}, [x19], #16
mov v24.16b, v28.16b
tbnz x6, #4, 1f
- st1 {v2.16b}, [x0], #16
+ st1 {v2.16b}, [x19], #16
mov v24.16b, v29.16b
tbnz x6, #5, 1f
- st1 {v7.16b}, [x0], #16
+ st1 {v7.16b}, [x19], #16
mov v24.16b, v30.16b
tbnz x6, #6, 1f
- st1 {v3.16b}, [x0], #16
+ st1 {v3.16b}, [x19], #16
mov v24.16b, v31.16b
tbnz x6, #7, 1f
- ld1 {v24.16b}, [x1], #16
- st1 {v5.16b}, [x0], #16
-1: st1 {v24.16b}, [x5] // store IV
+ ld1 {v24.16b}, [x20], #16
+ st1 {v5.16b}, [x19], #16
+1: st1 {v24.16b}, [x24] // store IV
- cbnz x4, 99b
+ cbz x23, 2f
+ b 99b
- ldp x29, x30, [sp], #16
+2: frame_pop
ret
-ENDPROC(aesbs_cbc_decrypt)
+SYM_FUNC_END(aesbs_cbc_decrypt)
.macro next_tweak, out, in, const, tmp
sshr \tmp\().2d, \in\().2d, #63
@@ -718,130 +728,84 @@ ENDPROC(aesbs_cbc_decrypt)
eor \out\().16b, \out\().16b, \tmp\().16b
.endm
- .align 4
-.Lxts_mul_x:
-CPU_LE( .quad 1, 0x87 )
-CPU_BE( .quad 0x87, 1 )
-
/*
* aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[])
* aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[])
*/
-__xts_crypt8:
- mov x6, #1
- lsl x6, x6, x4
- subs w4, w4, #8
- csel x4, x4, xzr, pl
- csel x6, x6, xzr, mi
+SYM_FUNC_START_LOCAL(__xts_crypt8)
+ movi v18.2s, #0x1
+ movi v19.2s, #0x87
+ uzp1 v18.4s, v18.4s, v19.4s
+
+ ld1 {v0.16b-v3.16b}, [x1], #64
+ ld1 {v4.16b-v7.16b}, [x1], #64
+
+ next_tweak v26, v25, v18, v19
+ next_tweak v27, v26, v18, v19
+ next_tweak v28, v27, v18, v19
+ next_tweak v29, v28, v18, v19
+ next_tweak v30, v29, v18, v19
+ next_tweak v31, v30, v18, v19
+ next_tweak v16, v31, v18, v19
+ next_tweak v17, v16, v18, v19
- ld1 {v0.16b}, [x1], #16
- next_tweak v26, v25, v30, v31
eor v0.16b, v0.16b, v25.16b
- tbnz x6, #1, 0f
-
- ld1 {v1.16b}, [x1], #16
- next_tweak v27, v26, v30, v31
eor v1.16b, v1.16b, v26.16b
- tbnz x6, #2, 0f
-
- ld1 {v2.16b}, [x1], #16
- next_tweak v28, v27, v30, v31
eor v2.16b, v2.16b, v27.16b
- tbnz x6, #3, 0f
-
- ld1 {v3.16b}, [x1], #16
- next_tweak v29, v28, v30, v31
eor v3.16b, v3.16b, v28.16b
- tbnz x6, #4, 0f
-
- ld1 {v4.16b}, [x1], #16
- str q29, [sp, #16]
eor v4.16b, v4.16b, v29.16b
- next_tweak v29, v29, v30, v31
- tbnz x6, #5, 0f
+ eor v5.16b, v5.16b, v30.16b
+ eor v6.16b, v6.16b, v31.16b
+ eor v7.16b, v7.16b, v16.16b
- ld1 {v5.16b}, [x1], #16
- str q29, [sp, #32]
- eor v5.16b, v5.16b, v29.16b
- next_tweak v29, v29, v30, v31
- tbnz x6, #6, 0f
+ stp q16, q17, [sp, #16]
- ld1 {v6.16b}, [x1], #16
- str q29, [sp, #48]
- eor v6.16b, v6.16b, v29.16b
- next_tweak v29, v29, v30, v31
- tbnz x6, #7, 0f
-
- ld1 {v7.16b}, [x1], #16
- str q29, [sp, #64]
- eor v7.16b, v7.16b, v29.16b
- next_tweak v29, v29, v30, v31
-
-0: mov bskey, x2
+ mov bskey, x2
mov rounds, x3
- br x7
-ENDPROC(__xts_crypt8)
+ br x16
+SYM_FUNC_END(__xts_crypt8)
.macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
- stp x29, x30, [sp, #-80]!
+ stp x29, x30, [sp, #-48]!
mov x29, sp
- ldr q30, .Lxts_mul_x
ld1 {v25.16b}, [x5]
-99: adr x7, \do8
+0: adr x16, \do8
bl __xts_crypt8
- ldp q16, q17, [sp, #16]
- ldp q18, q19, [sp, #48]
+ eor v16.16b, \o0\().16b, v25.16b
+ eor v17.16b, \o1\().16b, v26.16b
+ eor v18.16b, \o2\().16b, v27.16b
+ eor v19.16b, \o3\().16b, v28.16b
- eor \o0\().16b, \o0\().16b, v25.16b
- eor \o1\().16b, \o1\().16b, v26.16b
- eor \o2\().16b, \o2\().16b, v27.16b
- eor \o3\().16b, \o3\().16b, v28.16b
-
- st1 {\o0\().16b}, [x0], #16
- mov v25.16b, v26.16b
- tbnz x6, #1, 1f
- st1 {\o1\().16b}, [x0], #16
- mov v25.16b, v27.16b
- tbnz x6, #2, 1f
- st1 {\o2\().16b}, [x0], #16
- mov v25.16b, v28.16b
- tbnz x6, #3, 1f
- st1 {\o3\().16b}, [x0], #16
- mov v25.16b, v29.16b
- tbnz x6, #4, 1f
+ ldp q24, q25, [sp, #16]
- eor \o4\().16b, \o4\().16b, v16.16b
- eor \o5\().16b, \o5\().16b, v17.16b
- eor \o6\().16b, \o6\().16b, v18.16b
- eor \o7\().16b, \o7\().16b, v19.16b
+ eor v20.16b, \o4\().16b, v29.16b
+ eor v21.16b, \o5\().16b, v30.16b
+ eor v22.16b, \o6\().16b, v31.16b
+ eor v23.16b, \o7\().16b, v24.16b
- st1 {\o4\().16b}, [x0], #16
- tbnz x6, #5, 1f
- st1 {\o5\().16b}, [x0], #16
- tbnz x6, #6, 1f
- st1 {\o6\().16b}, [x0], #16
- tbnz x6, #7, 1f
- st1 {\o7\().16b}, [x0], #16
+ st1 {v16.16b-v19.16b}, [x0], #64
+ st1 {v20.16b-v23.16b}, [x0], #64
- cbnz x4, 99b
+ subs x4, x4, #8
+ b.gt 0b
-1: st1 {v25.16b}, [x5]
- ldp x29, x30, [sp], #80
+ st1 {v25.16b}, [x5]
+ ldp x29, x30, [sp], #48
ret
.endm
-ENTRY(aesbs_xts_encrypt)
+SYM_FUNC_START(aesbs_xts_encrypt)
__xts_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
-ENDPROC(aesbs_xts_encrypt)
+SYM_FUNC_END(aesbs_xts_encrypt)
-ENTRY(aesbs_xts_decrypt)
+SYM_FUNC_START(aesbs_xts_decrypt)
__xts_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
-ENDPROC(aesbs_xts_decrypt)
+SYM_FUNC_END(aesbs_xts_decrypt)
.macro next_ctr, v
mov \v\().d[1], x8
@@ -853,16 +817,12 @@ ENDPROC(aesbs_xts_decrypt)
/*
* aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
- * int rounds, int blocks, u8 iv[], u8 final[])
+ * int rounds, int blocks, u8 iv[])
*/
-ENTRY(aesbs_ctr_encrypt)
+SYM_FUNC_START(aesbs_ctr_encrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
- cmp x6, #0
- cset x10, ne
- add x4, x4, x10 // do one extra block if final
-
ldp x7, x8, [x5]
ld1 {v0.16b}, [x5]
CPU_LE( rev x7, x7 )
@@ -870,103 +830,38 @@ CPU_LE( rev x8, x8 )
adds x8, x8, #1
adc x7, x7, xzr
-99: mov x9, #1
- lsl x9, x9, x4
- subs w4, w4, #8
- csel x4, x4, xzr, pl
- csel x9, x9, xzr, le
-
- tbnz x9, #1, 0f
- next_ctr v1
- tbnz x9, #2, 0f
+0: next_ctr v1
next_ctr v2
- tbnz x9, #3, 0f
next_ctr v3
- tbnz x9, #4, 0f
next_ctr v4
- tbnz x9, #5, 0f
next_ctr v5
- tbnz x9, #6, 0f
next_ctr v6
- tbnz x9, #7, 0f
next_ctr v7
-0: mov bskey, x2
+ mov bskey, x2
mov rounds, x3
bl aesbs_encrypt8
- lsr x9, x9, x10 // disregard the extra block
- tbnz x9, #0, 0f
-
- ld1 {v8.16b}, [x1], #16
- eor v0.16b, v0.16b, v8.16b
- st1 {v0.16b}, [x0], #16
- tbnz x9, #1, 1f
-
- ld1 {v9.16b}, [x1], #16
- eor v1.16b, v1.16b, v9.16b
- st1 {v1.16b}, [x0], #16
- tbnz x9, #2, 2f
+ ld1 { v8.16b-v11.16b}, [x1], #64
+ ld1 {v12.16b-v15.16b}, [x1], #64
- ld1 {v10.16b}, [x1], #16
- eor v4.16b, v4.16b, v10.16b
- st1 {v4.16b}, [x0], #16
- tbnz x9, #3, 3f
-
- ld1 {v11.16b}, [x1], #16
- eor v6.16b, v6.16b, v11.16b
- st1 {v6.16b}, [x0], #16
- tbnz x9, #4, 4f
-
- ld1 {v12.16b}, [x1], #16
- eor v3.16b, v3.16b, v12.16b
- st1 {v3.16b}, [x0], #16
- tbnz x9, #5, 5f
+ eor v8.16b, v0.16b, v8.16b
+ eor v9.16b, v1.16b, v9.16b
+ eor v10.16b, v4.16b, v10.16b
+ eor v11.16b, v6.16b, v11.16b
+ eor v12.16b, v3.16b, v12.16b
+ eor v13.16b, v7.16b, v13.16b
+ eor v14.16b, v2.16b, v14.16b
+ eor v15.16b, v5.16b, v15.16b
- ld1 {v13.16b}, [x1], #16
- eor v7.16b, v7.16b, v13.16b
- st1 {v7.16b}, [x0], #16
- tbnz x9, #6, 6f
+ st1 { v8.16b-v11.16b}, [x0], #64
+ st1 {v12.16b-v15.16b}, [x0], #64
- ld1 {v14.16b}, [x1], #16
- eor v2.16b, v2.16b, v14.16b
- st1 {v2.16b}, [x0], #16
- tbnz x9, #7, 7f
+ next_ctr v0
+ subs x4, x4, #8
+ b.gt 0b
- ld1 {v15.16b}, [x1], #16
- eor v5.16b, v5.16b, v15.16b
- st1 {v5.16b}, [x0], #16
-
-8: next_ctr v0
- cbnz x4, 99b
-
-0: st1 {v0.16b}, [x5]
+ st1 {v0.16b}, [x5]
ldp x29, x30, [sp], #16
ret
-
- /*
- * If we are handling the tail of the input (x6 != NULL), return the
- * final keystream block back to the caller.
- */
-1: cbz x6, 8b
- st1 {v1.16b}, [x6]
- b 8b
-2: cbz x6, 8b
- st1 {v4.16b}, [x6]
- b 8b
-3: cbz x6, 8b
- st1 {v6.16b}, [x6]
- b 8b
-4: cbz x6, 8b
- st1 {v3.16b}, [x6]
- b 8b
-5: cbz x6, 8b
- st1 {v7.16b}, [x6]
- b 8b
-6: cbz x6, 8b
- st1 {v2.16b}, [x6]
- b 8b
-7: cbz x6, 8b
- st1 {v5.16b}, [x6]
- b 8b
-ENDPROC(aesbs_ctr_encrypt)
+SYM_FUNC_END(aesbs_ctr_encrypt)
diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c
index c55d68ccb89f..bac4cabef607 100644
--- a/arch/arm64/crypto/aes-neonbs-glue.c
+++ b/arch/arm64/crypto/aes-neonbs-glue.c
@@ -1,23 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Bit sliced AES using NEON instructions
*
* Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/aes.h>
+#include <crypto/ctr.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
#include <crypto/xts.h>
#include <linux/module.h>
-#include "aes-ctr-fallback.h"
-
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
@@ -37,7 +34,7 @@ asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[]);
asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
- int rounds, int blocks, u8 iv[], u8 final[]);
+ int rounds, int blocks, u8 iv[]);
asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[]);
@@ -46,29 +43,32 @@ asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[],
/* borrowed from aes-neon-blk.ko */
asmlinkage void neon_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[],
- int rounds, int blocks, int first);
+ int rounds, int blocks);
asmlinkage void neon_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
- int rounds, int blocks, u8 iv[],
- int first);
+ int rounds, int blocks, u8 iv[]);
+asmlinkage void neon_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
+ int rounds, int bytes, u8 ctr[]);
+asmlinkage void neon_aes_xts_encrypt(u8 out[], u8 const in[],
+ u32 const rk1[], int rounds, int bytes,
+ u32 const rk2[], u8 iv[], int first);
+asmlinkage void neon_aes_xts_decrypt(u8 out[], u8 const in[],
+ u32 const rk1[], int rounds, int bytes,
+ u32 const rk2[], u8 iv[], int first);
struct aesbs_ctx {
u8 rk[13 * (8 * AES_BLOCK_SIZE) + 32];
int rounds;
} __aligned(AES_BLOCK_SIZE);
-struct aesbs_cbc_ctx {
+struct aesbs_cbc_ctr_ctx {
struct aesbs_ctx key;
u32 enc[AES_MAX_KEYLENGTH_U32];
};
-struct aesbs_ctr_ctx {
- struct aesbs_ctx key; /* must be first member */
- struct crypto_aes_ctx fallback;
-};
-
struct aesbs_xts_ctx {
struct aesbs_ctx key;
u32 twkey[AES_MAX_KEYLENGTH_U32];
+ struct crypto_aes_ctx cts;
};
static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
@@ -78,7 +78,7 @@ static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
struct crypto_aes_ctx rk;
int err;
- err = crypto_aes_expand_key(&rk, in_key, key_len);
+ err = aes_expandkey(&rk, in_key, key_len);
if (err)
return err;
@@ -100,9 +100,8 @@ static int __ecb_crypt(struct skcipher_request *req,
struct skcipher_walk walk;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_neon_begin();
while (walk.nbytes >= AES_BLOCK_SIZE) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
@@ -110,12 +109,13 @@ static int __ecb_crypt(struct skcipher_request *req,
blocks = round_down(blocks,
walk.stride / AES_BLOCK_SIZE);
+ kernel_neon_begin();
fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->rk,
ctx->rounds, blocks);
+ kernel_neon_end();
err = skcipher_walk_done(&walk,
walk.nbytes - blocks * AES_BLOCK_SIZE);
}
- kernel_neon_end();
return err;
}
@@ -130,14 +130,14 @@ static int ecb_decrypt(struct skcipher_request *req)
return __ecb_crypt(req, aesbs_ecb_decrypt);
}
-static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+static int aesbs_cbc_ctr_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
unsigned int key_len)
{
- struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
struct crypto_aes_ctx rk;
int err;
- err = crypto_aes_expand_key(&rk, in_key, key_len);
+ err = aes_expandkey(&rk, in_key, key_len);
if (err)
return err;
@@ -148,6 +148,7 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
kernel_neon_begin();
aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
kernel_neon_end();
+ memzero_explicit(&rk, sizeof(rk));
return 0;
}
@@ -155,37 +156,35 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
static int cbc_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
- int err, first = 1;
+ int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_neon_begin();
while (walk.nbytes >= AES_BLOCK_SIZE) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
/* fall back to the non-bitsliced NEON implementation */
+ kernel_neon_begin();
neon_aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- ctx->enc, ctx->key.rounds, blocks, walk.iv,
- first);
+ ctx->enc, ctx->key.rounds, blocks,
+ walk.iv);
+ kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
- first = 0;
}
- kernel_neon_end();
return err;
}
static int cbc_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_neon_begin();
while (walk.nbytes >= AES_BLOCK_SIZE) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
@@ -193,75 +192,48 @@ static int cbc_decrypt(struct skcipher_request *req)
blocks = round_down(blocks,
walk.stride / AES_BLOCK_SIZE);
+ kernel_neon_begin();
aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key.rk, ctx->key.rounds, blocks,
walk.iv);
+ kernel_neon_end();
err = skcipher_walk_done(&walk,
walk.nbytes - blocks * AES_BLOCK_SIZE);
}
- kernel_neon_end();
return err;
}
-static int aesbs_ctr_setkey_sync(struct crypto_skcipher *tfm, const u8 *in_key,
- unsigned int key_len)
-{
- struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = crypto_aes_expand_key(&ctx->fallback, in_key, key_len);
- if (err)
- return err;
-
- ctx->key.rounds = 6 + key_len / 4;
-
- kernel_neon_begin();
- aesbs_convert_key(ctx->key.rk, ctx->fallback.key_enc, ctx->key.rounds);
- kernel_neon_end();
-
- return 0;
-}
-
static int ctr_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
- u8 buf[AES_BLOCK_SIZE];
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_neon_begin();
while (walk.nbytes > 0) {
- unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
- u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL;
-
- if (walk.nbytes < walk.total) {
- blocks = round_down(blocks,
- walk.stride / AES_BLOCK_SIZE);
- final = NULL;
+ int blocks = (walk.nbytes / AES_BLOCK_SIZE) & ~7;
+ int nbytes = walk.nbytes % (8 * AES_BLOCK_SIZE);
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ kernel_neon_begin();
+ if (blocks >= 8) {
+ aesbs_ctr_encrypt(dst, src, ctx->key.rk, ctx->key.rounds,
+ blocks, walk.iv);
+ dst += blocks * AES_BLOCK_SIZE;
+ src += blocks * AES_BLOCK_SIZE;
}
-
- aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- ctx->rk, ctx->rounds, blocks, walk.iv, final);
-
- if (final) {
- u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
- u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
-
- crypto_xor_cpy(dst, src, final,
- walk.total % AES_BLOCK_SIZE);
-
- err = skcipher_walk_done(&walk, 0);
- break;
+ if (nbytes && walk.nbytes == walk.total) {
+ neon_aes_ctr_encrypt(dst, src, ctx->enc, ctx->key.rounds,
+ nbytes, walk.iv);
+ nbytes = 0;
}
- err = skcipher_walk_done(&walk,
- walk.nbytes - blocks * AES_BLOCK_SIZE);
+ kernel_neon_end();
+ err = skcipher_walk_done(&walk, nbytes);
}
- kernel_neon_end();
-
return err;
}
@@ -277,7 +249,11 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
return err;
key_len /= 2;
- err = crypto_aes_expand_key(&rk, in_key + key_len, key_len);
+ err = aes_expandkey(&ctx->cts, in_key, key_len);
+ if (err)
+ return err;
+
+ err = aes_expandkey(&rk, in_key + key_len, key_len);
if (err)
return err;
@@ -286,68 +262,129 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
return aesbs_setkey(tfm, in_key, key_len);
}
-static int ctr_encrypt_sync(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- if (!may_use_simd())
- return aes_ctr_encrypt_fallback(&ctx->fallback, req);
-
- return ctr_encrypt(req);
-}
-
-static int __xts_crypt(struct skcipher_request *req,
+static int __xts_crypt(struct skcipher_request *req, bool encrypt,
void (*fn)(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[]))
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int tail = req->cryptlen % (8 * AES_BLOCK_SIZE);
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct scatterlist *src, *dst;
struct skcipher_walk walk;
- int err;
+ int nbytes, err;
+ int first = 1;
+ u8 *out, *in;
+
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+
+ /* ensure that the cts tail is covered by a single step */
+ if (unlikely(tail > 0 && tail < AES_BLOCK_SIZE)) {
+ int xts_blocks = DIV_ROUND_UP(req->cryptlen,
+ AES_BLOCK_SIZE) - 2;
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq,
+ skcipher_request_flags(req),
+ NULL, NULL);
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ xts_blocks * AES_BLOCK_SIZE,
+ req->iv);
+ req = &subreq;
+ } else {
+ tail = 0;
+ }
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
+ if (err)
+ return err;
- kernel_neon_begin();
+ while (walk.nbytes >= AES_BLOCK_SIZE) {
+ int blocks = (walk.nbytes / AES_BLOCK_SIZE) & ~7;
+ out = walk.dst.virt.addr;
+ in = walk.src.virt.addr;
+ nbytes = walk.nbytes;
+
+ kernel_neon_begin();
+ if (blocks >= 8) {
+ if (first == 1)
+ neon_aes_ecb_encrypt(walk.iv, walk.iv,
+ ctx->twkey,
+ ctx->key.rounds, 1);
+ first = 2;
+
+ fn(out, in, ctx->key.rk, ctx->key.rounds, blocks,
+ walk.iv);
+
+ out += blocks * AES_BLOCK_SIZE;
+ in += blocks * AES_BLOCK_SIZE;
+ nbytes -= blocks * AES_BLOCK_SIZE;
+ }
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ if (encrypt)
+ neon_aes_xts_encrypt(out, in, ctx->cts.key_enc,
+ ctx->key.rounds, nbytes,
+ ctx->twkey, walk.iv, first);
+ else
+ neon_aes_xts_decrypt(out, in, ctx->cts.key_dec,
+ ctx->key.rounds, nbytes,
+ ctx->twkey, walk.iv, first);
+ nbytes = first = 0;
+ }
+ kernel_neon_end();
+ err = skcipher_walk_done(&walk, nbytes);
+ }
- neon_aes_ecb_encrypt(walk.iv, walk.iv, ctx->twkey,
- ctx->key.rounds, 1, 1);
+ if (err || likely(!tail))
+ return err;
- while (walk.nbytes >= AES_BLOCK_SIZE) {
- unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+ /* handle ciphertext stealing */
+ dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
- if (walk.nbytes < walk.total)
- blocks = round_down(blocks,
- walk.stride / AES_BLOCK_SIZE);
+ skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
+ req->iv);
- fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->key.rk,
- ctx->key.rounds, blocks, walk.iv);
- err = skcipher_walk_done(&walk,
- walk.nbytes - blocks * AES_BLOCK_SIZE);
- }
+ err = skcipher_walk_virt(&walk, req, false);
+ if (err)
+ return err;
+
+ out = walk.dst.virt.addr;
+ in = walk.src.virt.addr;
+ nbytes = walk.nbytes;
+
+ kernel_neon_begin();
+ if (encrypt)
+ neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, ctx->key.rounds,
+ nbytes, ctx->twkey, walk.iv, first);
+ else
+ neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, ctx->key.rounds,
+ nbytes, ctx->twkey, walk.iv, first);
kernel_neon_end();
- return err;
+ return skcipher_walk_done(&walk, 0);
}
static int xts_encrypt(struct skcipher_request *req)
{
- return __xts_crypt(req, aesbs_xts_encrypt);
+ return __xts_crypt(req, true, aesbs_xts_encrypt);
}
static int xts_decrypt(struct skcipher_request *req)
{
- return __xts_crypt(req, aesbs_xts_decrypt);
+ return __xts_crypt(req, false, aesbs_xts_decrypt);
}
static struct skcipher_alg aes_algs[] = { {
- .base.cra_name = "__ecb(aes)",
- .base.cra_driver_name = "__ecb-aes-neonbs",
+ .base.cra_name = "ecb(aes)",
+ .base.cra_driver_name = "ecb-aes-neonbs",
.base.cra_priority = 250,
.base.cra_blocksize = AES_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct aesbs_ctx),
.base.cra_module = THIS_MODULE,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
@@ -356,62 +393,43 @@ static struct skcipher_alg aes_algs[] = { {
.encrypt = ecb_encrypt,
.decrypt = ecb_decrypt,
}, {
- .base.cra_name = "__cbc(aes)",
- .base.cra_driver_name = "__cbc-aes-neonbs",
+ .base.cra_name = "cbc(aes)",
+ .base.cra_driver_name = "cbc-aes-neonbs",
.base.cra_priority = 250,
.base.cra_blocksize = AES_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctx),
+ .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctr_ctx),
.base.cra_module = THIS_MODULE,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.walksize = 8 * AES_BLOCK_SIZE,
.ivsize = AES_BLOCK_SIZE,
- .setkey = aesbs_cbc_setkey,
+ .setkey = aesbs_cbc_ctr_setkey,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
}, {
- .base.cra_name = "__ctr(aes)",
- .base.cra_driver_name = "__ctr-aes-neonbs",
+ .base.cra_name = "ctr(aes)",
+ .base.cra_driver_name = "ctr-aes-neonbs",
.base.cra_priority = 250,
.base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct aesbs_ctx),
+ .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctr_ctx),
.base.cra_module = THIS_MODULE,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.chunksize = AES_BLOCK_SIZE,
.walksize = 8 * AES_BLOCK_SIZE,
.ivsize = AES_BLOCK_SIZE,
- .setkey = aesbs_setkey,
+ .setkey = aesbs_cbc_ctr_setkey,
.encrypt = ctr_encrypt,
.decrypt = ctr_encrypt,
}, {
- .base.cra_name = "ctr(aes)",
- .base.cra_driver_name = "ctr-aes-neonbs",
- .base.cra_priority = 250 - 1,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct aesbs_ctr_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .chunksize = AES_BLOCK_SIZE,
- .walksize = 8 * AES_BLOCK_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = aesbs_ctr_setkey_sync,
- .encrypt = ctr_encrypt_sync,
- .decrypt = ctr_encrypt_sync,
-}, {
- .base.cra_name = "__xts(aes)",
- .base.cra_driver_name = "__xts-aes-neonbs",
+ .base.cra_name = "xts(aes)",
+ .base.cra_driver_name = "xts-aes-neonbs",
.base.cra_priority = 250,
.base.cra_blocksize = AES_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct aesbs_xts_ctx),
.base.cra_module = THIS_MODULE,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.min_keysize = 2 * AES_MIN_KEY_SIZE,
.max_keysize = 2 * AES_MAX_KEY_SIZE,
@@ -422,54 +440,17 @@ static struct skcipher_alg aes_algs[] = { {
.decrypt = xts_decrypt,
} };
-static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
-
static void aes_exit(void)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
- if (aes_simd_algs[i])
- simd_skcipher_free(aes_simd_algs[i]);
-
crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
}
static int __init aes_init(void)
{
- struct simd_skcipher_alg *simd;
- const char *basename;
- const char *algname;
- const char *drvname;
- int err;
- int i;
-
- if (!(elf_hwcap & HWCAP_ASIMD))
+ if (!cpu_have_named_feature(ASIMD))
return -ENODEV;
- err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
- if (err)
- return err;
-
- for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
- if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
- continue;
-
- algname = aes_algs[i].base.cra_name + 2;
- drvname = aes_algs[i].base.cra_driver_name + 2;
- basename = aes_algs[i].base.cra_driver_name;
- simd = simd_skcipher_create_compat(algname, drvname, basename);
- err = PTR_ERR(simd);
- if (IS_ERR(simd))
- goto unregister_simds;
-
- aes_simd_algs[i] = simd;
- }
- return 0;
-
-unregister_simds:
- aes_exit();
- return err;
+ return crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
}
module_init(aes_init);
diff --git a/arch/arm64/crypto/chacha20-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S
index 13c85e272c2a..b70ac76f2610 100644
--- a/arch/arm64/crypto/chacha20-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -1,13 +1,13 @@
/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
+ * ChaCha/XChaCha NEON helper functions
*
- * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
- * Based on:
+ * Originally based on:
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
*
* Copyright (C) 2015 Martin Willi
@@ -19,29 +19,27 @@
*/
#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/cache.h>
.text
.align 6
-ENTRY(chacha20_block_xor_neon)
- // x0: Input state matrix, s
- // x1: 1 data block output, o
- // x2: 1 data block input, i
-
- //
- // This function encrypts one ChaCha20 block by loading the state matrix
- // in four NEON registers. It performs matrix operation on four words in
- // parallel, but requires shuffling to rearrange the words after each
- // round.
- //
-
- // x0..3 = s0..3
- adr x3, ROT8
- ld1 {v0.4s-v3.4s}, [x0]
- ld1 {v8.4s-v11.4s}, [x0]
- ld1 {v12.4s}, [x3]
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers v0-v3. It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * The round count is given in w3.
+ *
+ * Clobbers: w3, x10, v4, v12
+ */
+SYM_FUNC_START_LOCAL(chacha_permute)
- mov x3, #10
+ adr_l x10, ROT8
+ ld1 {v12.4s}, [x10]
.Ldoubleround:
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
@@ -102,9 +100,27 @@ ENTRY(chacha20_block_xor_neon)
// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
ext v3.16b, v3.16b, v3.16b, #4
- subs x3, x3, #1
+ subs w3, w3, #2
b.ne .Ldoubleround
+ ret
+SYM_FUNC_END(chacha_permute)
+
+SYM_FUNC_START(chacha_block_xor_neon)
+ // x0: Input state matrix, s
+ // x1: 1 data block output, o
+ // x2: 1 data block input, i
+ // w3: nrounds
+
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ // x0..3 = s0..3
+ ld1 {v0.4s-v3.4s}, [x0]
+ ld1 {v8.4s-v11.4s}, [x0]
+
+ bl chacha_permute
+
ld1 {v4.16b-v7.16b}, [x2]
// o0 = i0 ^ (x0 + s0)
@@ -125,71 +141,155 @@ ENTRY(chacha20_block_xor_neon)
st1 {v0.16b-v3.16b}, [x1]
+ ldp x29, x30, [sp], #16
ret
-ENDPROC(chacha20_block_xor_neon)
+SYM_FUNC_END(chacha_block_xor_neon)
+
+SYM_FUNC_START(hchacha_block_neon)
+ // x0: Input state matrix, s
+ // x1: output (8 32-bit words)
+ // w2: nrounds
+
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ ld1 {v0.4s-v3.4s}, [x0]
+
+ mov w3, w2
+ bl chacha_permute
+
+ st1 {v0.4s}, [x1], #16
+ st1 {v3.4s}, [x1]
+
+ ldp x29, x30, [sp], #16
+ ret
+SYM_FUNC_END(hchacha_block_neon)
+
+ a0 .req w12
+ a1 .req w13
+ a2 .req w14
+ a3 .req w15
+ a4 .req w16
+ a5 .req w17
+ a6 .req w19
+ a7 .req w20
+ a8 .req w21
+ a9 .req w22
+ a10 .req w23
+ a11 .req w24
+ a12 .req w25
+ a13 .req w26
+ a14 .req w27
+ a15 .req w28
.align 6
-ENTRY(chacha20_4block_xor_neon)
+SYM_FUNC_START(chacha_4block_xor_neon)
+ frame_push 10
+
// x0: Input state matrix, s
// x1: 4 data blocks output, o
// x2: 4 data blocks input, i
+ // w3: nrounds
+ // x4: byte count
+
+ adr_l x10, .Lpermute
+ and x5, x4, #63
+ add x10, x10, x5
//
- // This function encrypts four consecutive ChaCha20 blocks by loading
+ // This function encrypts four consecutive ChaCha blocks by loading
// the state matrix in NEON registers four times. The algorithm performs
// each operation on the corresponding word of each state matrix, hence
// requires no word shuffling. For final XORing step we transpose the
// matrix by interleaving 32- and then 64-bit words, which allows us to
// do XOR in NEON registers.
//
- adr x3, CTRINC // ... and ROT8
- ld1 {v30.4s-v31.4s}, [x3]
+ // At the same time, a fifth block is encrypted in parallel using
+ // scalar registers
+ //
+ adr_l x9, CTRINC // ... and ROT8
+ ld1 {v30.4s-v31.4s}, [x9]
// x0..15[0-3] = s0..3[0..3]
- mov x4, x0
- ld4r { v0.4s- v3.4s}, [x4], #16
- ld4r { v4.4s- v7.4s}, [x4], #16
- ld4r { v8.4s-v11.4s}, [x4], #16
- ld4r {v12.4s-v15.4s}, [x4]
-
- // x12 += counter values 0-3
+ add x8, x0, #16
+ ld4r { v0.4s- v3.4s}, [x0]
+ ld4r { v4.4s- v7.4s}, [x8], #16
+ ld4r { v8.4s-v11.4s}, [x8], #16
+ ld4r {v12.4s-v15.4s}, [x8]
+
+ mov a0, v0.s[0]
+ mov a1, v1.s[0]
+ mov a2, v2.s[0]
+ mov a3, v3.s[0]
+ mov a4, v4.s[0]
+ mov a5, v5.s[0]
+ mov a6, v6.s[0]
+ mov a7, v7.s[0]
+ mov a8, v8.s[0]
+ mov a9, v9.s[0]
+ mov a10, v10.s[0]
+ mov a11, v11.s[0]
+ mov a12, v12.s[0]
+ mov a13, v13.s[0]
+ mov a14, v14.s[0]
+ mov a15, v15.s[0]
+
+ // x12 += counter values 1-4
add v12.4s, v12.4s, v30.4s
- mov x3, #10
-
.Ldoubleround4:
// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
add v0.4s, v0.4s, v4.4s
+ add a0, a0, a4
add v1.4s, v1.4s, v5.4s
+ add a1, a1, a5
add v2.4s, v2.4s, v6.4s
+ add a2, a2, a6
add v3.4s, v3.4s, v7.4s
+ add a3, a3, a7
eor v12.16b, v12.16b, v0.16b
+ eor a12, a12, a0
eor v13.16b, v13.16b, v1.16b
+ eor a13, a13, a1
eor v14.16b, v14.16b, v2.16b
+ eor a14, a14, a2
eor v15.16b, v15.16b, v3.16b
+ eor a15, a15, a3
rev32 v12.8h, v12.8h
+ ror a12, a12, #16
rev32 v13.8h, v13.8h
+ ror a13, a13, #16
rev32 v14.8h, v14.8h
+ ror a14, a14, #16
rev32 v15.8h, v15.8h
+ ror a15, a15, #16
// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
add v8.4s, v8.4s, v12.4s
+ add a8, a8, a12
add v9.4s, v9.4s, v13.4s
+ add a9, a9, a13
add v10.4s, v10.4s, v14.4s
+ add a10, a10, a14
add v11.4s, v11.4s, v15.4s
+ add a11, a11, a15
eor v16.16b, v4.16b, v8.16b
+ eor a4, a4, a8
eor v17.16b, v5.16b, v9.16b
+ eor a5, a5, a9
eor v18.16b, v6.16b, v10.16b
+ eor a6, a6, a10
eor v19.16b, v7.16b, v11.16b
+ eor a7, a7, a11
shl v4.4s, v16.4s, #12
shl v5.4s, v17.4s, #12
@@ -197,42 +297,66 @@ ENTRY(chacha20_4block_xor_neon)
shl v7.4s, v19.4s, #12
sri v4.4s, v16.4s, #20
+ ror a4, a4, #20
sri v5.4s, v17.4s, #20
+ ror a5, a5, #20
sri v6.4s, v18.4s, #20
+ ror a6, a6, #20
sri v7.4s, v19.4s, #20
+ ror a7, a7, #20
// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
add v0.4s, v0.4s, v4.4s
+ add a0, a0, a4
add v1.4s, v1.4s, v5.4s
+ add a1, a1, a5
add v2.4s, v2.4s, v6.4s
+ add a2, a2, a6
add v3.4s, v3.4s, v7.4s
+ add a3, a3, a7
eor v12.16b, v12.16b, v0.16b
+ eor a12, a12, a0
eor v13.16b, v13.16b, v1.16b
+ eor a13, a13, a1
eor v14.16b, v14.16b, v2.16b
+ eor a14, a14, a2
eor v15.16b, v15.16b, v3.16b
+ eor a15, a15, a3
tbl v12.16b, {v12.16b}, v31.16b
+ ror a12, a12, #24
tbl v13.16b, {v13.16b}, v31.16b
+ ror a13, a13, #24
tbl v14.16b, {v14.16b}, v31.16b
+ ror a14, a14, #24
tbl v15.16b, {v15.16b}, v31.16b
+ ror a15, a15, #24
// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
add v8.4s, v8.4s, v12.4s
+ add a8, a8, a12
add v9.4s, v9.4s, v13.4s
+ add a9, a9, a13
add v10.4s, v10.4s, v14.4s
+ add a10, a10, a14
add v11.4s, v11.4s, v15.4s
+ add a11, a11, a15
eor v16.16b, v4.16b, v8.16b
+ eor a4, a4, a8
eor v17.16b, v5.16b, v9.16b
+ eor a5, a5, a9
eor v18.16b, v6.16b, v10.16b
+ eor a6, a6, a10
eor v19.16b, v7.16b, v11.16b
+ eor a7, a7, a11
shl v4.4s, v16.4s, #7
shl v5.4s, v17.4s, #7
@@ -240,42 +364,66 @@ ENTRY(chacha20_4block_xor_neon)
shl v7.4s, v19.4s, #7
sri v4.4s, v16.4s, #25
+ ror a4, a4, #25
sri v5.4s, v17.4s, #25
+ ror a5, a5, #25
sri v6.4s, v18.4s, #25
+ ror a6, a6, #25
sri v7.4s, v19.4s, #25
+ ror a7, a7, #25
// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
add v0.4s, v0.4s, v5.4s
+ add a0, a0, a5
add v1.4s, v1.4s, v6.4s
+ add a1, a1, a6
add v2.4s, v2.4s, v7.4s
+ add a2, a2, a7
add v3.4s, v3.4s, v4.4s
+ add a3, a3, a4
eor v15.16b, v15.16b, v0.16b
+ eor a15, a15, a0
eor v12.16b, v12.16b, v1.16b
+ eor a12, a12, a1
eor v13.16b, v13.16b, v2.16b
+ eor a13, a13, a2
eor v14.16b, v14.16b, v3.16b
+ eor a14, a14, a3
rev32 v15.8h, v15.8h
+ ror a15, a15, #16
rev32 v12.8h, v12.8h
+ ror a12, a12, #16
rev32 v13.8h, v13.8h
+ ror a13, a13, #16
rev32 v14.8h, v14.8h
+ ror a14, a14, #16
// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
add v10.4s, v10.4s, v15.4s
+ add a10, a10, a15
add v11.4s, v11.4s, v12.4s
+ add a11, a11, a12
add v8.4s, v8.4s, v13.4s
+ add a8, a8, a13
add v9.4s, v9.4s, v14.4s
+ add a9, a9, a14
eor v16.16b, v5.16b, v10.16b
+ eor a5, a5, a10
eor v17.16b, v6.16b, v11.16b
+ eor a6, a6, a11
eor v18.16b, v7.16b, v8.16b
+ eor a7, a7, a8
eor v19.16b, v4.16b, v9.16b
+ eor a4, a4, a9
shl v5.4s, v16.4s, #12
shl v6.4s, v17.4s, #12
@@ -283,42 +431,66 @@ ENTRY(chacha20_4block_xor_neon)
shl v4.4s, v19.4s, #12
sri v5.4s, v16.4s, #20
+ ror a5, a5, #20
sri v6.4s, v17.4s, #20
+ ror a6, a6, #20
sri v7.4s, v18.4s, #20
+ ror a7, a7, #20
sri v4.4s, v19.4s, #20
+ ror a4, a4, #20
// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
add v0.4s, v0.4s, v5.4s
+ add a0, a0, a5
add v1.4s, v1.4s, v6.4s
+ add a1, a1, a6
add v2.4s, v2.4s, v7.4s
+ add a2, a2, a7
add v3.4s, v3.4s, v4.4s
+ add a3, a3, a4
eor v15.16b, v15.16b, v0.16b
+ eor a15, a15, a0
eor v12.16b, v12.16b, v1.16b
+ eor a12, a12, a1
eor v13.16b, v13.16b, v2.16b
+ eor a13, a13, a2
eor v14.16b, v14.16b, v3.16b
+ eor a14, a14, a3
tbl v15.16b, {v15.16b}, v31.16b
+ ror a15, a15, #24
tbl v12.16b, {v12.16b}, v31.16b
+ ror a12, a12, #24
tbl v13.16b, {v13.16b}, v31.16b
+ ror a13, a13, #24
tbl v14.16b, {v14.16b}, v31.16b
+ ror a14, a14, #24
// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
add v10.4s, v10.4s, v15.4s
+ add a10, a10, a15
add v11.4s, v11.4s, v12.4s
+ add a11, a11, a12
add v8.4s, v8.4s, v13.4s
+ add a8, a8, a13
add v9.4s, v9.4s, v14.4s
+ add a9, a9, a14
eor v16.16b, v5.16b, v10.16b
+ eor a5, a5, a10
eor v17.16b, v6.16b, v11.16b
+ eor a6, a6, a11
eor v18.16b, v7.16b, v8.16b
+ eor a7, a7, a8
eor v19.16b, v4.16b, v9.16b
+ eor a4, a4, a9
shl v5.4s, v16.4s, #7
shl v6.4s, v17.4s, #7
@@ -326,11 +498,15 @@ ENTRY(chacha20_4block_xor_neon)
shl v4.4s, v19.4s, #7
sri v5.4s, v16.4s, #25
+ ror a5, a5, #25
sri v6.4s, v17.4s, #25
+ ror a6, a6, #25
sri v7.4s, v18.4s, #25
+ ror a7, a7, #25
sri v4.4s, v19.4s, #25
+ ror a4, a4, #25
- subs x3, x3, #1
+ subs w3, w3, #2
b.ne .Ldoubleround4
ld4r {v16.4s-v19.4s}, [x0], #16
@@ -344,9 +520,21 @@ ENTRY(chacha20_4block_xor_neon)
// x2[0-3] += s0[2]
// x3[0-3] += s0[3]
add v0.4s, v0.4s, v16.4s
+ mov w6, v16.s[0]
+ mov w7, v17.s[0]
add v1.4s, v1.4s, v17.4s
+ mov w8, v18.s[0]
+ mov w9, v19.s[0]
add v2.4s, v2.4s, v18.4s
+ add a0, a0, w6
+ add a1, a1, w7
add v3.4s, v3.4s, v19.4s
+ add a2, a2, w8
+ add a3, a3, w9
+CPU_BE( rev a0, a0 )
+CPU_BE( rev a1, a1 )
+CPU_BE( rev a2, a2 )
+CPU_BE( rev a3, a3 )
ld4r {v24.4s-v27.4s}, [x0], #16
ld4r {v28.4s-v31.4s}, [x0]
@@ -356,69 +544,154 @@ ENTRY(chacha20_4block_xor_neon)
// x6[0-3] += s1[2]
// x7[0-3] += s1[3]
add v4.4s, v4.4s, v20.4s
+ mov w6, v20.s[0]
+ mov w7, v21.s[0]
add v5.4s, v5.4s, v21.4s
+ mov w8, v22.s[0]
+ mov w9, v23.s[0]
add v6.4s, v6.4s, v22.4s
+ add a4, a4, w6
+ add a5, a5, w7
add v7.4s, v7.4s, v23.4s
+ add a6, a6, w8
+ add a7, a7, w9
+CPU_BE( rev a4, a4 )
+CPU_BE( rev a5, a5 )
+CPU_BE( rev a6, a6 )
+CPU_BE( rev a7, a7 )
// x8[0-3] += s2[0]
// x9[0-3] += s2[1]
// x10[0-3] += s2[2]
// x11[0-3] += s2[3]
add v8.4s, v8.4s, v24.4s
+ mov w6, v24.s[0]
+ mov w7, v25.s[0]
add v9.4s, v9.4s, v25.4s
+ mov w8, v26.s[0]
+ mov w9, v27.s[0]
add v10.4s, v10.4s, v26.4s
+ add a8, a8, w6
+ add a9, a9, w7
add v11.4s, v11.4s, v27.4s
+ add a10, a10, w8
+ add a11, a11, w9
+CPU_BE( rev a8, a8 )
+CPU_BE( rev a9, a9 )
+CPU_BE( rev a10, a10 )
+CPU_BE( rev a11, a11 )
// x12[0-3] += s3[0]
// x13[0-3] += s3[1]
// x14[0-3] += s3[2]
// x15[0-3] += s3[3]
add v12.4s, v12.4s, v28.4s
+ mov w6, v28.s[0]
+ mov w7, v29.s[0]
add v13.4s, v13.4s, v29.4s
+ mov w8, v30.s[0]
+ mov w9, v31.s[0]
add v14.4s, v14.4s, v30.4s
+ add a12, a12, w6
+ add a13, a13, w7
add v15.4s, v15.4s, v31.4s
+ add a14, a14, w8
+ add a15, a15, w9
+CPU_BE( rev a12, a12 )
+CPU_BE( rev a13, a13 )
+CPU_BE( rev a14, a14 )
+CPU_BE( rev a15, a15 )
// interleave 32-bit words in state n, n+1
+ ldp w6, w7, [x2], #64
zip1 v16.4s, v0.4s, v1.4s
+ ldp w8, w9, [x2, #-56]
+ eor a0, a0, w6
zip2 v17.4s, v0.4s, v1.4s
+ eor a1, a1, w7
zip1 v18.4s, v2.4s, v3.4s
+ eor a2, a2, w8
zip2 v19.4s, v2.4s, v3.4s
+ eor a3, a3, w9
+ ldp w6, w7, [x2, #-48]
zip1 v20.4s, v4.4s, v5.4s
+ ldp w8, w9, [x2, #-40]
+ eor a4, a4, w6
zip2 v21.4s, v4.4s, v5.4s
+ eor a5, a5, w7
zip1 v22.4s, v6.4s, v7.4s
+ eor a6, a6, w8
zip2 v23.4s, v6.4s, v7.4s
+ eor a7, a7, w9
+ ldp w6, w7, [x2, #-32]
zip1 v24.4s, v8.4s, v9.4s
+ ldp w8, w9, [x2, #-24]
+ eor a8, a8, w6
zip2 v25.4s, v8.4s, v9.4s
+ eor a9, a9, w7
zip1 v26.4s, v10.4s, v11.4s
+ eor a10, a10, w8
zip2 v27.4s, v10.4s, v11.4s
+ eor a11, a11, w9
+ ldp w6, w7, [x2, #-16]
zip1 v28.4s, v12.4s, v13.4s
+ ldp w8, w9, [x2, #-8]
+ eor a12, a12, w6
zip2 v29.4s, v12.4s, v13.4s
+ eor a13, a13, w7
zip1 v30.4s, v14.4s, v15.4s
+ eor a14, a14, w8
zip2 v31.4s, v14.4s, v15.4s
+ eor a15, a15, w9
+
+ add x3, x2, x4
+ sub x3, x3, #128 // start of last block
+
+ subs x5, x4, #128
+ csel x2, x2, x3, ge
// interleave 64-bit words in state n, n+2
zip1 v0.2d, v16.2d, v18.2d
zip2 v4.2d, v16.2d, v18.2d
+ stp a0, a1, [x1], #64
zip1 v8.2d, v17.2d, v19.2d
zip2 v12.2d, v17.2d, v19.2d
+ stp a2, a3, [x1, #-56]
+
+ subs x6, x4, #192
ld1 {v16.16b-v19.16b}, [x2], #64
+ csel x2, x2, x3, ge
zip1 v1.2d, v20.2d, v22.2d
zip2 v5.2d, v20.2d, v22.2d
+ stp a4, a5, [x1, #-48]
zip1 v9.2d, v21.2d, v23.2d
zip2 v13.2d, v21.2d, v23.2d
+ stp a6, a7, [x1, #-40]
+
+ subs x7, x4, #256
ld1 {v20.16b-v23.16b}, [x2], #64
+ csel x2, x2, x3, ge
zip1 v2.2d, v24.2d, v26.2d
zip2 v6.2d, v24.2d, v26.2d
+ stp a8, a9, [x1, #-32]
zip1 v10.2d, v25.2d, v27.2d
zip2 v14.2d, v25.2d, v27.2d
+ stp a10, a11, [x1, #-24]
+
+ subs x8, x4, #320
ld1 {v24.16b-v27.16b}, [x2], #64
+ csel x2, x2, x3, ge
zip1 v3.2d, v28.2d, v30.2d
zip2 v7.2d, v28.2d, v30.2d
+ stp a12, a13, [x1, #-16]
zip1 v11.2d, v29.2d, v31.2d
zip2 v15.2d, v29.2d, v31.2d
+ stp a14, a15, [x1, #-8]
+
+ tbnz x5, #63, .Lt128
ld1 {v28.16b-v31.16b}, [x2]
// xor with corresponding input, write to output
@@ -426,25 +699,107 @@ ENTRY(chacha20_4block_xor_neon)
eor v17.16b, v17.16b, v1.16b
eor v18.16b, v18.16b, v2.16b
eor v19.16b, v19.16b, v3.16b
+
+ tbnz x6, #63, .Lt192
+
eor v20.16b, v20.16b, v4.16b
eor v21.16b, v21.16b, v5.16b
- st1 {v16.16b-v19.16b}, [x1], #64
eor v22.16b, v22.16b, v6.16b
eor v23.16b, v23.16b, v7.16b
+
+ st1 {v16.16b-v19.16b}, [x1], #64
+ tbnz x7, #63, .Lt256
+
eor v24.16b, v24.16b, v8.16b
eor v25.16b, v25.16b, v9.16b
- st1 {v20.16b-v23.16b}, [x1], #64
eor v26.16b, v26.16b, v10.16b
eor v27.16b, v27.16b, v11.16b
+
+ st1 {v20.16b-v23.16b}, [x1], #64
+ tbnz x8, #63, .Lt320
+
eor v28.16b, v28.16b, v12.16b
- st1 {v24.16b-v27.16b}, [x1], #64
eor v29.16b, v29.16b, v13.16b
eor v30.16b, v30.16b, v14.16b
eor v31.16b, v31.16b, v15.16b
+
+ st1 {v24.16b-v27.16b}, [x1], #64
st1 {v28.16b-v31.16b}, [x1]
+.Lout: frame_pop
ret
-ENDPROC(chacha20_4block_xor_neon)
-CTRINC: .word 0, 1, 2, 3
+ // fewer than 192 bytes of in/output
+.Lt192: cbz x5, 1f // exactly 128 bytes?
+ ld1 {v28.16b-v31.16b}, [x10]
+ add x5, x5, x1
+ tbl v28.16b, {v4.16b-v7.16b}, v28.16b
+ tbl v29.16b, {v4.16b-v7.16b}, v29.16b
+ tbl v30.16b, {v4.16b-v7.16b}, v30.16b
+ tbl v31.16b, {v4.16b-v7.16b}, v31.16b
+
+0: eor v20.16b, v20.16b, v28.16b
+ eor v21.16b, v21.16b, v29.16b
+ eor v22.16b, v22.16b, v30.16b
+ eor v23.16b, v23.16b, v31.16b
+ st1 {v20.16b-v23.16b}, [x5] // overlapping stores
+1: st1 {v16.16b-v19.16b}, [x1]
+ b .Lout
+
+ // fewer than 128 bytes of in/output
+.Lt128: ld1 {v28.16b-v31.16b}, [x10]
+ add x5, x5, x1
+ sub x1, x1, #64
+ tbl v28.16b, {v0.16b-v3.16b}, v28.16b
+ tbl v29.16b, {v0.16b-v3.16b}, v29.16b
+ tbl v30.16b, {v0.16b-v3.16b}, v30.16b
+ tbl v31.16b, {v0.16b-v3.16b}, v31.16b
+ ld1 {v16.16b-v19.16b}, [x1] // reload first output block
+ b 0b
+
+ // fewer than 256 bytes of in/output
+.Lt256: cbz x6, 2f // exactly 192 bytes?
+ ld1 {v4.16b-v7.16b}, [x10]
+ add x6, x6, x1
+ tbl v0.16b, {v8.16b-v11.16b}, v4.16b
+ tbl v1.16b, {v8.16b-v11.16b}, v5.16b
+ tbl v2.16b, {v8.16b-v11.16b}, v6.16b
+ tbl v3.16b, {v8.16b-v11.16b}, v7.16b
+
+ eor v28.16b, v28.16b, v0.16b
+ eor v29.16b, v29.16b, v1.16b
+ eor v30.16b, v30.16b, v2.16b
+ eor v31.16b, v31.16b, v3.16b
+ st1 {v28.16b-v31.16b}, [x6] // overlapping stores
+2: st1 {v20.16b-v23.16b}, [x1]
+ b .Lout
+
+ // fewer than 320 bytes of in/output
+.Lt320: cbz x7, 3f // exactly 256 bytes?
+ ld1 {v4.16b-v7.16b}, [x10]
+ add x7, x7, x1
+ tbl v0.16b, {v12.16b-v15.16b}, v4.16b
+ tbl v1.16b, {v12.16b-v15.16b}, v5.16b
+ tbl v2.16b, {v12.16b-v15.16b}, v6.16b
+ tbl v3.16b, {v12.16b-v15.16b}, v7.16b
+
+ eor v28.16b, v28.16b, v0.16b
+ eor v29.16b, v29.16b, v1.16b
+ eor v30.16b, v30.16b, v2.16b
+ eor v31.16b, v31.16b, v3.16b
+ st1 {v28.16b-v31.16b}, [x7] // overlapping stores
+3: st1 {v24.16b-v27.16b}, [x1]
+ b .Lout
+SYM_FUNC_END(chacha_4block_xor_neon)
+
+ .section ".rodata", "a", %progbits
+ .align L1_CACHE_SHIFT
+.Lpermute:
+ .set .Li, 0
+ .rept 128
+ .byte (.Li - 64)
+ .set .Li, .Li + 1
+ .endr
+
+CTRINC: .word 1, 2, 3, 4
ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
new file mode 100644
index 000000000000..af2bbca38e70
--- /dev/null
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -0,0 +1,243 @@
+/*
+ * ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/chacha.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
+ int nrounds);
+asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
+ int nrounds, int bytes);
+asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
+ int bytes, int nrounds)
+{
+ while (bytes > 0) {
+ int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
+
+ if (l <= CHACHA_BLOCK_SIZE) {
+ u8 buf[CHACHA_BLOCK_SIZE];
+
+ memcpy(buf, src, l);
+ chacha_block_xor_neon(state, buf, buf, nrounds);
+ memcpy(dst, buf, l);
+ state[12] += 1;
+ break;
+ }
+ chacha_4block_xor_neon(state, dst, src, nrounds, l);
+ bytes -= l;
+ src += l;
+ dst += l;
+ state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
+ }
+}
+
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+{
+ if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
+ hchacha_block_generic(state, stream, nrounds);
+ } else {
+ kernel_neon_begin();
+ hchacha_block_neon(state, stream, nrounds);
+ kernel_neon_end();
+ }
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+ chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+ int nrounds)
+{
+ if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
+ !crypto_simd_usable())
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ kernel_neon_begin();
+ chacha_doneon(state, dst, src, todo, nrounds);
+ kernel_neon_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+static int chacha_neon_stream_xor(struct skcipher_request *req,
+ const struct chacha_ctx *ctx, const u8 *iv)
+{
+ struct skcipher_walk walk;
+ u32 state[16];
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ chacha_init_generic(state, ctx->key, iv);
+
+ while (walk.nbytes > 0) {
+ unsigned int nbytes = walk.nbytes;
+
+ if (nbytes < walk.total)
+ nbytes = rounddown(nbytes, walk.stride);
+
+ if (!static_branch_likely(&have_neon) ||
+ !crypto_simd_usable()) {
+ chacha_crypt_generic(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes,
+ ctx->nrounds);
+ } else {
+ kernel_neon_begin();
+ chacha_doneon(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes, ctx->nrounds);
+ kernel_neon_end();
+ }
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ }
+
+ return err;
+}
+
+static int chacha_neon(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return chacha_neon_stream_xor(req, ctx, req->iv);
+}
+
+static int xchacha_neon(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct chacha_ctx subctx;
+ u32 state[16];
+ u8 real_iv[16];
+
+ chacha_init_generic(state, ctx->key, req->iv);
+ hchacha_block_arch(state, subctx.key, ctx->nrounds);
+ subctx.nrounds = ctx->nrounds;
+
+ memcpy(&real_iv[0], req->iv + 24, 8);
+ memcpy(&real_iv[8], req->iv + 16, 8);
+ return chacha_neon_stream_xor(req, &subctx, real_iv);
+}
+
+static struct skcipher_alg algs[] = {
+ {
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-neon",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = CHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .walksize = 5 * CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = chacha_neon,
+ .decrypt = chacha_neon,
+ }, {
+ .base.cra_name = "xchacha20",
+ .base.cra_driver_name = "xchacha20-neon",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .walksize = 5 * CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = xchacha_neon,
+ .decrypt = xchacha_neon,
+ }, {
+ .base.cra_name = "xchacha12",
+ .base.cra_driver_name = "xchacha12-neon",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .walksize = 5 * CHACHA_BLOCK_SIZE,
+ .setkey = chacha12_setkey,
+ .encrypt = xchacha_neon,
+ .decrypt = xchacha_neon,
+ }
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+ if (!cpu_have_named_feature(ASIMD))
+ return 0;
+
+ static_branch_enable(&have_neon);
+
+ return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ?
+ crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+ if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) && cpu_have_named_feature(ASIMD))
+ crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-neon");
diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c
deleted file mode 100644
index cbdb75d15cd0..000000000000
--- a/arch/arm64/crypto/chacha20-neon-glue.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
- *
- * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
-asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
-
-static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
- unsigned int bytes)
-{
- u8 buf[CHACHA20_BLOCK_SIZE];
-
- while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
- chacha20_4block_xor_neon(state, dst, src);
- bytes -= CHACHA20_BLOCK_SIZE * 4;
- src += CHACHA20_BLOCK_SIZE * 4;
- dst += CHACHA20_BLOCK_SIZE * 4;
- state[12] += 4;
- }
- while (bytes >= CHACHA20_BLOCK_SIZE) {
- chacha20_block_xor_neon(state, dst, src);
- bytes -= CHACHA20_BLOCK_SIZE;
- src += CHACHA20_BLOCK_SIZE;
- dst += CHACHA20_BLOCK_SIZE;
- state[12]++;
- }
- if (bytes) {
- memcpy(buf, src, bytes);
- chacha20_block_xor_neon(state, buf, buf);
- memcpy(dst, buf, bytes);
- }
-}
-
-static int chacha20_neon(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- u32 state[16];
- int err;
-
- if (!may_use_simd() || req->cryptlen <= CHACHA20_BLOCK_SIZE)
- return crypto_chacha20_crypt(req);
-
- err = skcipher_walk_virt(&walk, req, true);
-
- crypto_chacha20_init(state, ctx, walk.iv);
-
- kernel_neon_begin();
- while (walk.nbytes > 0) {
- unsigned int nbytes = walk.nbytes;
-
- if (nbytes < walk.total)
- nbytes = round_down(nbytes, walk.stride);
-
- chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes);
- err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
- }
- kernel_neon_end();
-
- return err;
-}
-
-static struct skcipher_alg alg = {
- .base.cra_name = "chacha20",
- .base.cra_driver_name = "chacha20-neon",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha20_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA20_KEY_SIZE,
- .max_keysize = CHACHA20_KEY_SIZE,
- .ivsize = CHACHA20_IV_SIZE,
- .chunksize = CHACHA20_BLOCK_SIZE,
- .walksize = 4 * CHACHA20_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
- .encrypt = chacha20_neon,
- .decrypt = chacha20_neon,
-};
-
-static int __init chacha20_simd_mod_init(void)
-{
- if (!(elf_hwcap & HWCAP_ASIMD))
- return -ENODEV;
-
- return crypto_register_skcipher(&alg);
-}
-
-static void __exit chacha20_simd_mod_fini(void)
-{
- crypto_unregister_skcipher(&alg);
-}
-
-module_init(chacha20_simd_mod_init);
-module_exit(chacha20_simd_mod_fini);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("chacha20");
diff --git a/arch/arm64/crypto/crc32-ce-core.S b/arch/arm64/crypto/crc32-ce-core.S
deleted file mode 100644
index 18f5a8442276..000000000000
--- a/arch/arm64/crypto/crc32-ce-core.S
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-/* GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see http://www.gnu.org/licenses
- *
- * Please visit http://www.xyratex.com/contact if you need additional
- * information or have any questions.
- *
- * GPL HEADER END
- */
-
-/*
- * Copyright 2012 Xyratex Technology Limited
- *
- * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
- * calculation.
- * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
- * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
- * at:
- * http://www.intel.com/products/processor/manuals/
- * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
- * Volume 2B: Instruction Set Reference, N-Z
- *
- * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
- * Alexander Boyko <Alexander_Boyko@xyratex.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .align 6
- .cpu generic+crypto+crc
-
-.Lcrc32_constants:
- /*
- * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
- * #define CONSTANT_R1 0x154442bd4LL
- *
- * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
- * #define CONSTANT_R2 0x1c6e41596LL
- */
- .octa 0x00000001c6e415960000000154442bd4
-
- /*
- * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
- * #define CONSTANT_R3 0x1751997d0LL
- *
- * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
- * #define CONSTANT_R4 0x0ccaa009eLL
- */
- .octa 0x00000000ccaa009e00000001751997d0
-
- /*
- * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
- * #define CONSTANT_R5 0x163cd6124LL
- */
- .quad 0x0000000163cd6124
- .quad 0x00000000FFFFFFFF
-
- /*
- * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
- *
- * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
- * = 0x1F7011641LL
- * #define CONSTANT_RU 0x1F7011641LL
- */
- .octa 0x00000001F701164100000001DB710641
-
-.Lcrc32c_constants:
- .octa 0x000000009e4addf800000000740eef02
- .octa 0x000000014cd00bd600000000f20c0dfe
- .quad 0x00000000dd45aab8
- .quad 0x00000000FFFFFFFF
- .octa 0x00000000dea713f10000000105ec76f0
-
- vCONSTANT .req v0
- dCONSTANT .req d0
- qCONSTANT .req q0
-
- BUF .req x0
- LEN .req x1
- CRC .req x2
-
- vzr .req v9
-
- /**
- * Calculate crc32
- * BUF - buffer
- * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
- * CRC - initial crc32
- * return %eax crc32
- * uint crc32_pmull_le(unsigned char const *buffer,
- * size_t len, uint crc32)
- */
-ENTRY(crc32_pmull_le)
- adr x3, .Lcrc32_constants
- b 0f
-
-ENTRY(crc32c_pmull_le)
- adr x3, .Lcrc32c_constants
-
-0: bic LEN, LEN, #15
- ld1 {v1.16b-v4.16b}, [BUF], #0x40
- movi vzr.16b, #0
- fmov dCONSTANT, CRC
- eor v1.16b, v1.16b, vCONSTANT.16b
- sub LEN, LEN, #0x40
- cmp LEN, #0x40
- b.lt less_64
-
- ldr qCONSTANT, [x3]
-
-loop_64: /* 64 bytes Full cache line folding */
- sub LEN, LEN, #0x40
-
- pmull2 v5.1q, v1.2d, vCONSTANT.2d
- pmull2 v6.1q, v2.2d, vCONSTANT.2d
- pmull2 v7.1q, v3.2d, vCONSTANT.2d
- pmull2 v8.1q, v4.2d, vCONSTANT.2d
-
- pmull v1.1q, v1.1d, vCONSTANT.1d
- pmull v2.1q, v2.1d, vCONSTANT.1d
- pmull v3.1q, v3.1d, vCONSTANT.1d
- pmull v4.1q, v4.1d, vCONSTANT.1d
-
- eor v1.16b, v1.16b, v5.16b
- ld1 {v5.16b}, [BUF], #0x10
- eor v2.16b, v2.16b, v6.16b
- ld1 {v6.16b}, [BUF], #0x10
- eor v3.16b, v3.16b, v7.16b
- ld1 {v7.16b}, [BUF], #0x10
- eor v4.16b, v4.16b, v8.16b
- ld1 {v8.16b}, [BUF], #0x10
-
- eor v1.16b, v1.16b, v5.16b
- eor v2.16b, v2.16b, v6.16b
- eor v3.16b, v3.16b, v7.16b
- eor v4.16b, v4.16b, v8.16b
-
- cmp LEN, #0x40
- b.ge loop_64
-
-less_64: /* Folding cache line into 128bit */
- ldr qCONSTANT, [x3, #16]
-
- pmull2 v5.1q, v1.2d, vCONSTANT.2d
- pmull v1.1q, v1.1d, vCONSTANT.1d
- eor v1.16b, v1.16b, v5.16b
- eor v1.16b, v1.16b, v2.16b
-
- pmull2 v5.1q, v1.2d, vCONSTANT.2d
- pmull v1.1q, v1.1d, vCONSTANT.1d
- eor v1.16b, v1.16b, v5.16b
- eor v1.16b, v1.16b, v3.16b
-
- pmull2 v5.1q, v1.2d, vCONSTANT.2d
- pmull v1.1q, v1.1d, vCONSTANT.1d
- eor v1.16b, v1.16b, v5.16b
- eor v1.16b, v1.16b, v4.16b
-
- cbz LEN, fold_64
-
-loop_16: /* Folding rest buffer into 128bit */
- subs LEN, LEN, #0x10
-
- ld1 {v2.16b}, [BUF], #0x10
- pmull2 v5.1q, v1.2d, vCONSTANT.2d
- pmull v1.1q, v1.1d, vCONSTANT.1d
- eor v1.16b, v1.16b, v5.16b
- eor v1.16b, v1.16b, v2.16b
-
- b.ne loop_16
-
-fold_64:
- /* perform the last 64 bit fold, also adds 32 zeroes
- * to the input stream */
- ext v2.16b, v1.16b, v1.16b, #8
- pmull2 v2.1q, v2.2d, vCONSTANT.2d
- ext v1.16b, v1.16b, vzr.16b, #8
- eor v1.16b, v1.16b, v2.16b
-
- /* final 32-bit fold */
- ldr dCONSTANT, [x3, #32]
- ldr d3, [x3, #40]
-
- ext v2.16b, v1.16b, vzr.16b, #4
- and v1.16b, v1.16b, v3.16b
- pmull v1.1q, v1.1d, vCONSTANT.1d
- eor v1.16b, v1.16b, v2.16b
-
- /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
- ldr qCONSTANT, [x3, #48]
-
- and v2.16b, v1.16b, v3.16b
- ext v2.16b, vzr.16b, v2.16b, #8
- pmull2 v2.1q, v2.2d, vCONSTANT.2d
- and v2.16b, v2.16b, v3.16b
- pmull v2.1q, v2.1d, vCONSTANT.1d
- eor v1.16b, v1.16b, v2.16b
- mov w0, v1.s[1]
-
- ret
-ENDPROC(crc32_pmull_le)
-ENDPROC(crc32c_pmull_le)
-
- .macro __crc32, c
-0: subs x2, x2, #16
- b.mi 8f
- ldp x3, x4, [x1], #16
-CPU_BE( rev x3, x3 )
-CPU_BE( rev x4, x4 )
- crc32\c\()x w0, w0, x3
- crc32\c\()x w0, w0, x4
- b.ne 0b
- ret
-
-8: tbz x2, #3, 4f
- ldr x3, [x1], #8
-CPU_BE( rev x3, x3 )
- crc32\c\()x w0, w0, x3
-4: tbz x2, #2, 2f
- ldr w3, [x1], #4
-CPU_BE( rev w3, w3 )
- crc32\c\()w w0, w0, w3
-2: tbz x2, #1, 1f
- ldrh w3, [x1], #2
-CPU_BE( rev16 w3, w3 )
- crc32\c\()h w0, w0, w3
-1: tbz x2, #0, 0f
- ldrb w3, [x1]
- crc32\c\()b w0, w0, w3
-0: ret
- .endm
-
- .align 5
-ENTRY(crc32_armv8_le)
- __crc32
-ENDPROC(crc32_armv8_le)
-
- .align 5
-ENTRY(crc32c_armv8_le)
- __crc32 c
-ENDPROC(crc32c_armv8_le)
diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
deleted file mode 100644
index 624f4137918c..000000000000
--- a/arch/arm64/crypto/crc32-ce-glue.c
+++ /dev/null
@@ -1,242 +0,0 @@
-/*
- * Accelerated CRC32(C) using arm64 NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/cpufeature.h>
-#include <linux/crc32.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/hash.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <asm/unaligned.h>
-
-#define PMULL_MIN_LEN 64L /* minimum size of buffer
- * for crc32_pmull_le_16 */
-#define SCALE_F 16L /* size of NEON register */
-
-asmlinkage u32 crc32_pmull_le(const u8 buf[], u64 len, u32 init_crc);
-asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], size_t len);
-
-asmlinkage u32 crc32c_pmull_le(const u8 buf[], u64 len, u32 init_crc);
-asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], size_t len);
-
-static u32 (*fallback_crc32)(u32 init_crc, const u8 buf[], size_t len);
-static u32 (*fallback_crc32c)(u32 init_crc, const u8 buf[], size_t len);
-
-static int crc32_pmull_cra_init(struct crypto_tfm *tfm)
-{
- u32 *key = crypto_tfm_ctx(tfm);
-
- *key = 0;
- return 0;
-}
-
-static int crc32c_pmull_cra_init(struct crypto_tfm *tfm)
-{
- u32 *key = crypto_tfm_ctx(tfm);
-
- *key = ~0;
- return 0;
-}
-
-static int crc32_pmull_setkey(struct crypto_shash *hash, const u8 *key,
- unsigned int keylen)
-{
- u32 *mctx = crypto_shash_ctx(hash);
-
- if (keylen != sizeof(u32)) {
- crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
- }
- *mctx = le32_to_cpup((__le32 *)key);
- return 0;
-}
-
-static int crc32_pmull_init(struct shash_desc *desc)
-{
- u32 *mctx = crypto_shash_ctx(desc->tfm);
- u32 *crc = shash_desc_ctx(desc);
-
- *crc = *mctx;
- return 0;
-}
-
-static int crc32_update(struct shash_desc *desc, const u8 *data,
- unsigned int length)
-{
- u32 *crc = shash_desc_ctx(desc);
-
- *crc = crc32_armv8_le(*crc, data, length);
- return 0;
-}
-
-static int crc32c_update(struct shash_desc *desc, const u8 *data,
- unsigned int length)
-{
- u32 *crc = shash_desc_ctx(desc);
-
- *crc = crc32c_armv8_le(*crc, data, length);
- return 0;
-}
-
-static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
- unsigned int length)
-{
- u32 *crc = shash_desc_ctx(desc);
- unsigned int l;
-
- if ((u64)data % SCALE_F) {
- l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
-
- *crc = fallback_crc32(*crc, data, l);
-
- data += l;
- length -= l;
- }
-
- if (length >= PMULL_MIN_LEN && may_use_simd()) {
- l = round_down(length, SCALE_F);
-
- kernel_neon_begin();
- *crc = crc32_pmull_le(data, l, *crc);
- kernel_neon_end();
-
- data += l;
- length -= l;
- }
-
- if (length > 0)
- *crc = fallback_crc32(*crc, data, length);
-
- return 0;
-}
-
-static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data,
- unsigned int length)
-{
- u32 *crc = shash_desc_ctx(desc);
- unsigned int l;
-
- if ((u64)data % SCALE_F) {
- l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
-
- *crc = fallback_crc32c(*crc, data, l);
-
- data += l;
- length -= l;
- }
-
- if (length >= PMULL_MIN_LEN && may_use_simd()) {
- l = round_down(length, SCALE_F);
-
- kernel_neon_begin();
- *crc = crc32c_pmull_le(data, l, *crc);
- kernel_neon_end();
-
- data += l;
- length -= l;
- }
-
- if (length > 0) {
- *crc = fallback_crc32c(*crc, data, length);
- }
-
- return 0;
-}
-
-static int crc32_pmull_final(struct shash_desc *desc, u8 *out)
-{
- u32 *crc = shash_desc_ctx(desc);
-
- put_unaligned_le32(*crc, out);
- return 0;
-}
-
-static int crc32c_pmull_final(struct shash_desc *desc, u8 *out)
-{
- u32 *crc = shash_desc_ctx(desc);
-
- put_unaligned_le32(~*crc, out);
- return 0;
-}
-
-static struct shash_alg crc32_pmull_algs[] = { {
- .setkey = crc32_pmull_setkey,
- .init = crc32_pmull_init,
- .update = crc32_update,
- .final = crc32_pmull_final,
- .descsize = sizeof(u32),
- .digestsize = sizeof(u32),
-
- .base.cra_ctxsize = sizeof(u32),
- .base.cra_init = crc32_pmull_cra_init,
- .base.cra_name = "crc32",
- .base.cra_driver_name = "crc32-arm64-ce",
- .base.cra_priority = 200,
- .base.cra_blocksize = 1,
- .base.cra_module = THIS_MODULE,
-}, {
- .setkey = crc32_pmull_setkey,
- .init = crc32_pmull_init,
- .update = crc32c_update,
- .final = crc32c_pmull_final,
- .descsize = sizeof(u32),
- .digestsize = sizeof(u32),
-
- .base.cra_ctxsize = sizeof(u32),
- .base.cra_init = crc32c_pmull_cra_init,
- .base.cra_name = "crc32c",
- .base.cra_driver_name = "crc32c-arm64-ce",
- .base.cra_priority = 200,
- .base.cra_blocksize = 1,
- .base.cra_module = THIS_MODULE,
-} };
-
-static int __init crc32_pmull_mod_init(void)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
- crc32_pmull_algs[0].update = crc32_pmull_update;
- crc32_pmull_algs[1].update = crc32c_pmull_update;
-
- if (elf_hwcap & HWCAP_CRC32) {
- fallback_crc32 = crc32_armv8_le;
- fallback_crc32c = crc32c_armv8_le;
- } else {
- fallback_crc32 = crc32_le;
- fallback_crc32c = __crc32c_le;
- }
- } else if (!(elf_hwcap & HWCAP_CRC32)) {
- return -ENODEV;
- }
- return crypto_register_shashes(crc32_pmull_algs,
- ARRAY_SIZE(crc32_pmull_algs));
-}
-
-static void __exit crc32_pmull_mod_exit(void)
-{
- crypto_unregister_shashes(crc32_pmull_algs,
- ARRAY_SIZE(crc32_pmull_algs));
-}
-
-static const struct cpu_feature crc32_cpu_feature[] = {
- { cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { }
-};
-MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature);
-
-module_init(crc32_pmull_mod_init);
-module_exit(crc32_pmull_mod_exit);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
index d5b5a8c038c8..dce6dcebfca1 100644
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -2,12 +2,14 @@
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//
+// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
@@ -54,109 +56,176 @@
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
-// Function API:
-// UINT16 crc_t10dif_pcl(
-// UINT16 init_crc, //initial CRC value, 16 bits
-// const unsigned char *buf, //buffer pointer to calculate CRC on
-// UINT64 len //buffer length in bytes (64-bit data)
-// );
-//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
-//
#include <linux/linkage.h>
#include <asm/assembler.h>
.text
- .cpu generic+crypto
-
- arg1_low32 .req w0
- arg2 .req x1
- arg3 .req x2
+ .arch armv8-a+crypto
- vzr .req v13
+ init_crc .req w0
+ buf .req x1
+ len .req x2
+ fold_consts_ptr .req x3
-ENTRY(crc_t10dif_pmull)
- movi vzr.16b, #0 // init zero register
+ fold_consts .req v10
- // adjust the 16-bit initial_crc value, scale it to 32 bits
- lsl arg1_low32, arg1_low32, #16
+ ad .req v14
- // check if smaller than 256
- cmp arg3, #256
+ k00_16 .req v15
+ k32_48 .req v16
- // for sizes less than 128, we can't fold 64B at a time...
- b.lt _less_than_128
+ t3 .req v17
+ t4 .req v18
+ t5 .req v19
+ t6 .req v20
+ t7 .req v21
+ t8 .req v22
+ t9 .req v23
- // load the initial crc value
- // crc value does not need to be byte-reflected, but it needs
- // to be moved to the high part of the register.
- // because data will be byte-reflected and will align with
- // initial crc at correct place.
- movi v10.16b, #0
- mov v10.s[3], arg1_low32 // initial crc
+ perm1 .req v24
+ perm2 .req v25
+ perm3 .req v26
+ perm4 .req v27
- // receive the initial 64B data, xor the initial crc value
- ldp q0, q1, [arg2]
- ldp q2, q3, [arg2, #0x20]
- ldp q4, q5, [arg2, #0x40]
- ldp q6, q7, [arg2, #0x60]
- add arg2, arg2, #0x80
+ bd1 .req v28
+ bd2 .req v29
+ bd3 .req v30
+ bd4 .req v31
-CPU_LE( rev64 v0.16b, v0.16b )
-CPU_LE( rev64 v1.16b, v1.16b )
-CPU_LE( rev64 v2.16b, v2.16b )
-CPU_LE( rev64 v3.16b, v3.16b )
-CPU_LE( rev64 v4.16b, v4.16b )
-CPU_LE( rev64 v5.16b, v5.16b )
-CPU_LE( rev64 v6.16b, v6.16b )
-CPU_LE( rev64 v7.16b, v7.16b )
+ .macro __pmull_init_p64
+ .endm
-CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
-CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
-CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
-CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
-CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
-CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
-CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
-CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
+ .macro __pmull_pre_p64, bd
+ .endm
- // XOR the initial_crc value
- eor v0.16b, v0.16b, v10.16b
+ .macro __pmull_init_p8
+ // k00_16 := 0x0000000000000000_000000000000ffff
+ // k32_48 := 0x00000000ffffffff_0000ffffffffffff
+ movi k32_48.2d, #0xffffffff
+ mov k32_48.h[2], k32_48.h[0]
+ ushr k00_16.2d, k32_48.2d, #32
+
+ // prepare the permutation vectors
+ mov_q x5, 0x080f0e0d0c0b0a09
+ movi perm4.8b, #8
+ dup perm1.2d, x5
+ eor perm1.16b, perm1.16b, perm4.16b
+ ushr perm2.2d, perm1.2d, #8
+ ushr perm3.2d, perm1.2d, #16
+ ushr perm4.2d, perm1.2d, #24
+ sli perm2.2d, perm1.2d, #56
+ sli perm3.2d, perm1.2d, #48
+ sli perm4.2d, perm1.2d, #40
+ .endm
- ldr q10, rk3 // xmm10 has rk3 and rk4
- // type of pmull instruction
- // will determine which constant to use
+ .macro __pmull_pre_p8, bd
+ tbl bd1.16b, {\bd\().16b}, perm1.16b
+ tbl bd2.16b, {\bd\().16b}, perm2.16b
+ tbl bd3.16b, {\bd\().16b}, perm3.16b
+ tbl bd4.16b, {\bd\().16b}, perm4.16b
+ .endm
- //
- // we subtract 256 instead of 128 to save one instruction from the loop
- //
- sub arg3, arg3, #256
+SYM_FUNC_START_LOCAL(__pmull_p8_core)
+.L__pmull_p8_core:
+ ext t4.8b, ad.8b, ad.8b, #1 // A1
+ ext t5.8b, ad.8b, ad.8b, #2 // A2
+ ext t6.8b, ad.8b, ad.8b, #3 // A3
+
+ pmull t4.8h, t4.8b, fold_consts.8b // F = A1*B
+ pmull t8.8h, ad.8b, bd1.8b // E = A*B1
+ pmull t5.8h, t5.8b, fold_consts.8b // H = A2*B
+ pmull t7.8h, ad.8b, bd2.8b // G = A*B2
+ pmull t6.8h, t6.8b, fold_consts.8b // J = A3*B
+ pmull t9.8h, ad.8b, bd3.8b // I = A*B3
+ pmull t3.8h, ad.8b, bd4.8b // K = A*B4
+ b 0f
+
+.L__pmull_p8_core2:
+ tbl t4.16b, {ad.16b}, perm1.16b // A1
+ tbl t5.16b, {ad.16b}, perm2.16b // A2
+ tbl t6.16b, {ad.16b}, perm3.16b // A3
+
+ pmull2 t4.8h, t4.16b, fold_consts.16b // F = A1*B
+ pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1
+ pmull2 t5.8h, t5.16b, fold_consts.16b // H = A2*B
+ pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2
+ pmull2 t6.8h, t6.16b, fold_consts.16b // J = A3*B
+ pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3
+ pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4
+
+0: eor t4.16b, t4.16b, t8.16b // L = E + F
+ eor t5.16b, t5.16b, t7.16b // M = G + H
+ eor t6.16b, t6.16b, t9.16b // N = I + J
+
+ uzp1 t8.2d, t4.2d, t5.2d
+ uzp2 t4.2d, t4.2d, t5.2d
+ uzp1 t7.2d, t6.2d, t3.2d
+ uzp2 t6.2d, t6.2d, t3.2d
+
+ // t4 = (L) (P0 + P1) << 8
+ // t5 = (M) (P2 + P3) << 16
+ eor t8.16b, t8.16b, t4.16b
+ and t4.16b, t4.16b, k32_48.16b
+
+ // t6 = (N) (P4 + P5) << 24
+ // t7 = (K) (P6 + P7) << 32
+ eor t7.16b, t7.16b, t6.16b
+ and t6.16b, t6.16b, k00_16.16b
+
+ eor t8.16b, t8.16b, t4.16b
+ eor t7.16b, t7.16b, t6.16b
+
+ zip2 t5.2d, t8.2d, t4.2d
+ zip1 t4.2d, t8.2d, t4.2d
+ zip2 t3.2d, t7.2d, t6.2d
+ zip1 t6.2d, t7.2d, t6.2d
+
+ ext t4.16b, t4.16b, t4.16b, #15
+ ext t5.16b, t5.16b, t5.16b, #14
+ ext t6.16b, t6.16b, t6.16b, #13
+ ext t3.16b, t3.16b, t3.16b, #12
+
+ eor t4.16b, t4.16b, t5.16b
+ eor t6.16b, t6.16b, t3.16b
+ ret
+SYM_FUNC_END(__pmull_p8_core)
- // at this section of the code, there is 64*x+y (0<=y<64) bytes of
- // buffer. The _fold_64_B_loop will fold 64B at a time
- // until we have 64+y Bytes of buffer
+ .macro __pmull_p8, rq, ad, bd, i
+ .ifnc \bd, fold_consts
+ .err
+ .endif
+ mov ad.16b, \ad\().16b
+ .ifb \i
+ pmull \rq\().8h, \ad\().8b, \bd\().8b // D = A*B
+ .else
+ pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B
+ .endif
+ bl .L__pmull_p8_core\i
- // fold 64B at a time. This section of the code folds 4 vector
- // registers in parallel
-_fold_64_B_loop:
+ eor \rq\().16b, \rq\().16b, t4.16b
+ eor \rq\().16b, \rq\().16b, t6.16b
+ .endm
- .macro fold64, reg1, reg2
- ldp q11, q12, [arg2], #0x20
+ // Fold reg1, reg2 into the next 32 data bytes, storing the result back
+ // into reg1, reg2.
+ .macro fold_32_bytes, p, reg1, reg2
+ ldp q11, q12, [buf], #0x20
- pmull2 v8.1q, \reg1\().2d, v10.2d
- pmull \reg1\().1q, \reg1\().1d, v10.1d
+ __pmull_\p v8, \reg1, fold_consts, 2
+ __pmull_\p \reg1, \reg1, fold_consts
CPU_LE( rev64 v11.16b, v11.16b )
CPU_LE( rev64 v12.16b, v12.16b )
- pmull2 v9.1q, \reg2\().2d, v10.2d
- pmull \reg2\().1q, \reg2\().1d, v10.1d
+ __pmull_\p v9, \reg2, fold_consts, 2
+ __pmull_\p \reg2, \reg2, fold_consts
CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
@@ -167,225 +236,279 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
eor \reg2\().16b, \reg2\().16b, v12.16b
.endm
- fold64 v0, v1
- fold64 v2, v3
- fold64 v4, v5
- fold64 v6, v7
-
- subs arg3, arg3, #128
+ // Fold src_reg into dst_reg, optionally loading the next fold constants
+ .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts
+ __pmull_\p v8, \src_reg, fold_consts
+ __pmull_\p \src_reg, \src_reg, fold_consts, 2
+ .ifnb \load_next_consts
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+ __pmull_pre_\p fold_consts
+ .endif
+ eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
+ eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
+ .endm
- // check if there is another 64B in the buffer to be able to fold
- b.ge _fold_64_B_loop
+ .macro __pmull_p64, rd, rn, rm, n
+ .ifb \n
+ pmull \rd\().1q, \rn\().1d, \rm\().1d
+ .else
+ pmull2 \rd\().1q, \rn\().2d, \rm\().2d
+ .endif
+ .endm
- // at this point, the buffer pointer is pointing at the last y Bytes
- // of the buffer the 64B of folded data is in 4 of the vector
- // registers: v0, v1, v2, v3
+ .macro crc_t10dif_pmull, p
+ __pmull_init_\p
- // fold the 8 vector registers to 1 vector register with different
- // constants
+ // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+ cmp len, #256
+ b.lt .Lless_than_256_bytes_\@
- ldr q10, rk9
+ adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts
- .macro fold16, reg, rk
- pmull v8.1q, \reg\().1d, v10.1d
- pmull2 \reg\().1q, \reg\().2d, v10.2d
- .ifnb \rk
- ldr q10, \rk
- .endif
- eor v7.16b, v7.16b, v8.16b
- eor v7.16b, v7.16b, \reg\().16b
- .endm
+ // Load the first 128 data bytes. Byte swapping is necessary to make
+ // the bit order match the polynomial coefficient order.
+ ldp q0, q1, [buf]
+ ldp q2, q3, [buf, #0x20]
+ ldp q4, q5, [buf, #0x40]
+ ldp q6, q7, [buf, #0x60]
+ add buf, buf, #0x80
+CPU_LE( rev64 v0.16b, v0.16b )
+CPU_LE( rev64 v1.16b, v1.16b )
+CPU_LE( rev64 v2.16b, v2.16b )
+CPU_LE( rev64 v3.16b, v3.16b )
+CPU_LE( rev64 v4.16b, v4.16b )
+CPU_LE( rev64 v5.16b, v5.16b )
+CPU_LE( rev64 v6.16b, v6.16b )
+CPU_LE( rev64 v7.16b, v7.16b )
+CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
+CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
+CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
+CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
+CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
+CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
+CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
+CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
- fold16 v0, rk11
- fold16 v1, rk13
- fold16 v2, rk15
- fold16 v3, rk17
- fold16 v4, rk19
- fold16 v5, rk1
- fold16 v6
-
- // instead of 64, we add 48 to the loop counter to save 1 instruction
- // from the loop instead of a cmp instruction, we use the negative
- // flag with the jl instruction
- adds arg3, arg3, #(128-16)
- b.lt _final_reduction_for_128
-
- // now we have 16+y bytes left to reduce. 16 Bytes is in register v7
- // and the rest is in memory. We can fold 16 bytes at a time if y>=16
- // continue folding 16B at a time
-
-_16B_reduction_loop:
- pmull v8.1q, v7.1d, v10.1d
- pmull2 v7.1q, v7.2d, v10.2d
+ // XOR the first 16 data *bits* with the initial CRC value.
+ movi v8.16b, #0
+ mov v8.h[7], init_crc
+ eor v0.16b, v0.16b, v8.16b
+
+ // Load the constants for folding across 128 bytes.
+ ld1 {fold_consts.2d}, [fold_consts_ptr]
+ __pmull_pre_\p fold_consts
+
+ // Subtract 128 for the 128 data bytes just consumed. Subtract another
+ // 128 to simplify the termination condition of the following loop.
+ sub len, len, #256
+
+ // While >= 128 data bytes remain (not counting v0-v7), fold the 128
+ // bytes v0-v7 into them, storing the result back into v0-v7.
+.Lfold_128_bytes_loop_\@:
+ fold_32_bytes \p, v0, v1
+ fold_32_bytes \p, v2, v3
+ fold_32_bytes \p, v4, v5
+ fold_32_bytes \p, v6, v7
+
+ subs len, len, #128
+ b.ge .Lfold_128_bytes_loop_\@
+
+ // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
+
+ // Fold across 64 bytes.
+ add fold_consts_ptr, fold_consts_ptr, #16
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+ __pmull_pre_\p fold_consts
+ fold_16_bytes \p, v0, v4
+ fold_16_bytes \p, v1, v5
+ fold_16_bytes \p, v2, v6
+ fold_16_bytes \p, v3, v7, 1
+ // Fold across 32 bytes.
+ fold_16_bytes \p, v4, v6
+ fold_16_bytes \p, v5, v7, 1
+ // Fold across 16 bytes.
+ fold_16_bytes \p, v6, v7
+
+ // Add 128 to get the correct number of data bytes remaining in 0...127
+ // (not counting v7), following the previous extra subtraction by 128.
+ // Then subtract 16 to simplify the termination condition of the
+ // following loop.
+ adds len, len, #(128-16)
+
+ // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
+ // into them, storing the result back into v7.
+ b.lt .Lfold_16_bytes_loop_done_\@
+.Lfold_16_bytes_loop_\@:
+ __pmull_\p v8, v7, fold_consts
+ __pmull_\p v7, v7, fold_consts, 2
eor v7.16b, v7.16b, v8.16b
-
- ldr q0, [arg2], #16
+ ldr q0, [buf], #16
CPU_LE( rev64 v0.16b, v0.16b )
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
eor v7.16b, v7.16b, v0.16b
- subs arg3, arg3, #16
-
- // instead of a cmp instruction, we utilize the flags with the
- // jge instruction equivalent of: cmp arg3, 16-16
- // check if there is any more 16B in the buffer to be able to fold
- b.ge _16B_reduction_loop
-
- // now we have 16+z bytes left to reduce, where 0<= z < 16.
- // first, we reduce the data in the xmm7 register
-
-_final_reduction_for_128:
- // check if any more data to fold. If not, compute the CRC of
- // the final 128 bits
- adds arg3, arg3, #16
- b.eq _128_done
-
- // here we are getting data that is less than 16 bytes.
- // since we know that there was data before the pointer, we can
- // offset the input pointer before the actual point, to receive
- // exactly 16 bytes. after that the registers need to be adjusted.
-_get_last_two_regs:
- add arg2, arg2, arg3
- ldr q1, [arg2, #-16]
-CPU_LE( rev64 v1.16b, v1.16b )
-CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
+ subs len, len, #16
+ b.ge .Lfold_16_bytes_loop_\@
+
+.Lfold_16_bytes_loop_done_\@:
+ // Add 16 to get the correct number of data bytes remaining in 0...15
+ // (not counting v7), following the previous extra subtraction by 16.
+ adds len, len, #16
+ b.eq .Lreduce_final_16_bytes_\@
+
+.Lhandle_partial_segment_\@:
+ // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+ // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To
+ // do this without needing a fold constant for each possible 'len',
+ // redivide the bytes into a first chunk of 'len' bytes and a second
+ // chunk of 16 bytes, then fold the first chunk into the second.
+
+ // v0 = last 16 original data bytes
+ add buf, buf, len
+ ldr q0, [buf, #-16]
+CPU_LE( rev64 v0.16b, v0.16b )
+CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
- // get rid of the extra data that was loaded before
- // load the shift constant
- adr x4, tbl_shf_table + 16
- sub x4, x4, arg3
- ld1 {v0.16b}, [x4]
+ // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
+ adr_l x4, .Lbyteshift_table + 16
+ sub x4, x4, len
+ ld1 {v2.16b}, [x4]
+ tbl v1.16b, {v7.16b}, v2.16b
- // shift v2 to the left by arg3 bytes
- tbl v2.16b, {v7.16b}, v0.16b
+ // v3 = first chunk: v7 right-shifted by '16-len' bytes.
+ movi v3.16b, #0x80
+ eor v2.16b, v2.16b, v3.16b
+ tbl v3.16b, {v7.16b}, v2.16b
- // shift v7 to the right by 16-arg3 bytes
- movi v9.16b, #0x80
- eor v0.16b, v0.16b, v9.16b
- tbl v7.16b, {v7.16b}, v0.16b
+ // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+ sshr v2.16b, v2.16b, #7
- // blend
- sshr v0.16b, v0.16b, #7 // convert to 8-bit mask
- bsl v0.16b, v2.16b, v1.16b
+ // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
+ // then '16-len' bytes from v1 (high-order bytes).
+ bsl v2.16b, v1.16b, v0.16b
- // fold 16 Bytes
- pmull v8.1q, v7.1d, v10.1d
- pmull2 v7.1q, v7.2d, v10.2d
- eor v7.16b, v7.16b, v8.16b
+ // Fold the first chunk into the second chunk, storing the result in v7.
+ __pmull_\p v0, v3, fold_consts
+ __pmull_\p v7, v3, fold_consts, 2
eor v7.16b, v7.16b, v0.16b
+ eor v7.16b, v7.16b, v2.16b
+
+.Lreduce_final_16_bytes_\@:
+ // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
+
+ movi v2.16b, #0 // init zero register
+
+ // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+ __pmull_pre_\p fold_consts
+
+ // Fold the high 64 bits into the low 64 bits, while also multiplying by
+ // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+ // whose low 48 bits are 0.
+ ext v0.16b, v2.16b, v7.16b, #8
+ __pmull_\p v7, v7, fold_consts, 2 // high bits * x^48 * (x^80 mod G(x))
+ eor v0.16b, v0.16b, v7.16b // + low bits * x^64
+
+ // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+ // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+ ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
+ mov v0.s[3], v2.s[0] // zero high 32 bits
+ __pmull_\p v1, v1, fold_consts // high 32 bits * x^48 * (x^48 mod G(x))
+ eor v0.16b, v0.16b, v1.16b // + low bits
+
+ // Load G(x) and floor(x^48 / G(x)).
+ ld1 {fold_consts.2d}, [fold_consts_ptr]
+ __pmull_pre_\p fold_consts
+
+ // Use Barrett reduction to compute the final CRC value.
+ __pmull_\p v1, v0, fold_consts, 2 // high 32 bits * floor(x^48 / G(x))
+ ushr v1.2d, v1.2d, #32 // /= x^32
+ __pmull_\p v1, v1, fold_consts // *= G(x)
+ ushr v0.2d, v0.2d, #48
+ eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
+ // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+ umov w0, v0.h[0]
+ .ifc \p, p8
+ ldp x29, x30, [sp], #16
+ .endif
+ ret
-_128_done:
- // compute crc of a 128-bit value
- ldr q10, rk5 // rk5 and rk6 in xmm10
-
- // 64b fold
- ext v0.16b, vzr.16b, v7.16b, #8
- mov v7.d[0], v7.d[1]
- pmull v7.1q, v7.1d, v10.1d
- eor v7.16b, v7.16b, v0.16b
+.Lless_than_256_bytes_\@:
+ // Checksumming a buffer of length 16...255 bytes
- // 32b fold
- ext v0.16b, v7.16b, vzr.16b, #4
- mov v7.s[3], vzr.s[0]
- pmull2 v0.1q, v0.2d, v10.2d
- eor v7.16b, v7.16b, v0.16b
+ adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts
- // barrett reduction
-_barrett:
- ldr q10, rk7
- mov v0.d[0], v7.d[1]
+ // Load the first 16 data bytes.
+ ldr q7, [buf], #0x10
+CPU_LE( rev64 v7.16b, v7.16b )
+CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
- pmull v0.1q, v0.1d, v10.1d
- ext v0.16b, vzr.16b, v0.16b, #12
- pmull2 v0.1q, v0.2d, v10.2d
- ext v0.16b, vzr.16b, v0.16b, #12
+ // XOR the first 16 data *bits* with the initial CRC value.
+ movi v0.16b, #0
+ mov v0.h[7], init_crc
eor v7.16b, v7.16b, v0.16b
- mov w0, v7.s[1]
-_cleanup:
- // scale the result back to 16 bits
- lsr x0, x0, #16
- ret
+ // Load the fold-across-16-bytes constants.
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+ __pmull_pre_\p fold_consts
-_less_than_128:
- cbz arg3, _cleanup
+ cmp len, #16
+ b.eq .Lreduce_final_16_bytes_\@ // len == 16
+ subs len, len, #32
+ b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
+ add len, len, #16
+ b .Lhandle_partial_segment_\@ // 17 <= len <= 31
+ .endm
- movi v0.16b, #0
- mov v0.s[3], arg1_low32 // get the initial crc value
+//
+// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p8)
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ crc_t10dif_pmull p8
+SYM_FUNC_END(crc_t10dif_pmull_p8)
- ldr q7, [arg2], #0x10
-CPU_LE( rev64 v7.16b, v7.16b )
-CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
- eor v7.16b, v7.16b, v0.16b // xor the initial crc value
-
- cmp arg3, #16
- b.eq _128_done // exactly 16 left
- b.lt _less_than_16_left
-
- ldr q10, rk1 // rk1 and rk2 in xmm10
-
- // update the counter. subtract 32 instead of 16 to save one
- // instruction from the loop
- subs arg3, arg3, #32
- b.ge _16B_reduction_loop
-
- add arg3, arg3, #16
- b _get_last_two_regs
-
-_less_than_16_left:
- // shl r9, 4
- adr x0, tbl_shf_table + 16
- sub x0, x0, arg3
- ld1 {v0.16b}, [x0]
- movi v9.16b, #0x80
- eor v0.16b, v0.16b, v9.16b
- tbl v7.16b, {v7.16b}, v0.16b
- b _128_done
-ENDPROC(crc_t10dif_pmull)
-
-// precomputed constants
-// these constants are precomputed from the poly:
-// 0x8bb70000 (0x8bb7 scaled to 32 bits)
+ .align 5
+//
+// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p64)
+ crc_t10dif_pmull p64
+SYM_FUNC_END(crc_t10dif_pmull_p64)
+
+ .section ".rodata", "a"
.align 4
-// Q = 0x18BB70000
-// rk1 = 2^(32*3) mod Q << 32
-// rk2 = 2^(32*5) mod Q << 32
-// rk3 = 2^(32*15) mod Q << 32
-// rk4 = 2^(32*17) mod Q << 32
-// rk5 = 2^(32*3) mod Q << 32
-// rk6 = 2^(32*2) mod Q << 32
-// rk7 = floor(2^64/Q)
-// rk8 = Q
-
-rk1: .octa 0x06df0000000000002d56000000000000
-rk3: .octa 0x7cf50000000000009d9d000000000000
-rk5: .octa 0x13680000000000002d56000000000000
-rk7: .octa 0x000000018bb7000000000001f65a57f8
-rk9: .octa 0xbfd6000000000000ceae000000000000
-rk11: .octa 0x713c0000000000001e16000000000000
-rk13: .octa 0x80a6000000000000f7f9000000000000
-rk15: .octa 0xe658000000000000044c000000000000
-rk17: .octa 0xa497000000000000ad18000000000000
-rk19: .octa 0xe7b50000000000006ee3000000000000
-
-tbl_shf_table:
-// use these values for shift constants for the tbl/tbx instruction
-// different alignments result in values as shown:
-// DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
-// DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
-// DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
-// DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
-// DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
-// DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
-// DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
-// DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
-// DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
-// DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
-// DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
-// DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
-// DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
-// DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
-// DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+ .quad 0x0000000000006123 // x^(8*128) mod G(x)
+ .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
+// .Lfold_across_64_bytes_consts:
+ .quad 0x0000000000001069 // x^(4*128) mod G(x)
+ .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
+// .Lfold_across_32_bytes_consts:
+ .quad 0x000000000000857d // x^(2*128) mod G(x)
+ .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+ .quad 0x000000000000a010 // x^(1*128) mod G(x)
+ .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
+// .Lfinal_fold_consts:
+ .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
+ .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+ .quad 0x0000000000018bb7 // G(x)
+ .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
+
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
.byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
.byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
index 96f0cae4a022..09eb1456aed4 100644
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
*
* Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/cpufeature.h>
@@ -16,13 +13,15 @@
#include <linux/string.h>
#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
#include <asm/neon.h>
#include <asm/simd.h>
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
-asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u64 len);
+asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
static int crct10dif_init(struct shash_desc *desc)
{
@@ -32,30 +31,51 @@ static int crct10dif_init(struct shash_desc *desc)
return 0;
}
-static int crct10dif_update(struct shash_desc *desc, const u8 *data,
+static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u16 *crc = shash_desc_ctx(desc);
- unsigned int l;
- if (unlikely((u64)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
- l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
- ((u64)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
+ if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
+ do {
+ unsigned int chunk = length;
- *crc = crc_t10dif_generic(*crc, data, l);
+ if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
+ chunk = SZ_4K;
- length -= l;
- data += l;
+ kernel_neon_begin();
+ *crc = crc_t10dif_pmull_p8(*crc, data, chunk);
+ kernel_neon_end();
+ data += chunk;
+ length -= chunk;
+ } while (length);
+ } else {
+ *crc = crc_t10dif_generic(*crc, data, length);
}
- if (length > 0) {
- if (may_use_simd()) {
+ return 0;
+}
+
+static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
+{
+ u16 *crc = shash_desc_ctx(desc);
+
+ if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
+ do {
+ unsigned int chunk = length;
+
+ if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
+ chunk = SZ_4K;
+
kernel_neon_begin();
- *crc = crc_t10dif_pmull(*crc, data, length);
+ *crc = crc_t10dif_pmull_p64(*crc, data, chunk);
kernel_neon_end();
- } else {
- *crc = crc_t10dif_generic(*crc, data, length);
- }
+ data += chunk;
+ length -= chunk;
+ } while (length);
+ } else {
+ *crc = crc_t10dif_generic(*crc, data, length);
}
return 0;
@@ -69,10 +89,22 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out)
return 0;
}
-static struct shash_alg crc_t10dif_alg = {
+static struct shash_alg crc_t10dif_alg[] = {{
+ .digestsize = CRC_T10DIF_DIGEST_SIZE,
+ .init = crct10dif_init,
+ .update = crct10dif_update_pmull_p8,
+ .final = crct10dif_final,
+ .descsize = CRC_T10DIF_DIGEST_SIZE,
+
+ .base.cra_name = "crct10dif",
+ .base.cra_driver_name = "crct10dif-arm64-neon",
+ .base.cra_priority = 100,
+ .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+}, {
.digestsize = CRC_T10DIF_DIGEST_SIZE,
.init = crct10dif_init,
- .update = crct10dif_update,
+ .update = crct10dif_update_pmull_p64,
.final = crct10dif_final,
.descsize = CRC_T10DIF_DIGEST_SIZE,
@@ -81,20 +113,31 @@ static struct shash_alg crc_t10dif_alg = {
.base.cra_priority = 200,
.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
-};
+}};
static int __init crc_t10dif_mod_init(void)
{
- return crypto_register_shash(&crc_t10dif_alg);
+ if (cpu_have_named_feature(PMULL))
+ return crypto_register_shashes(crc_t10dif_alg,
+ ARRAY_SIZE(crc_t10dif_alg));
+ else
+ /* only register the first array element */
+ return crypto_register_shash(crc_t10dif_alg);
}
static void __exit crc_t10dif_mod_exit(void)
{
- crypto_unregister_shash(&crc_t10dif_alg);
+ if (cpu_have_named_feature(PMULL))
+ crypto_unregister_shashes(crc_t10dif_alg,
+ ARRAY_SIZE(crc_t10dif_alg));
+ else
+ crypto_unregister_shash(crc_t10dif_alg);
}
-module_cpu_feature_match(PMULL, crc_t10dif_mod_init);
+module_cpu_feature_match(ASIMD, crc_t10dif_mod_init);
module_exit(crc_t10dif_mod_exit);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("crct10dif");
+MODULE_ALIAS_CRYPTO("crct10dif-arm64-ce");
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index 11ebf1ae248a..ebe5558929b7 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -1,14 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Accelerated GHASH implementation with ARMv8 PMULL instructions.
*
- * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
+ * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/assembler.h>
SHASH .req v0
@@ -16,8 +14,8 @@
T1 .req v2
T2 .req v3
MASK .req v4
- XL .req v5
- XM .req v6
+ XM .req v5
+ XL .req v6
XH .req v7
IN1 .req v7
@@ -46,6 +44,19 @@
ss3 .req v26
ss4 .req v27
+ XL2 .req v8
+ XM2 .req v9
+ XH2 .req v10
+ XL3 .req v11
+ XM3 .req v12
+ XH3 .req v13
+ TT3 .req v14
+ TT4 .req v15
+ HH .req v16
+ HH3 .req v17
+ HH4 .req v18
+ HH34 .req v19
+
.text
.arch armv8-a+crypto
@@ -134,11 +145,25 @@
.endm
.macro __pmull_pre_p64
+ add x8, x3, #16
+ ld1 {HH.2d-HH4.2d}, [x8]
+
+ trn1 SHASH2.2d, SHASH.2d, HH.2d
+ trn2 T1.2d, SHASH.2d, HH.2d
+ eor SHASH2.16b, SHASH2.16b, T1.16b
+
+ trn1 HH34.2d, HH3.2d, HH4.2d
+ trn2 T1.2d, HH3.2d, HH4.2d
+ eor HH34.16b, HH34.16b, T1.16b
+
movi MASK.16b, #0xe1
shl MASK.2d, MASK.2d, #57
.endm
.macro __pmull_pre_p8
+ ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
+ eor SHASH2.16b, SHASH2.16b, SHASH.16b
+
// k00_16 := 0x0000000000000000_000000000000ffff
// k32_48 := 0x00000000ffffffff_0000ffffffffffff
movi k32_48.2d, #0xffffffff
@@ -215,20 +240,86 @@
.macro __pmull_ghash, pn
ld1 {SHASH.2d}, [x3]
ld1 {XL.2d}, [x1]
- ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
- eor SHASH2.16b, SHASH2.16b, SHASH.16b
__pmull_pre_\pn
/* do the head block first, if supplied */
cbz x4, 0f
ld1 {T1.2d}, [x4]
- b 1f
+ mov x4, xzr
+ b 3f
+
+0: .ifc \pn, p64
+ tbnz w0, #0, 2f // skip until #blocks is a
+ tbnz w0, #1, 2f // round multiple of 4
+
+1: ld1 {XM3.16b-TT4.16b}, [x2], #64
+
+ sub w0, w0, #4
+
+ rev64 T1.16b, XM3.16b
+ rev64 T2.16b, XH3.16b
+ rev64 TT4.16b, TT4.16b
+ rev64 TT3.16b, TT3.16b
+
+ ext IN1.16b, TT4.16b, TT4.16b, #8
+ ext XL3.16b, TT3.16b, TT3.16b, #8
+
+ eor TT4.16b, TT4.16b, IN1.16b
+ pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1
+ pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0
+ pmull XM2.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0)
+
+ eor TT3.16b, TT3.16b, XL3.16b
+ pmull2 XH3.1q, HH.2d, XL3.2d // a1 * b1
+ pmull XL3.1q, HH.1d, XL3.1d // a0 * b0
+ pmull2 XM3.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0)
+
+ ext IN1.16b, T2.16b, T2.16b, #8
+ eor XL2.16b, XL2.16b, XL3.16b
+ eor XH2.16b, XH2.16b, XH3.16b
+ eor XM2.16b, XM2.16b, XM3.16b
+
+ eor T2.16b, T2.16b, IN1.16b
+ pmull2 XH3.1q, HH3.2d, IN1.2d // a1 * b1
+ pmull XL3.1q, HH3.1d, IN1.1d // a0 * b0
+ pmull XM3.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0)
+
+ eor XL2.16b, XL2.16b, XL3.16b
+ eor XH2.16b, XH2.16b, XH3.16b
+ eor XM2.16b, XM2.16b, XM3.16b
+
+ ext IN1.16b, T1.16b, T1.16b, #8
+ ext TT3.16b, XL.16b, XL.16b, #8
+ eor XL.16b, XL.16b, IN1.16b
+ eor T1.16b, T1.16b, TT3.16b
+
+ pmull2 XH.1q, HH4.2d, XL.2d // a1 * b1
+ eor T1.16b, T1.16b, XL.16b
+ pmull XL.1q, HH4.1d, XL.1d // a0 * b0
+ pmull2 XM.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0)
+
+ eor XL.16b, XL.16b, XL2.16b
+ eor XH.16b, XH.16b, XH2.16b
+ eor XM.16b, XM.16b, XM2.16b
+
+ eor T2.16b, XL.16b, XH.16b
+ ext T1.16b, XL.16b, XH.16b, #8
+ eor XM.16b, XM.16b, T2.16b
+
+ __pmull_reduce_p64
+
+ eor T2.16b, T2.16b, XH.16b
+ eor XL.16b, XL.16b, T2.16b
+
+ cbz w0, 5f
+ b 1b
+ .endif
-0: ld1 {T1.2d}, [x2], #16
+2: ld1 {T1.2d}, [x2], #16
sub w0, w0, #1
-1: /* multiply XL by SHASH in GF(2^128) */
+3: /* multiply XL by SHASH in GF(2^128) */
CPU_LE( rev64 T1.16b, T1.16b )
ext T2.16b, XL.16b, XL.16b, #8
@@ -241,7 +332,7 @@ CPU_LE( rev64 T1.16b, T1.16b )
__pmull_\pn XL, XL, SHASH // a0 * b0
__pmull_\pn XM, T1, SHASH2 // (a1 + a0)(b1 + b0)
- eor T2.16b, XL.16b, XH.16b
+4: eor T2.16b, XL.16b, XH.16b
ext T1.16b, XL.16b, XH.16b, #8
eor XM.16b, XM.16b, T2.16b
@@ -252,7 +343,7 @@ CPU_LE( rev64 T1.16b, T1.16b )
cbnz w0, 0b
- st1 {XL.2d}, [x1]
+5: st1 {XL.2d}, [x1]
ret
.endm
@@ -260,27 +351,45 @@ CPU_LE( rev64 T1.16b, T1.16b )
* void pmull_ghash_update(int blocks, u64 dg[], const char *src,
* struct ghash_key const *k, const char *head)
*/
-ENTRY(pmull_ghash_update_p64)
+SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
__pmull_ghash p64
-ENDPROC(pmull_ghash_update_p64)
+SYM_FUNC_END(pmull_ghash_update_p64)
-ENTRY(pmull_ghash_update_p8)
+SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
__pmull_ghash p8
-ENDPROC(pmull_ghash_update_p8)
-
- KS .req v8
- CTR .req v9
- INP .req v10
-
- .macro load_round_keys, rounds, rk
- cmp \rounds, #12
- blo 2222f /* 128 bits */
- beq 1111f /* 192 bits */
- ld1 {v17.4s-v18.4s}, [\rk], #32
-1111: ld1 {v19.4s-v20.4s}, [\rk], #32
-2222: ld1 {v21.4s-v24.4s}, [\rk], #64
- ld1 {v25.4s-v28.4s}, [\rk], #64
- ld1 {v29.4s-v31.4s}, [\rk]
+SYM_FUNC_END(pmull_ghash_update_p8)
+
+ KS0 .req v8
+ KS1 .req v9
+ KS2 .req v10
+ KS3 .req v11
+
+ INP0 .req v21
+ INP1 .req v22
+ INP2 .req v23
+ INP3 .req v24
+
+ K0 .req v25
+ K1 .req v26
+ K2 .req v27
+ K3 .req v28
+ K4 .req v12
+ K5 .req v13
+ K6 .req v4
+ K7 .req v5
+ K8 .req v14
+ K9 .req v15
+ KK .req v29
+ KL .req v30
+ KM .req v31
+
+ .macro load_round_keys, rounds, rk, tmp
+ add \tmp, \rk, #64
+ ld1 {K0.4s-K3.4s}, [\rk]
+ ld1 {K4.4s-K5.4s}, [\tmp]
+ add \tmp, \rk, \rounds, lsl #4
+ sub \tmp, \tmp, #32
+ ld1 {KK.4s-KM.4s}, [\tmp]
.endm
.macro enc_round, state, key
@@ -288,157 +397,382 @@ ENDPROC(pmull_ghash_update_p8)
aesmc \state\().16b, \state\().16b
.endm
- .macro enc_block, state, rounds
- cmp \rounds, #12
- b.lo 2222f /* 128 bits */
- b.eq 1111f /* 192 bits */
- enc_round \state, v17
- enc_round \state, v18
-1111: enc_round \state, v19
- enc_round \state, v20
-2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ .macro enc_qround, s0, s1, s2, s3, key
+ enc_round \s0, \key
+ enc_round \s1, \key
+ enc_round \s2, \key
+ enc_round \s3, \key
+ .endm
+
+ .macro enc_block, state, rounds, rk, tmp
+ add \tmp, \rk, #96
+ ld1 {K6.4s-K7.4s}, [\tmp], #32
+ .irp key, K0, K1, K2, K3, K4 K5
enc_round \state, \key
.endr
- aese \state\().16b, v30.16b
- eor \state\().16b, \state\().16b, v31.16b
+
+ tbnz \rounds, #2, .Lnot128_\@
+.Lout256_\@:
+ enc_round \state, K6
+ enc_round \state, K7
+
+.Lout192_\@:
+ enc_round \state, KK
+ aese \state\().16b, KL.16b
+ eor \state\().16b, \state\().16b, KM.16b
+
+ .subsection 1
+.Lnot128_\@:
+ ld1 {K8.4s-K9.4s}, [\tmp], #32
+ enc_round \state, K6
+ enc_round \state, K7
+ ld1 {K6.4s-K7.4s}, [\tmp]
+ enc_round \state, K8
+ enc_round \state, K9
+ tbz \rounds, #1, .Lout192_\@
+ b .Lout256_\@
+ .previous
.endm
+ .align 6
.macro pmull_gcm_do_crypt, enc
- ld1 {SHASH.2d}, [x4]
- ld1 {XL.2d}, [x1]
- ldr x8, [x5, #8] // load lower counter
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ str x19, [sp, #24]
- movi MASK.16b, #0xe1
- ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
-CPU_LE( rev x8, x8 )
- shl MASK.2d, MASK.2d, #57
- eor SHASH2.16b, SHASH2.16b, SHASH.16b
+ load_round_keys x7, x6, x8
- .if \enc == 1
- ld1 {KS.16b}, [x7]
+ ld1 {SHASH.2d}, [x3], #16
+ ld1 {HH.2d-HH4.2d}, [x3]
+
+ trn1 SHASH2.2d, SHASH.2d, HH.2d
+ trn2 T1.2d, SHASH.2d, HH.2d
+ eor SHASH2.16b, SHASH2.16b, T1.16b
+
+ trn1 HH34.2d, HH3.2d, HH4.2d
+ trn2 T1.2d, HH3.2d, HH4.2d
+ eor HH34.16b, HH34.16b, T1.16b
+
+ ld1 {XL.2d}, [x4]
+
+ cbz x0, 3f // tag only?
+
+ ldr w8, [x5, #12] // load lower counter
+CPU_LE( rev w8, w8 )
+
+0: mov w9, #4 // max blocks per round
+ add x10, x0, #0xf
+ lsr x10, x10, #4 // remaining blocks
+
+ subs x0, x0, #64
+ csel w9, w10, w9, mi
+ add w8, w8, w9
+
+ bmi 1f
+ ld1 {INP0.16b-INP3.16b}, [x2], #64
+ .subsection 1
+ /*
+ * Populate the four input registers right to left with up to 63 bytes
+ * of data, using overlapping loads to avoid branches.
+ *
+ * INP0 INP1 INP2 INP3
+ * 1 byte | | | |x |
+ * 16 bytes | | | |xxxxxxxx|
+ * 17 bytes | | |xxxxxxxx|x |
+ * 47 bytes | |xxxxxxxx|xxxxxxxx|xxxxxxx |
+ * etc etc
+ *
+ * Note that this code may read up to 15 bytes before the start of
+ * the input. It is up to the calling code to ensure this is safe if
+ * this happens in the first iteration of the loop (i.e., when the
+ * input size is < 16 bytes)
+ */
+1: mov x15, #16
+ ands x19, x0, #0xf
+ csel x19, x19, x15, ne
+ adr_l x17, .Lpermute_table + 16
+
+ sub x11, x15, x19
+ add x12, x17, x11
+ sub x17, x17, x11
+ ld1 {T1.16b}, [x12]
+ sub x10, x1, x11
+ sub x11, x2, x11
+
+ cmp x0, #-16
+ csel x14, x15, xzr, gt
+ cmp x0, #-32
+ csel x15, x15, xzr, gt
+ cmp x0, #-48
+ csel x16, x19, xzr, gt
+ csel x1, x1, x10, gt
+ csel x2, x2, x11, gt
+
+ ld1 {INP0.16b}, [x2], x14
+ ld1 {INP1.16b}, [x2], x15
+ ld1 {INP2.16b}, [x2], x16
+ ld1 {INP3.16b}, [x2]
+ tbl INP3.16b, {INP3.16b}, T1.16b
+ b 2f
+ .previous
+
+2: .if \enc == 0
+ bl pmull_gcm_ghash_4x
.endif
-0: ld1 {CTR.8b}, [x5] // load upper counter
- ld1 {INP.16b}, [x3], #16
- rev x9, x8
- add x8, x8, #1
- sub w0, w0, #1
- ins CTR.d[1], x9 // set lower counter
+ bl pmull_gcm_enc_4x
+ tbnz x0, #63, 6f
+ st1 {INP0.16b-INP3.16b}, [x1], #64
.if \enc == 1
- eor INP.16b, INP.16b, KS.16b // encrypt input
- st1 {INP.16b}, [x2], #16
+ bl pmull_gcm_ghash_4x
.endif
+ bne 0b
- rev64 T1.16b, INP.16b
+3: ldp x19, x10, [sp, #24]
+ cbz x10, 5f // output tag?
- cmp w6, #12
- b.ge 2f // AES-192/256?
+ ld1 {INP3.16b}, [x10] // load lengths[]
+ mov w9, #1
+ bl pmull_gcm_ghash_4x
-1: enc_round CTR, v21
+ mov w11, #(0x1 << 24) // BE '1U'
+ ld1 {KS0.16b}, [x5]
+ mov KS0.s[3], w11
- ext T2.16b, XL.16b, XL.16b, #8
- ext IN1.16b, T1.16b, T1.16b, #8
+ enc_block KS0, x7, x6, x12
- enc_round CTR, v22
+ ext XL.16b, XL.16b, XL.16b, #8
+ rev64 XL.16b, XL.16b
+ eor XL.16b, XL.16b, KS0.16b
- eor T1.16b, T1.16b, T2.16b
- eor XL.16b, XL.16b, IN1.16b
+ .if \enc == 1
+ st1 {XL.16b}, [x10] // store tag
+ .else
+ ldp x11, x12, [sp, #40] // load tag pointer and authsize
+ adr_l x17, .Lpermute_table
+ ld1 {KS0.16b}, [x11] // load supplied tag
+ add x17, x17, x12
+ ld1 {KS1.16b}, [x17] // load permute vector
+
+ cmeq XL.16b, XL.16b, KS0.16b // compare tags
+ mvn XL.16b, XL.16b // -1 for fail, 0 for pass
+ tbl XL.16b, {XL.16b}, KS1.16b // keep authsize bytes only
+ sminv b0, XL.16b // signed minimum across XL
+ smov w0, v0.b[0] // return b0
+ .endif
- enc_round CTR, v23
+4: ldp x29, x30, [sp], #32
+ ret
- pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
- eor T1.16b, T1.16b, XL.16b
+5:
+CPU_LE( rev w8, w8 )
+ str w8, [x5, #12] // store lower counter
+ st1 {XL.2d}, [x4]
+ b 4b
+
+6: ld1 {T1.16b-T2.16b}, [x17], #32 // permute vectors
+ sub x17, x17, x19, lsl #1
+
+ cmp w9, #1
+ beq 7f
+ .subsection 1
+7: ld1 {INP2.16b}, [x1]
+ tbx INP2.16b, {INP3.16b}, T1.16b
+ mov INP3.16b, INP2.16b
+ b 8f
+ .previous
+
+ st1 {INP0.16b}, [x1], x14
+ st1 {INP1.16b}, [x1], x15
+ st1 {INP2.16b}, [x1], x16
+ tbl INP3.16b, {INP3.16b}, T1.16b
+ tbx INP3.16b, {INP2.16b}, T2.16b
+8: st1 {INP3.16b}, [x1]
- enc_round CTR, v24
+ .if \enc == 1
+ ld1 {T1.16b}, [x17]
+ tbl INP3.16b, {INP3.16b}, T1.16b // clear non-data bits
+ bl pmull_gcm_ghash_4x
+ .endif
+ b 3b
+ .endm
- pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
- pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
+ /*
+ * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
+ * struct ghash_key const *k, u64 dg[], u8 ctr[],
+ * int rounds, u8 tag)
+ */
+SYM_FUNC_START(pmull_gcm_encrypt)
+ pmull_gcm_do_crypt 1
+SYM_FUNC_END(pmull_gcm_encrypt)
- enc_round CTR, v25
+ /*
+ * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
+ * struct ghash_key const *k, u64 dg[], u8 ctr[],
+ * int rounds, u8 tag)
+ */
+SYM_FUNC_START(pmull_gcm_decrypt)
+ pmull_gcm_do_crypt 0
+SYM_FUNC_END(pmull_gcm_decrypt)
- ext T1.16b, XL.16b, XH.16b, #8
- eor T2.16b, XL.16b, XH.16b
- eor XM.16b, XM.16b, T1.16b
+SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
+ movi MASK.16b, #0xe1
+ shl MASK.2d, MASK.2d, #57
- enc_round CTR, v26
+ rev64 T1.16b, INP0.16b
+ rev64 T2.16b, INP1.16b
+ rev64 TT3.16b, INP2.16b
+ rev64 TT4.16b, INP3.16b
- eor XM.16b, XM.16b, T2.16b
- pmull T2.1q, XL.1d, MASK.1d
+ ext XL.16b, XL.16b, XL.16b, #8
- enc_round CTR, v27
+ tbz w9, #2, 0f // <4 blocks?
+ .subsection 1
+0: movi XH2.16b, #0
+ movi XM2.16b, #0
+ movi XL2.16b, #0
- mov XH.d[0], XM.d[1]
- mov XM.d[1], XL.d[0]
+ tbz w9, #0, 1f // 2 blocks?
+ tbz w9, #1, 2f // 1 block?
- enc_round CTR, v28
+ eor T2.16b, T2.16b, XL.16b
+ ext T1.16b, T2.16b, T2.16b, #8
+ b .Lgh3
- eor XL.16b, XM.16b, T2.16b
+1: eor TT3.16b, TT3.16b, XL.16b
+ ext T2.16b, TT3.16b, TT3.16b, #8
+ b .Lgh2
- enc_round CTR, v29
+2: eor TT4.16b, TT4.16b, XL.16b
+ ext IN1.16b, TT4.16b, TT4.16b, #8
+ b .Lgh1
+ .previous
- ext T2.16b, XL.16b, XL.16b, #8
+ eor T1.16b, T1.16b, XL.16b
+ ext IN1.16b, T1.16b, T1.16b, #8
- aese CTR.16b, v30.16b
+ pmull2 XH2.1q, HH4.2d, IN1.2d // a1 * b1
+ eor T1.16b, T1.16b, IN1.16b
+ pmull XL2.1q, HH4.1d, IN1.1d // a0 * b0
+ pmull2 XM2.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0)
+
+ ext T1.16b, T2.16b, T2.16b, #8
+.Lgh3: eor T2.16b, T2.16b, T1.16b
+ pmull2 XH.1q, HH3.2d, T1.2d // a1 * b1
+ pmull XL.1q, HH3.1d, T1.1d // a0 * b0
+ pmull XM.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0)
+
+ eor XH2.16b, XH2.16b, XH.16b
+ eor XL2.16b, XL2.16b, XL.16b
+ eor XM2.16b, XM2.16b, XM.16b
+
+ ext T2.16b, TT3.16b, TT3.16b, #8
+.Lgh2: eor TT3.16b, TT3.16b, T2.16b
+ pmull2 XH.1q, HH.2d, T2.2d // a1 * b1
+ pmull XL.1q, HH.1d, T2.1d // a0 * b0
+ pmull2 XM.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0)
+
+ eor XH2.16b, XH2.16b, XH.16b
+ eor XL2.16b, XL2.16b, XL.16b
+ eor XM2.16b, XM2.16b, XM.16b
+
+ ext IN1.16b, TT4.16b, TT4.16b, #8
+.Lgh1: eor TT4.16b, TT4.16b, IN1.16b
+ pmull XL.1q, SHASH.1d, IN1.1d // a0 * b0
+ pmull2 XH.1q, SHASH.2d, IN1.2d // a1 * b1
+ pmull XM.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0)
+
+ eor XH.16b, XH.16b, XH2.16b
+ eor XL.16b, XL.16b, XL2.16b
+ eor XM.16b, XM.16b, XM2.16b
- pmull XL.1q, XL.1d, MASK.1d
- eor T2.16b, T2.16b, XH.16b
+ eor T2.16b, XL.16b, XH.16b
+ ext T1.16b, XL.16b, XH.16b, #8
+ eor XM.16b, XM.16b, T2.16b
- eor KS.16b, CTR.16b, v31.16b
+ __pmull_reduce_p64
+ eor T2.16b, T2.16b, XH.16b
eor XL.16b, XL.16b, T2.16b
- .if \enc == 0
- eor INP.16b, INP.16b, KS.16b
- st1 {INP.16b}, [x2], #16
- .endif
-
- cbnz w0, 0b
+ ret
+SYM_FUNC_END(pmull_gcm_ghash_4x)
+
+SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
+ ld1 {KS0.16b}, [x5] // load upper counter
+ sub w10, w8, #4
+ sub w11, w8, #3
+ sub w12, w8, #2
+ sub w13, w8, #1
+ rev w10, w10
+ rev w11, w11
+ rev w12, w12
+ rev w13, w13
+ mov KS1.16b, KS0.16b
+ mov KS2.16b, KS0.16b
+ mov KS3.16b, KS0.16b
+ ins KS0.s[3], w10 // set lower counter
+ ins KS1.s[3], w11
+ ins KS2.s[3], w12
+ ins KS3.s[3], w13
+
+ add x10, x6, #96 // round key pointer
+ ld1 {K6.4s-K7.4s}, [x10], #32
+ .irp key, K0, K1, K2, K3, K4, K5
+ enc_qround KS0, KS1, KS2, KS3, \key
+ .endr
-CPU_LE( rev x8, x8 )
- st1 {XL.2d}, [x1]
- str x8, [x5, #8] // store lower counter
+ tbnz x7, #2, .Lnot128
+ .subsection 1
+.Lnot128:
+ ld1 {K8.4s-K9.4s}, [x10], #32
+ .irp key, K6, K7
+ enc_qround KS0, KS1, KS2, KS3, \key
+ .endr
+ ld1 {K6.4s-K7.4s}, [x10]
+ .irp key, K8, K9
+ enc_qround KS0, KS1, KS2, KS3, \key
+ .endr
+ tbz x7, #1, .Lout192
+ b .Lout256
+ .previous
- .if \enc == 1
- st1 {KS.16b}, [x7]
- .endif
+.Lout256:
+ .irp key, K6, K7
+ enc_qround KS0, KS1, KS2, KS3, \key
+ .endr
- ret
+.Lout192:
+ enc_qround KS0, KS1, KS2, KS3, KK
-2: b.eq 3f // AES-192?
- enc_round CTR, v17
- enc_round CTR, v18
-3: enc_round CTR, v19
- enc_round CTR, v20
- b 1b
- .endm
+ aese KS0.16b, KL.16b
+ aese KS1.16b, KL.16b
+ aese KS2.16b, KL.16b
+ aese KS3.16b, KL.16b
- /*
- * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
- * struct ghash_key const *k, u8 ctr[],
- * int rounds, u8 ks[])
- */
-ENTRY(pmull_gcm_encrypt)
- pmull_gcm_do_crypt 1
-ENDPROC(pmull_gcm_encrypt)
+ eor KS0.16b, KS0.16b, KM.16b
+ eor KS1.16b, KS1.16b, KM.16b
+ eor KS2.16b, KS2.16b, KM.16b
+ eor KS3.16b, KS3.16b, KM.16b
- /*
- * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
- * struct ghash_key const *k, u8 ctr[],
- * int rounds)
- */
-ENTRY(pmull_gcm_decrypt)
- pmull_gcm_do_crypt 0
-ENDPROC(pmull_gcm_decrypt)
+ eor INP0.16b, INP0.16b, KS0.16b
+ eor INP1.16b, INP1.16b, KS1.16b
+ eor INP2.16b, INP2.16b, KS2.16b
+ eor INP3.16b, INP3.16b, KS3.16b
- /*
- * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
- */
-ENTRY(pmull_gcm_encrypt_block)
- cbz x2, 0f
- load_round_keys w3, x2
-0: ld1 {v0.16b}, [x1]
- enc_block v0, w3
- st1 {v0.16b}, [x0]
ret
-ENDPROC(pmull_gcm_encrypt_block)
+SYM_FUNC_END(pmull_gcm_enc_4x)
+
+ .section ".rodata", "a"
+ .align 6
+.Lpermute_table:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+ .previous
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index cfc9c92814fd..15794fe21a0b 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Accelerated GHASH implementation with ARMv8 PMULL instructions.
*
- * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
+ * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
*/
#include <asm/neon.h>
@@ -17,6 +14,7 @@
#include <crypto/gf128mul.h>
#include <crypto/internal/aead.h>
#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <linux/cpufeature.h>
@@ -33,9 +31,8 @@ MODULE_ALIAS_CRYPTO("ghash");
#define GCM_IV_SIZE 12
struct ghash_key {
- u64 a;
- u64 b;
- be128 k;
+ be128 k;
+ u64 h[][2];
};
struct ghash_desc_ctx {
@@ -50,29 +47,18 @@ struct gcm_aes_ctx {
};
asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k,
- const char *head);
+ u64 const h[][2], const char *head);
asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k,
- const char *head);
-
-static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k,
- const char *head);
-
-asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[],
- const u8 src[], struct ghash_key const *k,
- u8 ctr[], int rounds, u8 ks[]);
-
-asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[],
- const u8 src[], struct ghash_key const *k,
- u8 ctr[], int rounds);
+ u64 const h[][2], const char *head);
-asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[],
- u32 const rk[], int rounds);
-
-asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+asmlinkage void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
+ u64 const h[][2], u64 dg[], u8 ctr[],
+ u32 const rk[], int rounds, u8 tag[]);
+asmlinkage int pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
+ u64 const h[][2], u64 dg[], u8 ctr[],
+ u32 const rk[], int rounds, const u8 l[],
+ const u8 tag[], u64 authsize);
static int ghash_init(struct shash_desc *desc)
{
@@ -85,33 +71,47 @@ static int ghash_init(struct shash_desc *desc)
static void ghash_do_update(int blocks, u64 dg[], const char *src,
struct ghash_key *key, const char *head)
{
- if (likely(may_use_simd())) {
+ be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };
+
+ do {
+ const u8 *in = src;
+
+ if (head) {
+ in = head;
+ blocks++;
+ head = NULL;
+ } else {
+ src += GHASH_BLOCK_SIZE;
+ }
+
+ crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE);
+ gf128mul_lle(&dst, &key->k);
+ } while (--blocks);
+
+ dg[0] = be64_to_cpu(dst.b);
+ dg[1] = be64_to_cpu(dst.a);
+}
+
+static __always_inline
+void ghash_do_simd_update(int blocks, u64 dg[], const char *src,
+ struct ghash_key *key, const char *head,
+ void (*simd_update)(int blocks, u64 dg[],
+ const char *src,
+ u64 const h[][2],
+ const char *head))
+{
+ if (likely(crypto_simd_usable())) {
kernel_neon_begin();
- pmull_ghash_update(blocks, dg, src, key, head);
+ simd_update(blocks, dg, src, key->h, head);
kernel_neon_end();
} else {
- be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };
-
- do {
- const u8 *in = src;
-
- if (head) {
- in = head;
- blocks++;
- head = NULL;
- } else {
- src += GHASH_BLOCK_SIZE;
- }
-
- crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE);
- gf128mul_lle(&dst, &key->k);
- } while (--blocks);
-
- dg[0] = be64_to_cpu(dst.b);
- dg[1] = be64_to_cpu(dst.a);
+ ghash_do_update(blocks, dg, src, key, head);
}
}
+/* avoid hogging the CPU for too long */
+#define MAX_BLOCKS (SZ_64K / GHASH_BLOCK_SIZE)
+
static int ghash_update(struct shash_desc *desc, const u8 *src,
unsigned int len)
{
@@ -135,11 +135,17 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
blocks = len / GHASH_BLOCK_SIZE;
len %= GHASH_BLOCK_SIZE;
- ghash_do_update(blocks, ctx->digest, src, key,
- partial ? ctx->buf : NULL);
+ do {
+ int chunk = min(blocks, MAX_BLOCKS);
- src += blocks * GHASH_BLOCK_SIZE;
- partial = 0;
+ ghash_do_simd_update(chunk, ctx->digest, src, key,
+ partial ? ctx->buf : NULL,
+ pmull_ghash_update_p8);
+
+ blocks -= chunk;
+ src += chunk * GHASH_BLOCK_SIZE;
+ partial = 0;
+ } while (unlikely(blocks > 0));
}
if (len)
memcpy(ctx->buf + partial, src, len);
@@ -156,34 +162,25 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
- ghash_do_update(1, ctx->digest, ctx->buf, key, NULL);
+ ghash_do_simd_update(1, ctx->digest, ctx->buf, key, NULL,
+ pmull_ghash_update_p8);
}
put_unaligned_be64(ctx->digest[1], dst);
put_unaligned_be64(ctx->digest[0], dst + 8);
- *ctx = (struct ghash_desc_ctx){};
+ memzero_explicit(ctx, sizeof(*ctx));
return 0;
}
-static int __ghash_setkey(struct ghash_key *key,
- const u8 *inkey, unsigned int keylen)
+static void ghash_reflect(u64 h[], const be128 *k)
{
- u64 a, b;
-
- /* needed for the fallback */
- memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);
-
- /* perform multiplication by 'x' in GF(2^128) */
- b = get_unaligned_be64(inkey);
- a = get_unaligned_be64(inkey + 8);
-
- key->a = (a << 1) | (b >> 63);
- key->b = (b << 1) | (a >> 63);
+ u64 carry = be64_to_cpu(k->a) & BIT(63) ? 1 : 0;
- if (b >> 63)
- key->b ^= 0xc200000000000000UL;
+ h[0] = (be64_to_cpu(k->b) << 1) | carry;
+ h[1] = (be64_to_cpu(k->a) << 1) | (be64_to_cpu(k->b) >> 63);
- return 0;
+ if (carry)
+ h[1] ^= 0xc200000000000000UL;
}
static int ghash_setkey(struct crypto_shash *tfm,
@@ -191,21 +188,22 @@ static int ghash_setkey(struct crypto_shash *tfm,
{
struct ghash_key *key = crypto_shash_ctx(tfm);
- if (keylen != GHASH_BLOCK_SIZE) {
- crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ if (keylen != GHASH_BLOCK_SIZE)
return -EINVAL;
- }
- return __ghash_setkey(key, inkey, keylen);
+ /* needed for the fallback */
+ memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);
+
+ ghash_reflect(key->h[0], &key->k);
+ return 0;
}
static struct shash_alg ghash_alg = {
.base.cra_name = "ghash",
- .base.cra_driver_name = "ghash-ce",
- .base.cra_priority = 200,
- .base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .base.cra_driver_name = "ghash-neon",
+ .base.cra_priority = 150,
.base.cra_blocksize = GHASH_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct ghash_key),
+ .base.cra_ctxsize = sizeof(struct ghash_key) + sizeof(u64[2]),
.base.cra_module = THIS_MODULE,
.digestsize = GHASH_DIGEST_SIZE,
@@ -233,18 +231,31 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey,
{
struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm);
u8 key[GHASH_BLOCK_SIZE];
+ be128 h;
int ret;
- ret = crypto_aes_expand_key(&ctx->aes_key, inkey, keylen);
- if (ret) {
- tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ ret = aes_expandkey(&ctx->aes_key, inkey, keylen);
+ if (ret)
return -EINVAL;
- }
- __aes_arm64_encrypt(ctx->aes_key.key_enc, key, (u8[AES_BLOCK_SIZE]){},
- num_rounds(&ctx->aes_key));
+ aes_encrypt(&ctx->aes_key, key, (u8[AES_BLOCK_SIZE]){});
+
+ /* needed for the fallback */
+ memcpy(&ctx->ghash_key.k, key, GHASH_BLOCK_SIZE);
+
+ ghash_reflect(ctx->ghash_key.h[0], &ctx->ghash_key.k);
+
+ h = ctx->ghash_key.k;
+ gf128mul_lle(&h, &ctx->ghash_key.k);
+ ghash_reflect(ctx->ghash_key.h[1], &h);
+
+ gf128mul_lle(&h, &ctx->ghash_key.k);
+ ghash_reflect(ctx->ghash_key.h[2], &h);
- return __ghash_setkey(&ctx->ghash_key, key, sizeof(key));
+ gf128mul_lle(&h, &ctx->ghash_key.k);
+ ghash_reflect(ctx->ghash_key.h[3], &h);
+
+ return 0;
}
static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
@@ -276,8 +287,9 @@ static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[],
if (count >= GHASH_BLOCK_SIZE || *buf_count == GHASH_BLOCK_SIZE) {
int blocks = count / GHASH_BLOCK_SIZE;
- ghash_do_update(blocks, dg, src, &ctx->ghash_key,
- *buf_count ? buf : NULL);
+ ghash_do_simd_update(blocks, dg, src, &ctx->ghash_key,
+ *buf_count ? buf : NULL,
+ pmull_ghash_update_p64);
src += blocks * GHASH_BLOCK_SIZE;
count %= GHASH_BLOCK_SIZE;
@@ -321,121 +333,69 @@ static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[])
if (buf_count) {
memset(&buf[buf_count], 0, GHASH_BLOCK_SIZE - buf_count);
- ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
+ ghash_do_simd_update(1, dg, buf, &ctx->ghash_key, NULL,
+ pmull_ghash_update_p64);
}
}
-static void gcm_final(struct aead_request *req, struct gcm_aes_ctx *ctx,
- u64 dg[], u8 tag[], int cryptlen)
-{
- u8 mac[AES_BLOCK_SIZE];
- u128 lengths;
-
- lengths.a = cpu_to_be64(req->assoclen * 8);
- lengths.b = cpu_to_be64(cryptlen * 8);
-
- ghash_do_update(1, dg, (void *)&lengths, &ctx->ghash_key, NULL);
-
- put_unaligned_be64(dg[1], mac);
- put_unaligned_be64(dg[0], mac + 8);
-
- crypto_xor(tag, mac, AES_BLOCK_SIZE);
-}
-
static int gcm_encrypt(struct aead_request *req)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
+ int nrounds = num_rounds(&ctx->aes_key);
struct skcipher_walk walk;
+ u8 buf[AES_BLOCK_SIZE];
u8 iv[AES_BLOCK_SIZE];
- u8 ks[AES_BLOCK_SIZE];
- u8 tag[AES_BLOCK_SIZE];
u64 dg[2] = {};
+ be128 lengths;
+ u8 *tag;
int err;
+ lengths.a = cpu_to_be64(req->assoclen * 8);
+ lengths.b = cpu_to_be64(req->cryptlen * 8);
+
if (req->assoclen)
gcm_calculate_auth_mac(req, dg);
memcpy(iv, req->iv, GCM_IV_SIZE);
- put_unaligned_be32(1, iv + GCM_IV_SIZE);
+ put_unaligned_be32(2, iv + GCM_IV_SIZE);
- if (likely(may_use_simd())) {
- kernel_neon_begin();
-
- pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc,
- num_rounds(&ctx->aes_key));
- put_unaligned_be32(2, iv + GCM_IV_SIZE);
- pmull_gcm_encrypt_block(ks, iv, NULL,
- num_rounds(&ctx->aes_key));
- put_unaligned_be32(3, iv + GCM_IV_SIZE);
-
- err = skcipher_walk_aead_encrypt(&walk, req, true);
-
- while (walk.nbytes >= AES_BLOCK_SIZE) {
- int blocks = walk.nbytes / AES_BLOCK_SIZE;
+ err = skcipher_walk_aead_encrypt(&walk, req, false);
- pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr,
- walk.src.virt.addr, &ctx->ghash_key,
- iv, num_rounds(&ctx->aes_key), ks);
-
- err = skcipher_walk_done(&walk,
- walk.nbytes % AES_BLOCK_SIZE);
- }
- kernel_neon_end();
- } else {
- __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv,
- num_rounds(&ctx->aes_key));
- put_unaligned_be32(2, iv + GCM_IV_SIZE);
-
- err = skcipher_walk_aead_encrypt(&walk, req, true);
-
- while (walk.nbytes >= AES_BLOCK_SIZE) {
- int blocks = walk.nbytes / AES_BLOCK_SIZE;
- u8 *dst = walk.dst.virt.addr;
- u8 *src = walk.src.virt.addr;
-
- do {
- __aes_arm64_encrypt(ctx->aes_key.key_enc,
- ks, iv,
- num_rounds(&ctx->aes_key));
- crypto_xor_cpy(dst, src, ks, AES_BLOCK_SIZE);
- crypto_inc(iv, AES_BLOCK_SIZE);
-
- dst += AES_BLOCK_SIZE;
- src += AES_BLOCK_SIZE;
- } while (--blocks > 0);
-
- ghash_do_update(walk.nbytes / AES_BLOCK_SIZE, dg,
- walk.dst.virt.addr, &ctx->ghash_key,
- NULL);
-
- err = skcipher_walk_done(&walk,
- walk.nbytes % AES_BLOCK_SIZE);
+ do {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ int nbytes = walk.nbytes;
+
+ tag = (u8 *)&lengths;
+
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) {
+ src = dst = memcpy(buf + sizeof(buf) - nbytes,
+ src, nbytes);
+ } else if (nbytes < walk.total) {
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
+ tag = NULL;
}
- if (walk.nbytes)
- __aes_arm64_encrypt(ctx->aes_key.key_enc, ks, iv,
- num_rounds(&ctx->aes_key));
- }
- /* handle the tail */
- if (walk.nbytes) {
- u8 buf[GHASH_BLOCK_SIZE];
+ kernel_neon_begin();
+ pmull_gcm_encrypt(nbytes, dst, src, ctx->ghash_key.h,
+ dg, iv, ctx->aes_key.key_enc, nrounds,
+ tag);
+ kernel_neon_end();
- crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, ks,
- walk.nbytes);
+ if (unlikely(!nbytes))
+ break;
- memcpy(buf, walk.dst.virt.addr, walk.nbytes);
- memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes);
- ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr,
+ buf + sizeof(buf) - nbytes, nbytes);
- err = skcipher_walk_done(&walk, 0);
- }
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ } while (walk.nbytes);
if (err)
return err;
- gcm_final(req, ctx, dg, tag, req->cryptlen);
-
/* copy authtag to end of dst */
scatterwalk_map_and_copy(tag, req->dst, req->assoclen + req->cryptlen,
crypto_aead_authsize(aead), 1);
@@ -448,102 +408,67 @@ static int gcm_decrypt(struct aead_request *req)
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
unsigned int authsize = crypto_aead_authsize(aead);
+ int nrounds = num_rounds(&ctx->aes_key);
struct skcipher_walk walk;
+ u8 otag[AES_BLOCK_SIZE];
+ u8 buf[AES_BLOCK_SIZE];
u8 iv[AES_BLOCK_SIZE];
- u8 tag[AES_BLOCK_SIZE];
- u8 buf[GHASH_BLOCK_SIZE];
u64 dg[2] = {};
+ be128 lengths;
+ u8 *tag;
+ int ret;
int err;
+ lengths.a = cpu_to_be64(req->assoclen * 8);
+ lengths.b = cpu_to_be64((req->cryptlen - authsize) * 8);
+
if (req->assoclen)
gcm_calculate_auth_mac(req, dg);
memcpy(iv, req->iv, GCM_IV_SIZE);
- put_unaligned_be32(1, iv + GCM_IV_SIZE);
-
- if (likely(may_use_simd())) {
- kernel_neon_begin();
-
- pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc,
- num_rounds(&ctx->aes_key));
- put_unaligned_be32(2, iv + GCM_IV_SIZE);
-
- err = skcipher_walk_aead_decrypt(&walk, req, true);
+ put_unaligned_be32(2, iv + GCM_IV_SIZE);
- while (walk.nbytes >= AES_BLOCK_SIZE) {
- int blocks = walk.nbytes / AES_BLOCK_SIZE;
+ scatterwalk_map_and_copy(otag, req->src,
+ req->assoclen + req->cryptlen - authsize,
+ authsize, 0);
- pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr,
- walk.src.virt.addr, &ctx->ghash_key,
- iv, num_rounds(&ctx->aes_key));
+ err = skcipher_walk_aead_decrypt(&walk, req, false);
- err = skcipher_walk_done(&walk,
- walk.nbytes % AES_BLOCK_SIZE);
+ do {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ int nbytes = walk.nbytes;
+
+ tag = (u8 *)&lengths;
+
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) {
+ src = dst = memcpy(buf + sizeof(buf) - nbytes,
+ src, nbytes);
+ } else if (nbytes < walk.total) {
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
+ tag = NULL;
}
- if (walk.nbytes)
- pmull_gcm_encrypt_block(iv, iv, NULL,
- num_rounds(&ctx->aes_key));
+ kernel_neon_begin();
+ ret = pmull_gcm_decrypt(nbytes, dst, src, ctx->ghash_key.h,
+ dg, iv, ctx->aes_key.key_enc,
+ nrounds, tag, otag, authsize);
kernel_neon_end();
- } else {
- __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv,
- num_rounds(&ctx->aes_key));
- put_unaligned_be32(2, iv + GCM_IV_SIZE);
-
- err = skcipher_walk_aead_decrypt(&walk, req, true);
-
- while (walk.nbytes >= AES_BLOCK_SIZE) {
- int blocks = walk.nbytes / AES_BLOCK_SIZE;
- u8 *dst = walk.dst.virt.addr;
- u8 *src = walk.src.virt.addr;
-
- ghash_do_update(blocks, dg, walk.src.virt.addr,
- &ctx->ghash_key, NULL);
-
- do {
- __aes_arm64_encrypt(ctx->aes_key.key_enc,
- buf, iv,
- num_rounds(&ctx->aes_key));
- crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE);
- crypto_inc(iv, AES_BLOCK_SIZE);
- dst += AES_BLOCK_SIZE;
- src += AES_BLOCK_SIZE;
- } while (--blocks > 0);
+ if (unlikely(!nbytes))
+ break;
- err = skcipher_walk_done(&walk,
- walk.nbytes % AES_BLOCK_SIZE);
- }
- if (walk.nbytes)
- __aes_arm64_encrypt(ctx->aes_key.key_enc, iv, iv,
- num_rounds(&ctx->aes_key));
- }
-
- /* handle the tail */
- if (walk.nbytes) {
- memcpy(buf, walk.src.virt.addr, walk.nbytes);
- memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes);
- ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr,
+ buf + sizeof(buf) - nbytes, nbytes);
- crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, iv,
- walk.nbytes);
-
- err = skcipher_walk_done(&walk, 0);
- }
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ } while (walk.nbytes);
if (err)
return err;
- gcm_final(req, ctx, dg, tag, req->cryptlen - authsize);
-
- /* compare calculated auth tag with the stored one */
- scatterwalk_map_and_copy(buf, req->src,
- req->assoclen + req->cryptlen - authsize,
- authsize, 0);
-
- if (crypto_memneq(tag, buf, authsize))
- return -EBADMSG;
- return 0;
+ return ret ? -EBADMSG : 0;
}
static struct aead_alg gcm_aes_alg = {
@@ -559,39 +484,28 @@ static struct aead_alg gcm_aes_alg = {
.base.cra_driver_name = "gcm-aes-ce",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct gcm_aes_ctx),
+ .base.cra_ctxsize = sizeof(struct gcm_aes_ctx) +
+ 4 * sizeof(u64[2]),
.base.cra_module = THIS_MODULE,
};
static int __init ghash_ce_mod_init(void)
{
- int ret;
-
- if (!(elf_hwcap & HWCAP_ASIMD))
+ if (!cpu_have_named_feature(ASIMD))
return -ENODEV;
- if (elf_hwcap & HWCAP_PMULL)
- pmull_ghash_update = pmull_ghash_update_p64;
+ if (cpu_have_named_feature(PMULL))
+ return crypto_register_aead(&gcm_aes_alg);
- else
- pmull_ghash_update = pmull_ghash_update_p8;
-
- ret = crypto_register_shash(&ghash_alg);
- if (ret)
- return ret;
-
- if (elf_hwcap & HWCAP_PMULL) {
- ret = crypto_register_aead(&gcm_aes_alg);
- if (ret)
- crypto_unregister_shash(&ghash_alg);
- }
- return ret;
+ return crypto_register_shash(&ghash_alg);
}
static void __exit ghash_ce_mod_exit(void)
{
- crypto_unregister_shash(&ghash_alg);
- crypto_unregister_aead(&gcm_aes_alg);
+ if (cpu_have_named_feature(PMULL))
+ crypto_unregister_aead(&gcm_aes_alg);
+ else
+ crypto_unregister_shash(&ghash_alg);
}
static const struct cpu_feature ghash_cpu_feature[] = {
diff --git a/arch/arm64/crypto/nh-neon-core.S b/arch/arm64/crypto/nh-neon-core.S
new file mode 100644
index 000000000000..51c0a534ef87
--- /dev/null
+++ b/arch/arm64/crypto/nh-neon-core.S
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, ARM64 NEON accelerated version
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+ KEY .req x0
+ MESSAGE .req x1
+ MESSAGE_LEN .req x2
+ HASH .req x3
+
+ PASS0_SUMS .req v0
+ PASS1_SUMS .req v1
+ PASS2_SUMS .req v2
+ PASS3_SUMS .req v3
+ K0 .req v4
+ K1 .req v5
+ K2 .req v6
+ K3 .req v7
+ T0 .req v8
+ T1 .req v9
+ T2 .req v10
+ T3 .req v11
+ T4 .req v12
+ T5 .req v13
+ T6 .req v14
+ T7 .req v15
+
+.macro _nh_stride k0, k1, k2, k3
+
+ // Load next message stride
+ ld1 {T3.16b}, [MESSAGE], #16
+
+ // Load next key stride
+ ld1 {\k3\().4s}, [KEY], #16
+
+ // Add message words to key words
+ add T0.4s, T3.4s, \k0\().4s
+ add T1.4s, T3.4s, \k1\().4s
+ add T2.4s, T3.4s, \k2\().4s
+ add T3.4s, T3.4s, \k3\().4s
+
+ // Multiply 32x32 => 64 and accumulate
+ mov T4.d[0], T0.d[1]
+ mov T5.d[0], T1.d[1]
+ mov T6.d[0], T2.d[1]
+ mov T7.d[0], T3.d[1]
+ umlal PASS0_SUMS.2d, T0.2s, T4.2s
+ umlal PASS1_SUMS.2d, T1.2s, T5.2s
+ umlal PASS2_SUMS.2d, T2.2s, T6.2s
+ umlal PASS3_SUMS.2d, T3.2s, T7.2s
+.endm
+
+/*
+ * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+ * u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+SYM_FUNC_START(nh_neon)
+
+ ld1 {K0.4s,K1.4s}, [KEY], #32
+ movi PASS0_SUMS.2d, #0
+ movi PASS1_SUMS.2d, #0
+ ld1 {K2.4s}, [KEY], #16
+ movi PASS2_SUMS.2d, #0
+ movi PASS3_SUMS.2d, #0
+
+ subs MESSAGE_LEN, MESSAGE_LEN, #64
+ blt .Lloop4_done
+.Lloop4:
+ _nh_stride K0, K1, K2, K3
+ _nh_stride K1, K2, K3, K0
+ _nh_stride K2, K3, K0, K1
+ _nh_stride K3, K0, K1, K2
+ subs MESSAGE_LEN, MESSAGE_LEN, #64
+ bge .Lloop4
+
+.Lloop4_done:
+ ands MESSAGE_LEN, MESSAGE_LEN, #63
+ beq .Ldone
+ _nh_stride K0, K1, K2, K3
+
+ subs MESSAGE_LEN, MESSAGE_LEN, #16
+ beq .Ldone
+ _nh_stride K1, K2, K3, K0
+
+ subs MESSAGE_LEN, MESSAGE_LEN, #16
+ beq .Ldone
+ _nh_stride K2, K3, K0, K1
+
+.Ldone:
+ // Sum the accumulators for each pass, then store the sums to 'hash'
+ addp T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
+ addp T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
+ st1 {T0.16b,T1.16b}, [HASH]
+ ret
+SYM_FUNC_END(nh_neon)
diff --git a/arch/arm64/crypto/nhpoly1305-neon-glue.c b/arch/arm64/crypto/nhpoly1305-neon-glue.c
new file mode 100644
index 000000000000..c5405e6a6db7
--- /dev/null
+++ b/arch/arm64/crypto/nhpoly1305-neon-glue.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (ARM64 NEON accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+
+asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+ u8 hash[NH_HASH_BYTES]);
+
+/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
+static void _nh_neon(const u32 *key, const u8 *message, size_t message_len,
+ __le64 hash[NH_NUM_PASSES])
+{
+ nh_neon(key, message, message_len, (u8 *)hash);
+}
+
+static int nhpoly1305_neon_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ if (srclen < 64 || !crypto_simd_usable())
+ return crypto_nhpoly1305_update(desc, src, srclen);
+
+ do {
+ unsigned int n = min_t(unsigned int, srclen, SZ_4K);
+
+ kernel_neon_begin();
+ crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon);
+ kernel_neon_end();
+ src += n;
+ srclen -= n;
+ } while (srclen);
+ return 0;
+}
+
+static struct shash_alg nhpoly1305_alg = {
+ .base.cra_name = "nhpoly1305",
+ .base.cra_driver_name = "nhpoly1305-neon",
+ .base.cra_priority = 200,
+ .base.cra_ctxsize = sizeof(struct nhpoly1305_key),
+ .base.cra_module = THIS_MODULE,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .init = crypto_nhpoly1305_init,
+ .update = nhpoly1305_neon_update,
+ .final = crypto_nhpoly1305_final,
+ .setkey = crypto_nhpoly1305_setkey,
+ .descsize = sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+ if (!cpu_have_named_feature(ASIMD))
+ return -ENODEV;
+
+ return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+ crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (NEON-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-neon");
diff --git a/arch/arm64/crypto/poly1305-armv8.pl b/arch/arm64/crypto/poly1305-armv8.pl
new file mode 100644
index 000000000000..cbc980fb02e3
--- /dev/null
+++ b/arch/arm64/crypto/poly1305-armv8.pl
@@ -0,0 +1,913 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+# project.
+# ====================================================================
+#
+# This module implements Poly1305 hash for ARMv8.
+#
+# June 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+# IALU/gcc-4.9 NEON
+#
+# Apple A7 1.86/+5% 0.72
+# Cortex-A53 2.69/+58% 1.47
+# Cortex-A57 2.70/+7% 1.14
+# Denver 1.64/+50% 1.18(*)
+# X-Gene 2.13/+68% 2.27
+# Mongoose 1.77/+75% 1.12
+# Kryo 2.70/+55% 1.13
+# ThunderX2 1.17/+95% 1.36
+#
+# (*) estimate based on resources availability is less than 1.0,
+# i.e. measured result is worse than expected, presumably binary
+# translator is not almighty;
+
+$flavour=shift;
+$output=shift;
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
+my ($mac,$nonce)=($inp,$len);
+
+my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+.extern OPENSSL_armcap_P
+#endif
+
+.text
+
+// forward "declarations" are required for Apple
+.globl poly1305_blocks
+.globl poly1305_emit
+
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+ cmp $inp,xzr
+ stp xzr,xzr,[$ctx] // zero hash value
+ stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
+
+ csel x0,xzr,x0,eq
+ b.eq .Lno_key
+
+#ifndef __KERNEL__
+ adrp x17,OPENSSL_armcap_P
+ ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
+#endif
+
+ ldp $r0,$r1,[$inp] // load key
+ mov $s1,#0xfffffffc0fffffff
+ movk $s1,#0x0fff,lsl#48
+#ifdef __AARCH64EB__
+ rev $r0,$r0 // flip bytes
+ rev $r1,$r1
+#endif
+ and $r0,$r0,$s1 // &=0ffffffc0fffffff
+ and $s1,$s1,#-4
+ and $r1,$r1,$s1 // &=0ffffffc0ffffffc
+ mov w#$s1,#-1
+ stp $r0,$r1,[$ctx,#32] // save key value
+ str w#$s1,[$ctx,#48] // impossible key power value
+
+#ifndef __KERNEL__
+ tst w17,#ARMV7_NEON
+
+ adr $d0,.Lpoly1305_blocks
+ adr $r0,.Lpoly1305_blocks_neon
+ adr $d1,.Lpoly1305_emit
+
+ csel $d0,$d0,$r0,eq
+
+# ifdef __ILP32__
+ stp w#$d0,w#$d1,[$len]
+# else
+ stp $d0,$d1,[$len]
+# endif
+#endif
+ mov x0,#1
+.Lno_key:
+ ret
+.size poly1305_init,.-poly1305_init
+
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+.Lpoly1305_blocks:
+ ands $len,$len,#-16
+ b.eq .Lno_data
+
+ ldp $h0,$h1,[$ctx] // load hash value
+ ldp $h2,x17,[$ctx,#16] // [along with is_base2_26]
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+#ifdef __AARCH64EB__
+ lsr $d0,$h0,#32
+ mov w#$d1,w#$h0
+ lsr $d2,$h1,#32
+ mov w15,w#$h1
+ lsr x16,$h2,#32
+#else
+ mov w#$d0,w#$h0
+ lsr $d1,$h0,#32
+ mov w#$d2,w#$h1
+ lsr x15,$h1,#32
+ mov w16,w#$h2
+#endif
+
+ add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
+ lsr $d1,$d2,#12
+ adds $d0,$d0,$d2,lsl#52
+ add $d1,$d1,x15,lsl#14
+ adc $d1,$d1,xzr
+ lsr $d2,x16,#24
+ adds $d1,$d1,x16,lsl#40
+ adc $d2,$d2,xzr
+
+ cmp x17,#0 // is_base2_26?
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+ csel $h0,$h0,$d0,eq // choose between radixes
+ csel $h1,$h1,$d1,eq
+ csel $h2,$h2,$d2,eq
+
+.Loop:
+ ldp $t0,$t1,[$inp],#16 // load input
+ sub $len,$len,#16
+#ifdef __AARCH64EB__
+ rev $t0,$t0
+ rev $t1,$t1
+#endif
+ adds $h0,$h0,$t0 // accumulate input
+ adcs $h1,$h1,$t1
+
+ mul $d0,$h0,$r0 // h0*r0
+ adc $h2,$h2,$padbit
+ umulh $d1,$h0,$r0
+
+ mul $t0,$h1,$s1 // h1*5*r1
+ umulh $t1,$h1,$s1
+
+ adds $d0,$d0,$t0
+ mul $t0,$h0,$r1 // h0*r1
+ adc $d1,$d1,$t1
+ umulh $d2,$h0,$r1
+
+ adds $d1,$d1,$t0
+ mul $t0,$h1,$r0 // h1*r0
+ adc $d2,$d2,xzr
+ umulh $t1,$h1,$r0
+
+ adds $d1,$d1,$t0
+ mul $t0,$h2,$s1 // h2*5*r1
+ adc $d2,$d2,$t1
+ mul $t1,$h2,$r0 // h2*r0
+
+ adds $d1,$d1,$t0
+ adc $d2,$d2,$t1
+
+ and $t0,$d2,#-4 // final reduction
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$d0,$t0
+ adcs $h1,$d1,xzr
+ adc $h2,$h2,xzr
+
+ cbnz $len,.Loop
+
+ stp $h0,$h1,[$ctx] // store hash value
+ stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26]
+
+.Lno_data:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+.Lpoly1305_emit:
+ ldp $h0,$h1,[$ctx] // load hash base 2^64
+ ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26]
+ ldp $t0,$t1,[$nonce] // load nonce
+
+#ifdef __AARCH64EB__
+ lsr $d0,$h0,#32
+ mov w#$d1,w#$h0
+ lsr $d2,$h1,#32
+ mov w15,w#$h1
+ lsr x16,$h2,#32
+#else
+ mov w#$d0,w#$h0
+ lsr $d1,$h0,#32
+ mov w#$d2,w#$h1
+ lsr x15,$h1,#32
+ mov w16,w#$h2
+#endif
+
+ add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
+ lsr $d1,$d2,#12
+ adds $d0,$d0,$d2,lsl#52
+ add $d1,$d1,x15,lsl#14
+ adc $d1,$d1,xzr
+ lsr $d2,x16,#24
+ adds $d1,$d1,x16,lsl#40
+ adc $d2,$d2,xzr
+
+ cmp $r0,#0 // is_base2_26?
+ csel $h0,$h0,$d0,eq // choose between radixes
+ csel $h1,$h1,$d1,eq
+ csel $h2,$h2,$d2,eq
+
+ adds $d0,$h0,#5 // compare to modulus
+ adcs $d1,$h1,xzr
+ adc $d2,$h2,xzr
+
+ tst $d2,#-4 // see if it's carried/borrowed
+
+ csel $h0,$h0,$d0,eq
+ csel $h1,$h1,$d1,eq
+
+#ifdef __AARCH64EB__
+ ror $t0,$t0,#32 // flip nonce words
+ ror $t1,$t1,#32
+#endif
+ adds $h0,$h0,$t0 // accumulate nonce
+ adc $h1,$h1,$t1
+#ifdef __AARCH64EB__
+ rev $h0,$h0 // flip output bytes
+ rev $h1,$h1
+#endif
+ stp $h0,$h1,[$mac] // write result
+
+ ret
+.size poly1305_emit,.-poly1305_emit
+___
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
+my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
+my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
+my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
+my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
+my ($T0,$T1,$MASK) = map("v$_",(29..31));
+
+my ($in2,$zeros)=("x16","x17");
+my $is_base2_26 = $zeros; # borrow
+
+$code.=<<___;
+.type poly1305_mult,%function
+.align 5
+poly1305_mult:
+ mul $d0,$h0,$r0 // h0*r0
+ umulh $d1,$h0,$r0
+
+ mul $t0,$h1,$s1 // h1*5*r1
+ umulh $t1,$h1,$s1
+
+ adds $d0,$d0,$t0
+ mul $t0,$h0,$r1 // h0*r1
+ adc $d1,$d1,$t1
+ umulh $d2,$h0,$r1
+
+ adds $d1,$d1,$t0
+ mul $t0,$h1,$r0 // h1*r0
+ adc $d2,$d2,xzr
+ umulh $t1,$h1,$r0
+
+ adds $d1,$d1,$t0
+ mul $t0,$h2,$s1 // h2*5*r1
+ adc $d2,$d2,$t1
+ mul $t1,$h2,$r0 // h2*r0
+
+ adds $d1,$d1,$t0
+ adc $d2,$d2,$t1
+
+ and $t0,$d2,#-4 // final reduction
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$d0,$t0
+ adcs $h1,$d1,xzr
+ adc $h2,$h2,xzr
+
+ ret
+.size poly1305_mult,.-poly1305_mult
+
+.type poly1305_splat,%function
+.align 4
+poly1305_splat:
+ and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x13,$h0,#26,#26
+ extr x14,$h1,$h0,#52
+ and x14,x14,#0x03ffffff
+ ubfx x15,$h1,#14,#26
+ extr x16,$h2,$h1,#40
+
+ str w12,[$ctx,#16*0] // r0
+ add w12,w13,w13,lsl#2 // r1*5
+ str w13,[$ctx,#16*1] // r1
+ add w13,w14,w14,lsl#2 // r2*5
+ str w12,[$ctx,#16*2] // s1
+ str w14,[$ctx,#16*3] // r2
+ add w14,w15,w15,lsl#2 // r3*5
+ str w13,[$ctx,#16*4] // s2
+ str w15,[$ctx,#16*5] // r3
+ add w15,w16,w16,lsl#2 // r4*5
+ str w14,[$ctx,#16*6] // s3
+ str w16,[$ctx,#16*7] // r4
+ str w15,[$ctx,#16*8] // s4
+
+ ret
+.size poly1305_splat,.-poly1305_splat
+
+#ifdef __KERNEL__
+.globl poly1305_blocks_neon
+#endif
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
+ ldr $is_base2_26,[$ctx,#24]
+ cmp $len,#128
+ b.lo .Lpoly1305_blocks
+
+ .inst 0xd503233f // paciasp
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ cbz $is_base2_26,.Lbase2_64_neon
+
+ ldp w10,w11,[$ctx] // load hash value base 2^26
+ ldp w12,w13,[$ctx,#8]
+ ldr w14,[$ctx,#16]
+
+ tst $len,#31
+ b.eq .Leven_neon
+
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+ add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr $h1,x12,#12
+ adds $h0,$h0,x12,lsl#52
+ add $h1,$h1,x13,lsl#14
+ adc $h1,$h1,xzr
+ lsr $h2,x14,#24
+ adds $h1,$h1,x14,lsl#40
+ adc $d2,$h2,xzr // can be partially reduced...
+
+ ldp $d0,$d1,[$inp],#16 // load input
+ sub $len,$len,#16
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+
+#ifdef __AARCH64EB__
+ rev $d0,$d0
+ rev $d1,$d1
+#endif
+ adds $h0,$h0,$d0 // accumulate input
+ adcs $h1,$h1,$d1
+ adc $h2,$h2,$padbit
+
+ bl poly1305_mult
+
+ and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,$h0,#26,#26
+ extr x12,$h1,$h0,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,$h1,#14,#26
+ extr x14,$h2,$h1,#40
+
+ b .Leven_neon
+
+.align 4
+.Lbase2_64_neon:
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+ ldp $h0,$h1,[$ctx] // load hash value base 2^64
+ ldr $h2,[$ctx,#16]
+
+ tst $len,#31
+ b.eq .Linit_neon
+
+ ldp $d0,$d1,[$inp],#16 // load input
+ sub $len,$len,#16
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+#ifdef __AARCH64EB__
+ rev $d0,$d0
+ rev $d1,$d1
+#endif
+ adds $h0,$h0,$d0 // accumulate input
+ adcs $h1,$h1,$d1
+ adc $h2,$h2,$padbit
+
+ bl poly1305_mult
+
+.Linit_neon:
+ ldr w17,[$ctx,#48] // first table element
+ and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,$h0,#26,#26
+ extr x12,$h1,$h0,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,$h1,#14,#26
+ extr x14,$h2,$h1,#40
+
+ cmp w17,#-1 // is value impossible?
+ b.ne .Leven_neon
+
+ fmov ${H0},x10
+ fmov ${H1},x11
+ fmov ${H2},x12
+ fmov ${H3},x13
+ fmov ${H4},x14
+
+ ////////////////////////////////// initialize r^n table
+ mov $h0,$r0 // r^1
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+ mov $h1,$r1
+ mov $h2,xzr
+ add $ctx,$ctx,#48+12
+ bl poly1305_splat
+
+ bl poly1305_mult // r^2
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^3
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^4
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+ sub $ctx,$ctx,#48 // restore original $ctx
+ b .Ldo_neon
+
+.align 4
+.Leven_neon:
+ fmov ${H0},x10
+ fmov ${H1},x11
+ fmov ${H2},x12
+ fmov ${H3},x13
+ fmov ${H4},x14
+
+.Ldo_neon:
+ ldp x8,x12,[$inp,#32] // inp[2:3]
+ subs $len,$len,#64
+ ldp x9,x13,[$inp,#48]
+ add $in2,$inp,#96
+ adr $zeros,.Lzeros
+
+ lsl $padbit,$padbit,#24
+ add x15,$ctx,#48
+
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov $IN23_0,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,$padbit,x12,lsr#40
+ add x13,$padbit,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov $IN23_1,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ fmov $IN23_2,x8
+ fmov $IN23_3,x10
+ fmov $IN23_4,x12
+
+ ldp x8,x12,[$inp],#16 // inp[0:1]
+ ldp x9,x13,[$inp],#48
+
+ ld1 {$R0,$R1,$S1,$R2},[x15],#64
+ ld1 {$S2,$R3,$S3,$R4},[x15],#64
+ ld1 {$S4},[x15]
+
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov $IN01_0,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,$padbit,x12,lsr#40
+ add x13,$padbit,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov $IN01_1,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ movi $MASK.2d,#-1
+ fmov $IN01_2,x8
+ fmov $IN01_3,x10
+ fmov $IN01_4,x12
+ ushr $MASK.2d,$MASK.2d,#38
+
+ b.ls .Lskip_loop
+
+.align 4
+.Loop_neon:
+ ////////////////////////////////////////////////////////////////
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ // \___________________/
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ // \___________________/ \____________________/
+ //
+ // Note that we start with inp[2:3]*r^2. This is because it
+ // doesn't depend on reduction in previous iteration.
+ ////////////////////////////////////////////////////////////////
+ // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
+ // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
+ // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
+ // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
+ // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+ subs $len,$len,#64
+ umull $ACC4,$IN23_0,${R4}[2]
+ csel $in2,$zeros,$in2,lo
+ umull $ACC3,$IN23_0,${R3}[2]
+ umull $ACC2,$IN23_0,${R2}[2]
+ ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
+ umull $ACC1,$IN23_0,${R1}[2]
+ ldp x9,x13,[$in2],#48
+ umull $ACC0,$IN23_0,${R0}[2]
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ umlal $ACC4,$IN23_1,${R3}[2]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal $ACC3,$IN23_1,${R2}[2]
+ and x5,x9,#0x03ffffff
+ umlal $ACC2,$IN23_1,${R1}[2]
+ ubfx x6,x8,#26,#26
+ umlal $ACC1,$IN23_1,${R0}[2]
+ ubfx x7,x9,#26,#26
+ umlal $ACC0,$IN23_1,${S4}[2]
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+
+ umlal $ACC4,$IN23_2,${R2}[2]
+ extr x8,x12,x8,#52
+ umlal $ACC3,$IN23_2,${R1}[2]
+ extr x9,x13,x9,#52
+ umlal $ACC2,$IN23_2,${R0}[2]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal $ACC1,$IN23_2,${S4}[2]
+ fmov $IN23_0,x4
+ umlal $ACC0,$IN23_2,${S3}[2]
+ and x8,x8,#0x03ffffff
+
+ umlal $ACC4,$IN23_3,${R1}[2]
+ and x9,x9,#0x03ffffff
+ umlal $ACC3,$IN23_3,${R0}[2]
+ ubfx x10,x12,#14,#26
+ umlal $ACC2,$IN23_3,${S4}[2]
+ ubfx x11,x13,#14,#26
+ umlal $ACC1,$IN23_3,${S3}[2]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal $ACC0,$IN23_3,${S2}[2]
+ fmov $IN23_1,x6
+
+ add $IN01_2,$IN01_2,$H2
+ add x12,$padbit,x12,lsr#40
+ umlal $ACC4,$IN23_4,${R0}[2]
+ add x13,$padbit,x13,lsr#40
+ umlal $ACC3,$IN23_4,${S4}[2]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal $ACC2,$IN23_4,${S3}[2]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal $ACC1,$IN23_4,${S2}[2]
+ fmov $IN23_2,x8
+ umlal $ACC0,$IN23_4,${S1}[2]
+ fmov $IN23_3,x10
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4 and accumulate
+
+ add $IN01_0,$IN01_0,$H0
+ fmov $IN23_4,x12
+ umlal $ACC3,$IN01_2,${R1}[0]
+ ldp x8,x12,[$inp],#16 // inp[0:1]
+ umlal $ACC0,$IN01_2,${S3}[0]
+ ldp x9,x13,[$inp],#48
+ umlal $ACC4,$IN01_2,${R2}[0]
+ umlal $ACC1,$IN01_2,${S4}[0]
+ umlal $ACC2,$IN01_2,${R0}[0]
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ add $IN01_1,$IN01_1,$H1
+ umlal $ACC3,$IN01_0,${R3}[0]
+ umlal $ACC4,$IN01_0,${R4}[0]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal $ACC2,$IN01_0,${R2}[0]
+ and x5,x9,#0x03ffffff
+ umlal $ACC0,$IN01_0,${R0}[0]
+ ubfx x6,x8,#26,#26
+ umlal $ACC1,$IN01_0,${R1}[0]
+ ubfx x7,x9,#26,#26
+
+ add $IN01_3,$IN01_3,$H3
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ umlal $ACC3,$IN01_1,${R2}[0]
+ extr x8,x12,x8,#52
+ umlal $ACC4,$IN01_1,${R3}[0]
+ extr x9,x13,x9,#52
+ umlal $ACC0,$IN01_1,${S4}[0]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal $ACC2,$IN01_1,${R1}[0]
+ fmov $IN01_0,x4
+ umlal $ACC1,$IN01_1,${R0}[0]
+ and x8,x8,#0x03ffffff
+
+ add $IN01_4,$IN01_4,$H4
+ and x9,x9,#0x03ffffff
+ umlal $ACC3,$IN01_3,${R0}[0]
+ ubfx x10,x12,#14,#26
+ umlal $ACC0,$IN01_3,${S2}[0]
+ ubfx x11,x13,#14,#26
+ umlal $ACC4,$IN01_3,${R1}[0]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal $ACC1,$IN01_3,${S3}[0]
+ fmov $IN01_1,x6
+ umlal $ACC2,$IN01_3,${S4}[0]
+ add x12,$padbit,x12,lsr#40
+
+ umlal $ACC3,$IN01_4,${S4}[0]
+ add x13,$padbit,x13,lsr#40
+ umlal $ACC0,$IN01_4,${S1}[0]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal $ACC4,$IN01_4,${R0}[0]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal $ACC1,$IN01_4,${S2}[0]
+ fmov $IN01_2,x8
+ umlal $ACC2,$IN01_4,${S3}[0]
+ fmov $IN01_3,x10
+ fmov $IN01_4,x12
+
+ /////////////////////////////////////////////////////////////////
+ // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ // and P. Schwabe
+ //
+ // [see discussion in poly1305-armv4 module]
+
+ ushr $T0.2d,$ACC3,#26
+ xtn $H3,$ACC3
+ ushr $T1.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+ add $ACC4,$ACC4,$T0.2d // h3 -> h4
+ bic $H3,#0xfc,lsl#24 // &=0x03ffffff
+ add $ACC1,$ACC1,$T1.2d // h0 -> h1
+
+ ushr $T0.2d,$ACC4,#26
+ xtn $H4,$ACC4
+ ushr $T1.2d,$ACC1,#26
+ xtn $H1,$ACC1
+ bic $H4,#0xfc,lsl#24
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
+
+ add $ACC0,$ACC0,$T0.2d
+ shl $T0.2d,$T0.2d,#2
+ shrn $T1.2s,$ACC2,#26
+ xtn $H2,$ACC2
+ add $ACC0,$ACC0,$T0.2d // h4 -> h0
+ bic $H1,#0xfc,lsl#24
+ add $H3,$H3,$T1.2s // h2 -> h3
+ bic $H2,#0xfc,lsl#24
+
+ shrn $T0.2s,$ACC0,#26
+ xtn $H0,$ACC0
+ ushr $T1.2s,$H3,#26
+ bic $H3,#0xfc,lsl#24
+ bic $H0,#0xfc,lsl#24
+ add $H1,$H1,$T0.2s // h0 -> h1
+ add $H4,$H4,$T1.2s // h3 -> h4
+
+ b.hi .Loop_neon
+
+.Lskip_loop:
+ dup $IN23_2,${IN23_2}[0]
+ add $IN01_2,$IN01_2,$H2
+
+ ////////////////////////////////////////////////////////////////
+ // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ adds $len,$len,#32
+ b.ne .Long_tail
+
+ dup $IN23_2,${IN01_2}[0]
+ add $IN23_0,$IN01_0,$H0
+ add $IN23_3,$IN01_3,$H3
+ add $IN23_1,$IN01_1,$H1
+ add $IN23_4,$IN01_4,$H4
+
+.Long_tail:
+ dup $IN23_0,${IN23_0}[0]
+ umull2 $ACC0,$IN23_2,${S3}
+ umull2 $ACC3,$IN23_2,${R1}
+ umull2 $ACC4,$IN23_2,${R2}
+ umull2 $ACC2,$IN23_2,${R0}
+ umull2 $ACC1,$IN23_2,${S4}
+
+ dup $IN23_1,${IN23_1}[0]
+ umlal2 $ACC0,$IN23_0,${R0}
+ umlal2 $ACC2,$IN23_0,${R2}
+ umlal2 $ACC3,$IN23_0,${R3}
+ umlal2 $ACC4,$IN23_0,${R4}
+ umlal2 $ACC1,$IN23_0,${R1}
+
+ dup $IN23_3,${IN23_3}[0]
+ umlal2 $ACC0,$IN23_1,${S4}
+ umlal2 $ACC3,$IN23_1,${R2}
+ umlal2 $ACC2,$IN23_1,${R1}
+ umlal2 $ACC4,$IN23_1,${R3}
+ umlal2 $ACC1,$IN23_1,${R0}
+
+ dup $IN23_4,${IN23_4}[0]
+ umlal2 $ACC3,$IN23_3,${R0}
+ umlal2 $ACC4,$IN23_3,${R1}
+ umlal2 $ACC0,$IN23_3,${S2}
+ umlal2 $ACC1,$IN23_3,${S3}
+ umlal2 $ACC2,$IN23_3,${S4}
+
+ umlal2 $ACC3,$IN23_4,${S4}
+ umlal2 $ACC0,$IN23_4,${S1}
+ umlal2 $ACC4,$IN23_4,${R0}
+ umlal2 $ACC1,$IN23_4,${S2}
+ umlal2 $ACC2,$IN23_4,${S3}
+
+ b.eq .Lshort_tail
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ add $IN01_0,$IN01_0,$H0
+ umlal $ACC3,$IN01_2,${R1}
+ umlal $ACC0,$IN01_2,${S3}
+ umlal $ACC4,$IN01_2,${R2}
+ umlal $ACC1,$IN01_2,${S4}
+ umlal $ACC2,$IN01_2,${R0}
+
+ add $IN01_1,$IN01_1,$H1
+ umlal $ACC3,$IN01_0,${R3}
+ umlal $ACC0,$IN01_0,${R0}
+ umlal $ACC4,$IN01_0,${R4}
+ umlal $ACC1,$IN01_0,${R1}
+ umlal $ACC2,$IN01_0,${R2}
+
+ add $IN01_3,$IN01_3,$H3
+ umlal $ACC3,$IN01_1,${R2}
+ umlal $ACC0,$IN01_1,${S4}
+ umlal $ACC4,$IN01_1,${R3}
+ umlal $ACC1,$IN01_1,${R0}
+ umlal $ACC2,$IN01_1,${R1}
+
+ add $IN01_4,$IN01_4,$H4
+ umlal $ACC3,$IN01_3,${R0}
+ umlal $ACC0,$IN01_3,${S2}
+ umlal $ACC4,$IN01_3,${R1}
+ umlal $ACC1,$IN01_3,${S3}
+ umlal $ACC2,$IN01_3,${S4}
+
+ umlal $ACC3,$IN01_4,${S4}
+ umlal $ACC0,$IN01_4,${S1}
+ umlal $ACC4,$IN01_4,${R0}
+ umlal $ACC1,$IN01_4,${S2}
+ umlal $ACC2,$IN01_4,${S3}
+
+.Lshort_tail:
+ ////////////////////////////////////////////////////////////////
+ // horizontal add
+
+ addp $ACC3,$ACC3,$ACC3
+ ldp d8,d9,[sp,#16] // meet ABI requirements
+ addp $ACC0,$ACC0,$ACC0
+ ldp d10,d11,[sp,#32]
+ addp $ACC4,$ACC4,$ACC4
+ ldp d12,d13,[sp,#48]
+ addp $ACC1,$ACC1,$ACC1
+ ldp d14,d15,[sp,#64]
+ addp $ACC2,$ACC2,$ACC2
+ ldr x30,[sp,#8]
+
+ ////////////////////////////////////////////////////////////////
+ // lazy reduction, but without narrowing
+
+ ushr $T0.2d,$ACC3,#26
+ and $ACC3,$ACC3,$MASK.2d
+ ushr $T1.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+
+ add $ACC4,$ACC4,$T0.2d // h3 -> h4
+ add $ACC1,$ACC1,$T1.2d // h0 -> h1
+
+ ushr $T0.2d,$ACC4,#26
+ and $ACC4,$ACC4,$MASK.2d
+ ushr $T1.2d,$ACC1,#26
+ and $ACC1,$ACC1,$MASK.2d
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
+
+ add $ACC0,$ACC0,$T0.2d
+ shl $T0.2d,$T0.2d,#2
+ ushr $T1.2d,$ACC2,#26
+ and $ACC2,$ACC2,$MASK.2d
+ add $ACC0,$ACC0,$T0.2d // h4 -> h0
+ add $ACC3,$ACC3,$T1.2d // h2 -> h3
+
+ ushr $T0.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+ ushr $T1.2d,$ACC3,#26
+ and $ACC3,$ACC3,$MASK.2d
+ add $ACC1,$ACC1,$T0.2d // h0 -> h1
+ add $ACC4,$ACC4,$T1.2d // h3 -> h4
+
+ ////////////////////////////////////////////////////////////////
+ // write the result, can be partially reduced
+
+ st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
+ mov x4,#1
+ st1 {$ACC4}[0],[$ctx]
+ str x4,[$ctx,#8] // set is_base2_26
+
+ ldr x29,[sp],#80
+ .inst 0xd50323bf // autiasp
+ ret
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0
+.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
+.align 2
+#if !defined(__KERNEL__) && !defined(_WIN64)
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
+___
+
+foreach (split("\n",$code)) {
+ s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
+ s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
+ (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
+ (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
+ (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
+ (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
+ (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
+
+ s/\.[124]([sd])\[/.$1\[/;
+ s/w#x([0-9]+)/w$1/g;
+
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c
new file mode 100644
index 000000000000..1fae18ba11ed
--- /dev/null
+++ b/arch/arm64/crypto/poly1305-glue.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/unaligned.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/poly1305.h>
+#include <crypto/internal/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/jump_label.h>
+#include <linux/module.h>
+
+asmlinkage void poly1305_init_arm64(void *state, const u8 *key);
+asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
+{
+ poly1305_init_arm64(&dctx->h, key);
+ dctx->s[0] = get_unaligned_le32(key + 16);
+ dctx->s[1] = get_unaligned_le32(key + 20);
+ dctx->s[2] = get_unaligned_le32(key + 24);
+ dctx->s[3] = get_unaligned_le32(key + 28);
+ dctx->buflen = 0;
+}
+EXPORT_SYMBOL(poly1305_init_arch);
+
+static int neon_poly1305_init(struct shash_desc *desc)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ dctx->buflen = 0;
+ dctx->rset = 0;
+ dctx->sset = false;
+
+ return 0;
+}
+
+static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
+ u32 len, u32 hibit, bool do_neon)
+{
+ if (unlikely(!dctx->sset)) {
+ if (!dctx->rset) {
+ poly1305_init_arm64(&dctx->h, src);
+ src += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ dctx->rset = 1;
+ }
+ if (len >= POLY1305_BLOCK_SIZE) {
+ dctx->s[0] = get_unaligned_le32(src + 0);
+ dctx->s[1] = get_unaligned_le32(src + 4);
+ dctx->s[2] = get_unaligned_le32(src + 8);
+ dctx->s[3] = get_unaligned_le32(src + 12);
+ src += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ dctx->sset = true;
+ }
+ if (len < POLY1305_BLOCK_SIZE)
+ return;
+ }
+
+ len &= ~(POLY1305_BLOCK_SIZE - 1);
+
+ if (static_branch_likely(&have_neon) && likely(do_neon))
+ poly1305_blocks_neon(&dctx->h, src, len, hibit);
+ else
+ poly1305_blocks(&dctx->h, src, len, hibit);
+}
+
+static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx,
+ const u8 *src, u32 len, bool do_neon)
+{
+ if (unlikely(dctx->buflen)) {
+ u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
+
+ memcpy(dctx->buf + dctx->buflen, src, bytes);
+ src += bytes;
+ len -= bytes;
+ dctx->buflen += bytes;
+
+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+ neon_poly1305_blocks(dctx, dctx->buf,
+ POLY1305_BLOCK_SIZE, 1, false);
+ dctx->buflen = 0;
+ }
+ }
+
+ if (likely(len >= POLY1305_BLOCK_SIZE)) {
+ neon_poly1305_blocks(dctx, src, len, 1, do_neon);
+ src += round_down(len, POLY1305_BLOCK_SIZE);
+ len %= POLY1305_BLOCK_SIZE;
+ }
+
+ if (unlikely(len)) {
+ dctx->buflen = len;
+ memcpy(dctx->buf, src, len);
+ }
+}
+
+static int neon_poly1305_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ bool do_neon = crypto_simd_usable() && srclen > 128;
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ if (static_branch_likely(&have_neon) && do_neon)
+ kernel_neon_begin();
+ neon_poly1305_do_update(dctx, src, srclen, do_neon);
+ if (static_branch_likely(&have_neon) && do_neon)
+ kernel_neon_end();
+ return 0;
+}
+
+void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
+ unsigned int nbytes)
+{
+ if (unlikely(dctx->buflen)) {
+ u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
+
+ memcpy(dctx->buf + dctx->buflen, src, bytes);
+ src += bytes;
+ nbytes -= bytes;
+ dctx->buflen += bytes;
+
+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+ poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
+ dctx->buflen = 0;
+ }
+ }
+
+ if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
+ unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
+
+ if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
+ do {
+ unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+ kernel_neon_begin();
+ poly1305_blocks_neon(&dctx->h, src, todo, 1);
+ kernel_neon_end();
+
+ len -= todo;
+ src += todo;
+ } while (len);
+ } else {
+ poly1305_blocks(&dctx->h, src, len, 1);
+ src += len;
+ }
+ nbytes %= POLY1305_BLOCK_SIZE;
+ }
+
+ if (unlikely(nbytes)) {
+ dctx->buflen = nbytes;
+ memcpy(dctx->buf, src, nbytes);
+ }
+}
+EXPORT_SYMBOL(poly1305_update_arch);
+
+void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
+{
+ if (unlikely(dctx->buflen)) {
+ dctx->buf[dctx->buflen++] = 1;
+ memset(dctx->buf + dctx->buflen, 0,
+ POLY1305_BLOCK_SIZE - dctx->buflen);
+ poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+ }
+
+ poly1305_emit(&dctx->h, dst, dctx->s);
+ memzero_explicit(dctx, sizeof(*dctx));
+}
+EXPORT_SYMBOL(poly1305_final_arch);
+
+static int neon_poly1305_final(struct shash_desc *desc, u8 *dst)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ if (unlikely(!dctx->sset))
+ return -ENOKEY;
+
+ poly1305_final_arch(dctx, dst);
+ return 0;
+}
+
+static struct shash_alg neon_poly1305_alg = {
+ .init = neon_poly1305_init,
+ .update = neon_poly1305_update,
+ .final = neon_poly1305_final,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .descsize = sizeof(struct poly1305_desc_ctx),
+
+ .base.cra_name = "poly1305",
+ .base.cra_driver_name = "poly1305-neon",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = POLY1305_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+};
+
+static int __init neon_poly1305_mod_init(void)
+{
+ if (!cpu_have_named_feature(ASIMD))
+ return 0;
+
+ static_branch_enable(&have_neon);
+
+ return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
+ crypto_register_shash(&neon_poly1305_alg) : 0;
+}
+
+static void __exit neon_poly1305_mod_exit(void)
+{
+ if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && cpu_have_named_feature(ASIMD))
+ crypto_unregister_shash(&neon_poly1305_alg);
+}
+
+module_init(neon_poly1305_mod_init);
+module_exit(neon_poly1305_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("poly1305");
+MODULE_ALIAS_CRYPTO("poly1305-neon");
diff --git a/arch/arm64/crypto/polyval-ce-core.S b/arch/arm64/crypto/polyval-ce-core.S
new file mode 100644
index 000000000000..b5326540d2e3
--- /dev/null
+++ b/arch/arm64/crypto/polyval-ce-core.S
@@ -0,0 +1,361 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Implementation of POLYVAL using ARMv8 Crypto Extensions.
+ *
+ * Copyright 2021 Google LLC
+ */
+/*
+ * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions
+ * It works on 8 blocks at a time, by precomputing the first 8 keys powers h^8,
+ * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split
+ * finite field multiplication into two steps.
+ *
+ * In the first step, we consider h^i, m_i as normal polynomials of degree less
+ * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
+ * is simply polynomial multiplication.
+ *
+ * In the second step, we compute the reduction of p(x) modulo the finite field
+ * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
+ * multiplication is finite field multiplication. The advantage is that the
+ * two-step process only requires 1 finite field reduction for every 8
+ * polynomial multiplications. Further parallelism is gained by interleaving the
+ * multiplications and polynomial reductions.
+ */
+
+#include <linux/linkage.h>
+#define STRIDE_BLOCKS 8
+
+KEY_POWERS .req x0
+MSG .req x1
+BLOCKS_LEFT .req x2
+ACCUMULATOR .req x3
+KEY_START .req x10
+EXTRA_BYTES .req x11
+TMP .req x13
+
+M0 .req v0
+M1 .req v1
+M2 .req v2
+M3 .req v3
+M4 .req v4
+M5 .req v5
+M6 .req v6
+M7 .req v7
+KEY8 .req v8
+KEY7 .req v9
+KEY6 .req v10
+KEY5 .req v11
+KEY4 .req v12
+KEY3 .req v13
+KEY2 .req v14
+KEY1 .req v15
+PL .req v16
+PH .req v17
+TMP_V .req v18
+LO .req v20
+MI .req v21
+HI .req v22
+SUM .req v23
+GSTAR .req v24
+
+ .text
+
+ .arch armv8-a+crypto
+ .align 4
+
+.Lgstar:
+ .quad 0xc200000000000000, 0xc200000000000000
+
+/*
+ * Computes the product of two 128-bit polynomials in X and Y and XORs the
+ * components of the 256-bit product into LO, MI, HI.
+ *
+ * Given:
+ * X = [X_1 : X_0]
+ * Y = [Y_1 : Y_0]
+ *
+ * We compute:
+ * LO += X_0 * Y_0
+ * MI += (X_0 + X_1) * (Y_0 + Y_1)
+ * HI += X_1 * Y_1
+ *
+ * Later, the 256-bit result can be extracted as:
+ * [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]
+ * This step is done when computing the polynomial reduction for efficiency
+ * reasons.
+ *
+ * Karatsuba multiplication is used instead of Schoolbook multiplication because
+ * it was found to be slightly faster on ARM64 CPUs.
+ *
+ */
+.macro karatsuba1 X Y
+ X .req \X
+ Y .req \Y
+ ext v25.16b, X.16b, X.16b, #8
+ ext v26.16b, Y.16b, Y.16b, #8
+ eor v25.16b, v25.16b, X.16b
+ eor v26.16b, v26.16b, Y.16b
+ pmull2 v28.1q, X.2d, Y.2d
+ pmull v29.1q, X.1d, Y.1d
+ pmull v27.1q, v25.1d, v26.1d
+ eor HI.16b, HI.16b, v28.16b
+ eor LO.16b, LO.16b, v29.16b
+ eor MI.16b, MI.16b, v27.16b
+ .unreq X
+ .unreq Y
+.endm
+
+/*
+ * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into
+ * them.
+ */
+.macro karatsuba1_store X Y
+ X .req \X
+ Y .req \Y
+ ext v25.16b, X.16b, X.16b, #8
+ ext v26.16b, Y.16b, Y.16b, #8
+ eor v25.16b, v25.16b, X.16b
+ eor v26.16b, v26.16b, Y.16b
+ pmull2 HI.1q, X.2d, Y.2d
+ pmull LO.1q, X.1d, Y.1d
+ pmull MI.1q, v25.1d, v26.1d
+ .unreq X
+ .unreq Y
+.endm
+
+/*
+ * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
+ * the result in PL, PH.
+ * [PH : PL] =
+ * [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
+ */
+.macro karatsuba2
+ // v4 = [HI_1 + MI_1 : HI_0 + MI_0]
+ eor v4.16b, HI.16b, MI.16b
+ // v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
+ eor v4.16b, v4.16b, LO.16b
+ // v5 = [HI_0 : LO_1]
+ ext v5.16b, LO.16b, HI.16b, #8
+ // v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
+ eor v4.16b, v4.16b, v5.16b
+ // HI = [HI_0 : HI_1]
+ ext HI.16b, HI.16b, HI.16b, #8
+ // LO = [LO_0 : LO_1]
+ ext LO.16b, LO.16b, LO.16b, #8
+ // PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
+ ext PH.16b, v4.16b, HI.16b, #8
+ // PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
+ ext PL.16b, LO.16b, v4.16b, #8
+.endm
+
+/*
+ * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
+ *
+ * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
+ * x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
+ * product of two 128-bit polynomials in Montgomery form. We need to reduce it
+ * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
+ * of x^128, this product has two extra factors of x^128. To get it back into
+ * Montgomery form, we need to remove one of these factors by dividing by x^128.
+ *
+ * To accomplish both of these goals, we add multiples of g(x) that cancel out
+ * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
+ * bits are zero, the polynomial division by x^128 can be done by right
+ * shifting.
+ *
+ * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
+ * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
+ * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
+ * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
+ * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
+ * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
+ *
+ * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
+ * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
+ * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
+ * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
+ * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
+ *
+ * So our final computation is:
+ * T = T_1 : T_0 = g*(x) * P_0
+ * V = V_1 : V_0 = g*(x) * (P_1 + T_0)
+ * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
+ *
+ * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
+ * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
+ * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
+ */
+.macro montgomery_reduction dest
+ DEST .req \dest
+ // TMP_V = T_1 : T_0 = P_0 * g*(x)
+ pmull TMP_V.1q, PL.1d, GSTAR.1d
+ // TMP_V = T_0 : T_1
+ ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
+ // TMP_V = P_1 + T_0 : P_0 + T_1
+ eor TMP_V.16b, PL.16b, TMP_V.16b
+ // PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
+ eor PH.16b, PH.16b, TMP_V.16b
+ // TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
+ pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
+ eor DEST.16b, PH.16b, TMP_V.16b
+ .unreq DEST
+.endm
+
+/*
+ * Compute Polyval on 8 blocks.
+ *
+ * If reduce is set, also computes the montgomery reduction of the
+ * previous full_stride call and XORs with the first message block.
+ * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
+ * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
+ *
+ * Sets PL, PH.
+ */
+.macro full_stride reduce
+ eor LO.16b, LO.16b, LO.16b
+ eor MI.16b, MI.16b, MI.16b
+ eor HI.16b, HI.16b, HI.16b
+
+ ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
+ ld1 {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64
+
+ karatsuba1 M7 KEY1
+ .if \reduce
+ pmull TMP_V.1q, PL.1d, GSTAR.1d
+ .endif
+
+ karatsuba1 M6 KEY2
+ .if \reduce
+ ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
+ .endif
+
+ karatsuba1 M5 KEY3
+ .if \reduce
+ eor TMP_V.16b, PL.16b, TMP_V.16b
+ .endif
+
+ karatsuba1 M4 KEY4
+ .if \reduce
+ eor PH.16b, PH.16b, TMP_V.16b
+ .endif
+
+ karatsuba1 M3 KEY5
+ .if \reduce
+ pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
+ .endif
+
+ karatsuba1 M2 KEY6
+ .if \reduce
+ eor SUM.16b, PH.16b, TMP_V.16b
+ .endif
+
+ karatsuba1 M1 KEY7
+ eor M0.16b, M0.16b, SUM.16b
+
+ karatsuba1 M0 KEY8
+ karatsuba2
+.endm
+
+/*
+ * Handle any extra blocks after full_stride loop.
+ */
+.macro partial_stride
+ add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
+ sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
+ ld1 {KEY1.16b}, [KEY_POWERS], #16
+
+ ld1 {TMP_V.16b}, [MSG], #16
+ eor SUM.16b, SUM.16b, TMP_V.16b
+ karatsuba1_store KEY1 SUM
+ sub BLOCKS_LEFT, BLOCKS_LEFT, #1
+
+ tst BLOCKS_LEFT, #4
+ beq .Lpartial4BlocksDone
+ ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
+ ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
+ karatsuba1 M0 KEY8
+ karatsuba1 M1 KEY7
+ karatsuba1 M2 KEY6
+ karatsuba1 M3 KEY5
+.Lpartial4BlocksDone:
+ tst BLOCKS_LEFT, #2
+ beq .Lpartial2BlocksDone
+ ld1 {M0.16b, M1.16b}, [MSG], #32
+ ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
+ karatsuba1 M0 KEY8
+ karatsuba1 M1 KEY7
+.Lpartial2BlocksDone:
+ tst BLOCKS_LEFT, #1
+ beq .LpartialDone
+ ld1 {M0.16b}, [MSG], #16
+ ld1 {KEY8.16b}, [KEY_POWERS], #16
+ karatsuba1 M0 KEY8
+.LpartialDone:
+ karatsuba2
+ montgomery_reduction SUM
+.endm
+
+/*
+ * Perform montgomery multiplication in GF(2^128) and store result in op1.
+ *
+ * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
+ * If op1, op2 are in montgomery form, this computes the montgomery
+ * form of op1*op2.
+ *
+ * void pmull_polyval_mul(u8 *op1, const u8 *op2);
+ */
+SYM_FUNC_START(pmull_polyval_mul)
+ adr TMP, .Lgstar
+ ld1 {GSTAR.2d}, [TMP]
+ ld1 {v0.16b}, [x0]
+ ld1 {v1.16b}, [x1]
+ karatsuba1_store v0 v1
+ karatsuba2
+ montgomery_reduction SUM
+ st1 {SUM.16b}, [x0]
+ ret
+SYM_FUNC_END(pmull_polyval_mul)
+
+/*
+ * Perform polynomial evaluation as specified by POLYVAL. This computes:
+ * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
+ * where n=nblocks, h is the hash key, and m_i are the message blocks.
+ *
+ * x0 - pointer to precomputed key powers h^8 ... h^1
+ * x1 - pointer to message blocks
+ * x2 - number of blocks to hash
+ * x3 - pointer to accumulator
+ *
+ * void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in,
+ * size_t nblocks, u8 *accumulator);
+ */
+SYM_FUNC_START(pmull_polyval_update)
+ adr TMP, .Lgstar
+ mov KEY_START, KEY_POWERS
+ ld1 {GSTAR.2d}, [TMP]
+ ld1 {SUM.16b}, [ACCUMULATOR]
+ subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ blt .LstrideLoopExit
+ ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
+ ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
+ full_stride 0
+ subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ blt .LstrideLoopExitReduce
+.LstrideLoop:
+ full_stride 1
+ subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ bge .LstrideLoop
+.LstrideLoopExitReduce:
+ montgomery_reduction SUM
+.LstrideLoopExit:
+ adds BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ beq .LskipPartial
+ partial_stride
+.LskipPartial:
+ st1 {SUM.16b}, [ACCUMULATOR]
+ ret
+SYM_FUNC_END(pmull_polyval_update)
diff --git a/arch/arm64/crypto/polyval-ce-glue.c b/arch/arm64/crypto/polyval-ce-glue.c
new file mode 100644
index 000000000000..0a3b5718df85
--- /dev/null
+++ b/arch/arm64/crypto/polyval-ce-glue.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Glue code for POLYVAL using ARMv8 Crypto Extensions
+ *
+ * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
+ * Copyright (c) 2009 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ * Copyright 2021 Google LLC
+ */
+
+/*
+ * Glue code based on ghash-clmulni-intel_glue.c.
+ *
+ * This implementation of POLYVAL uses montgomery multiplication accelerated by
+ * ARMv8 Crypto Extensions instructions to implement the finite field operations.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/polyval.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+#define NUM_KEY_POWERS 8
+
+struct polyval_tfm_ctx {
+ /*
+ * These powers must be in the order h^8, ..., h^1.
+ */
+ u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE];
+};
+
+struct polyval_desc_ctx {
+ u8 buffer[POLYVAL_BLOCK_SIZE];
+ u32 bytes;
+};
+
+asmlinkage void pmull_polyval_update(const struct polyval_tfm_ctx *keys,
+ const u8 *in, size_t nblocks, u8 *accumulator);
+asmlinkage void pmull_polyval_mul(u8 *op1, const u8 *op2);
+
+static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
+ const u8 *in, size_t nblocks, u8 *accumulator)
+{
+ if (likely(crypto_simd_usable())) {
+ kernel_neon_begin();
+ pmull_polyval_update(keys, in, nblocks, accumulator);
+ kernel_neon_end();
+ } else {
+ polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in,
+ nblocks, accumulator);
+ }
+}
+
+static void internal_polyval_mul(u8 *op1, const u8 *op2)
+{
+ if (likely(crypto_simd_usable())) {
+ kernel_neon_begin();
+ pmull_polyval_mul(op1, op2);
+ kernel_neon_end();
+ } else {
+ polyval_mul_non4k(op1, op2);
+ }
+}
+
+static int polyval_arm64_setkey(struct crypto_shash *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm);
+ int i;
+
+ if (keylen != POLYVAL_BLOCK_SIZE)
+ return -EINVAL;
+
+ memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE);
+
+ for (i = NUM_KEY_POWERS-2; i >= 0; i--) {
+ memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE);
+ internal_polyval_mul(tctx->key_powers[i],
+ tctx->key_powers[i+1]);
+ }
+
+ return 0;
+}
+
+static int polyval_arm64_init(struct shash_desc *desc)
+{
+ struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ memset(dctx, 0, sizeof(*dctx));
+
+ return 0;
+}
+
+static int polyval_arm64_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+ const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+ u8 *pos;
+ unsigned int nblocks;
+ unsigned int n;
+
+ if (dctx->bytes) {
+ n = min(srclen, dctx->bytes);
+ pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes;
+
+ dctx->bytes -= n;
+ srclen -= n;
+
+ while (n--)
+ *pos++ ^= *src++;
+
+ if (!dctx->bytes)
+ internal_polyval_mul(dctx->buffer,
+ tctx->key_powers[NUM_KEY_POWERS-1]);
+ }
+
+ while (srclen >= POLYVAL_BLOCK_SIZE) {
+ /* allow rescheduling every 4K bytes */
+ nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
+ internal_polyval_update(tctx, src, nblocks, dctx->buffer);
+ srclen -= nblocks * POLYVAL_BLOCK_SIZE;
+ src += nblocks * POLYVAL_BLOCK_SIZE;
+ }
+
+ if (srclen) {
+ dctx->bytes = POLYVAL_BLOCK_SIZE - srclen;
+ pos = dctx->buffer;
+ while (srclen--)
+ *pos++ ^= *src++;
+ }
+
+ return 0;
+}
+
+static int polyval_arm64_final(struct shash_desc *desc, u8 *dst)
+{
+ struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+ const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+
+ if (dctx->bytes) {
+ internal_polyval_mul(dctx->buffer,
+ tctx->key_powers[NUM_KEY_POWERS-1]);
+ }
+
+ memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE);
+
+ return 0;
+}
+
+static struct shash_alg polyval_alg = {
+ .digestsize = POLYVAL_DIGEST_SIZE,
+ .init = polyval_arm64_init,
+ .update = polyval_arm64_update,
+ .final = polyval_arm64_final,
+ .setkey = polyval_arm64_setkey,
+ .descsize = sizeof(struct polyval_desc_ctx),
+ .base = {
+ .cra_name = "polyval",
+ .cra_driver_name = "polyval-ce",
+ .cra_priority = 200,
+ .cra_blocksize = POLYVAL_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct polyval_tfm_ctx),
+ .cra_module = THIS_MODULE,
+ },
+};
+
+static int __init polyval_ce_mod_init(void)
+{
+ return crypto_register_shash(&polyval_alg);
+}
+
+static void __exit polyval_ce_mod_exit(void)
+{
+ crypto_unregister_shash(&polyval_alg);
+}
+
+module_cpu_feature_match(PMULL, polyval_ce_mod_init)
+module_exit(polyval_ce_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("POLYVAL hash function accelerated by ARMv8 Crypto Extensions");
+MODULE_ALIAS_CRYPTO("polyval");
+MODULE_ALIAS_CRYPTO("polyval-ce");
diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
index 8550408735a0..889ca0f8972b 100644
--- a/arch/arm64/crypto/sha1-ce-core.S
+++ b/arch/arm64/crypto/sha1-ce-core.S
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
*
* Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/linkage.h>
@@ -58,24 +55,22 @@
sha1su1 v\s0\().4s, v\s3\().4s
.endm
- /*
- * The SHA1 round constants
- */
- .align 4
-.Lsha1_rcon:
- .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
+ .macro loadrc, k, val, tmp
+ movz \tmp, :abs_g0_nc:\val
+ movk \tmp, :abs_g1:\val
+ dup \k, \tmp
+ .endm
/*
- * void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
- * int blocks)
+ * int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
+ * int blocks)
*/
-ENTRY(sha1_ce_transform)
+SYM_FUNC_START(sha1_ce_transform)
/* load round constants */
- adr x6, .Lsha1_rcon
- ld1r {k0.4s}, [x6], #4
- ld1r {k1.4s}, [x6], #4
- ld1r {k2.4s}, [x6], #4
- ld1r {k3.4s}, [x6]
+ loadrc k0.4s, 0x5a827999, w6
+ loadrc k1.4s, 0x6ed9eba1, w6
+ loadrc k2.4s, 0x8f1bbcdc, w6
+ loadrc k3.4s, 0xca62c1d6, w6
/* load state */
ld1 {dgav.4s}, [x0]
@@ -125,14 +120,16 @@ CPU_LE( rev32 v11.16b, v11.16b )
add dgbv.2s, dgbv.2s, dg1v.2s
add dgav.4s, dgav.4s, dg0v.4s
- cbnz w2, 0b
+ cbz w2, 2f
+ cond_yield 3f, x5, x6
+ b 0b
/*
* Final block: add padding and total bit count.
* Skip if the input size was not a round multiple of the block size,
* the padding is handled by the C code in that case.
*/
- cbz x4, 3f
+2: cbz x4, 3f
ldr_l w4, sha1_ce_offsetof_count, x4
ldr x4, [x0, x4]
movi v9.2d, #0
@@ -148,5 +145,6 @@ CPU_LE( rev32 v11.16b, v11.16b )
/* store new state */
3: st1 {dgav.4s}, [x0]
str dgb, [x0, #16]
+ mov w0, w2
ret
-ENDPROC(sha1_ce_transform)
+SYM_FUNC_END(sha1_ce_transform)
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
index efbeb3e0dcfb..71fa4f1122d7 100644
--- a/arch/arm64/crypto/sha1-ce-glue.c
+++ b/arch/arm64/crypto/sha1-ce-glue.c
@@ -1,18 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions
*
* Copyright (C) 2014 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
-#include <crypto/sha.h>
+#include <crypto/internal/simd.h>
+#include <crypto/sha1.h>
#include <crypto/sha1_base.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
@@ -21,14 +19,33 @@
MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("sha1");
struct sha1_ce_state {
struct sha1_state sst;
u32 finalize;
};
-asmlinkage void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
- int blocks);
+extern const u32 sha1_ce_offsetof_count;
+extern const u32 sha1_ce_offsetof_finalize;
+
+asmlinkage int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
+ int blocks);
+
+static void __sha1_ce_transform(struct sha1_state *sst, u8 const *src,
+ int blocks)
+{
+ while (blocks) {
+ int rem;
+
+ kernel_neon_begin();
+ rem = sha1_ce_transform(container_of(sst, struct sha1_ce_state,
+ sst), src, blocks);
+ kernel_neon_end();
+ src += (blocks - rem) * SHA1_BLOCK_SIZE;
+ blocks = rem;
+ }
+}
const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count);
const u32 sha1_ce_offsetof_finalize = offsetof(struct sha1_ce_state, finalize);
@@ -38,14 +55,11 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data,
{
struct sha1_ce_state *sctx = shash_desc_ctx(desc);
- if (!may_use_simd())
+ if (!crypto_simd_usable())
return crypto_sha1_update(desc, data, len);
sctx->finalize = 0;
- kernel_neon_begin();
- sha1_base_do_update(desc, data, len,
- (sha1_block_fn *)sha1_ce_transform);
- kernel_neon_end();
+ sha1_base_do_update(desc, data, len, __sha1_ce_transform);
return 0;
}
@@ -54,9 +68,9 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
struct sha1_ce_state *sctx = shash_desc_ctx(desc);
- bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE);
+ bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE) && len;
- if (!may_use_simd())
+ if (!crypto_simd_usable())
return crypto_sha1_finup(desc, data, len, out);
/*
@@ -65,12 +79,9 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
*/
sctx->finalize = finalize;
- kernel_neon_begin();
- sha1_base_do_update(desc, data, len,
- (sha1_block_fn *)sha1_ce_transform);
+ sha1_base_do_update(desc, data, len, __sha1_ce_transform);
if (!finalize)
- sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_ce_transform);
- kernel_neon_end();
+ sha1_base_do_finalize(desc, __sha1_ce_transform);
return sha1_base_finish(desc, out);
}
@@ -78,28 +89,45 @@ static int sha1_ce_final(struct shash_desc *desc, u8 *out)
{
struct sha1_ce_state *sctx = shash_desc_ctx(desc);
- if (!may_use_simd())
+ if (!crypto_simd_usable())
return crypto_sha1_finup(desc, NULL, 0, out);
sctx->finalize = 0;
- kernel_neon_begin();
- sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_ce_transform);
- kernel_neon_end();
+ sha1_base_do_finalize(desc, __sha1_ce_transform);
return sha1_base_finish(desc, out);
}
+static int sha1_ce_export(struct shash_desc *desc, void *out)
+{
+ struct sha1_ce_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, &sctx->sst, sizeof(struct sha1_state));
+ return 0;
+}
+
+static int sha1_ce_import(struct shash_desc *desc, const void *in)
+{
+ struct sha1_ce_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(&sctx->sst, in, sizeof(struct sha1_state));
+ sctx->finalize = 0;
+ return 0;
+}
+
static struct shash_alg alg = {
.init = sha1_base_init,
.update = sha1_ce_update,
.final = sha1_ce_final,
.finup = sha1_ce_finup,
+ .import = sha1_ce_import,
+ .export = sha1_ce_export,
.descsize = sizeof(struct sha1_ce_state),
+ .statesize = sizeof(struct sha1_state),
.digestsize = SHA1_DIGEST_SIZE,
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1-ce",
.cra_priority = 200,
- .cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
index 679c6c002f4f..491179922f49 100644
--- a/arch/arm64/crypto/sha2-ce-core.S
+++ b/arch/arm64/crypto/sha2-ce-core.S
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
*
* Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/linkage.h>
@@ -53,6 +50,7 @@
/*
* The SHA-256 round constants
*/
+ .section ".rodata", "a"
.align 4
.Lsha2_rcon:
.word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
@@ -76,9 +74,10 @@
* void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
* int blocks)
*/
-ENTRY(sha2_ce_transform)
+ .text
+SYM_FUNC_START(sha2_ce_transform)
/* load round constants */
- adr x8, .Lsha2_rcon
+ adr_l x8, .Lsha2_rcon
ld1 { v0.4s- v3.4s}, [x8], #64
ld1 { v4.4s- v7.4s}, [x8], #64
ld1 { v8.4s-v11.4s}, [x8], #64
@@ -129,14 +128,16 @@ CPU_LE( rev32 v19.16b, v19.16b )
add dgbv.4s, dgbv.4s, dg1v.4s
/* handled all input blocks? */
- cbnz w2, 0b
+ cbz w2, 2f
+ cond_yield 3f, x5, x6
+ b 0b
/*
* Final block: add padding and total bit count.
* Skip if the input size was not a round multiple of the block size,
* the padding is handled by the C code in that case.
*/
- cbz x4, 3f
+2: cbz x4, 3f
ldr_l w4, sha256_ce_offsetof_count, x4
ldr x4, [x0, x4]
movi v17.2d, #0
@@ -151,5 +152,6 @@ CPU_LE( rev32 v19.16b, v19.16b )
/* store new state */
3: st1 {dgav.4s, dgbv.4s}, [x0]
+ mov w0, w2
ret
-ENDPROC(sha2_ce_transform)
+SYM_FUNC_END(sha2_ce_transform)
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
index fd1ff2b13dfa..c57a6119fefc 100644
--- a/arch/arm64/crypto/sha2-ce-glue.c
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -1,18 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions
*
* Copyright (C) 2014 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
-#include <crypto/sha.h>
+#include <crypto/internal/simd.h>
+#include <crypto/sha2.h>
#include <crypto/sha256_base.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
@@ -21,14 +19,34 @@
MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("sha224");
+MODULE_ALIAS_CRYPTO("sha256");
struct sha256_ce_state {
struct sha256_state sst;
u32 finalize;
};
-asmlinkage void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
- int blocks);
+extern const u32 sha256_ce_offsetof_count;
+extern const u32 sha256_ce_offsetof_finalize;
+
+asmlinkage int sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
+ int blocks);
+
+static void __sha2_ce_transform(struct sha256_state *sst, u8 const *src,
+ int blocks)
+{
+ while (blocks) {
+ int rem;
+
+ kernel_neon_begin();
+ rem = sha2_ce_transform(container_of(sst, struct sha256_ce_state,
+ sst), src, blocks);
+ kernel_neon_end();
+ src += (blocks - rem) * SHA256_BLOCK_SIZE;
+ blocks = rem;
+ }
+}
const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state,
sst.count);
@@ -37,20 +55,23 @@ const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state,
asmlinkage void sha256_block_data_order(u32 *digest, u8 const *src, int blocks);
+static void __sha256_block_data_order(struct sha256_state *sst, u8 const *src,
+ int blocks)
+{
+ sha256_block_data_order(sst->state, src, blocks);
+}
+
static int sha256_ce_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
struct sha256_ce_state *sctx = shash_desc_ctx(desc);
- if (!may_use_simd())
+ if (!crypto_simd_usable())
return sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_block_data_order);
+ __sha256_block_data_order);
sctx->finalize = 0;
- kernel_neon_begin();
- sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha2_ce_transform);
- kernel_neon_end();
+ sha256_base_do_update(desc, data, len, __sha2_ce_transform);
return 0;
}
@@ -59,14 +80,13 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
struct sha256_ce_state *sctx = shash_desc_ctx(desc);
- bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE);
+ bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE) && len;
- if (!may_use_simd()) {
+ if (!crypto_simd_usable()) {
if (len)
sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_block_data_order);
- sha256_base_do_finalize(desc,
- (sha256_block_fn *)sha256_block_data_order);
+ __sha256_block_data_order);
+ sha256_base_do_finalize(desc, __sha256_block_data_order);
return sha256_base_finish(desc, out);
}
@@ -76,13 +96,9 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data,
*/
sctx->finalize = finalize;
- kernel_neon_begin();
- sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha2_ce_transform);
+ sha256_base_do_update(desc, data, len, __sha2_ce_transform);
if (!finalize)
- sha256_base_do_finalize(desc,
- (sha256_block_fn *)sha2_ce_transform);
- kernel_neon_end();
+ sha256_base_do_finalize(desc, __sha2_ce_transform);
return sha256_base_finish(desc, out);
}
@@ -90,31 +106,47 @@ static int sha256_ce_final(struct shash_desc *desc, u8 *out)
{
struct sha256_ce_state *sctx = shash_desc_ctx(desc);
- if (!may_use_simd()) {
- sha256_base_do_finalize(desc,
- (sha256_block_fn *)sha256_block_data_order);
+ if (!crypto_simd_usable()) {
+ sha256_base_do_finalize(desc, __sha256_block_data_order);
return sha256_base_finish(desc, out);
}
sctx->finalize = 0;
- kernel_neon_begin();
- sha256_base_do_finalize(desc, (sha256_block_fn *)sha2_ce_transform);
- kernel_neon_end();
+ sha256_base_do_finalize(desc, __sha2_ce_transform);
return sha256_base_finish(desc, out);
}
+static int sha256_ce_export(struct shash_desc *desc, void *out)
+{
+ struct sha256_ce_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, &sctx->sst, sizeof(struct sha256_state));
+ return 0;
+}
+
+static int sha256_ce_import(struct shash_desc *desc, const void *in)
+{
+ struct sha256_ce_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(&sctx->sst, in, sizeof(struct sha256_state));
+ sctx->finalize = 0;
+ return 0;
+}
+
static struct shash_alg algs[] = { {
.init = sha224_base_init,
.update = sha256_ce_update,
.final = sha256_ce_final,
.finup = sha256_ce_finup,
+ .export = sha256_ce_export,
+ .import = sha256_ce_import,
.descsize = sizeof(struct sha256_ce_state),
+ .statesize = sizeof(struct sha256_state),
.digestsize = SHA224_DIGEST_SIZE,
.base = {
.cra_name = "sha224",
.cra_driver_name = "sha224-ce",
.cra_priority = 200,
- .cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@@ -123,13 +155,15 @@ static struct shash_alg algs[] = { {
.update = sha256_ce_update,
.final = sha256_ce_final,
.finup = sha256_ce_finup,
+ .export = sha256_ce_export,
+ .import = sha256_ce_import,
.descsize = sizeof(struct sha256_ce_state),
+ .statesize = sizeof(struct sha256_state),
.digestsize = SHA256_DIGEST_SIZE,
.base = {
.cra_name = "sha256",
.cra_driver_name = "sha256-ce",
.cra_priority = 200,
- .cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
diff --git a/arch/arm64/crypto/sha256-core.S_shipped b/arch/arm64/crypto/sha256-core.S_shipped
deleted file mode 100644
index 3ce82cc860bc..000000000000
--- a/arch/arm64/crypto/sha256-core.S_shipped
+++ /dev/null
@@ -1,2061 +0,0 @@
-// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the OpenSSL license (the "License"). You may not use
-// this file except in compliance with the License. You can obtain a copy
-// in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-
-// ====================================================================
-// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-// project. The module is, however, dual licensed under OpenSSL and
-// CRYPTOGAMS licenses depending on where you obtain it. For further
-// details see http://www.openssl.org/~appro/cryptogams/.
-//
-// Permission to use under GPLv2 terms is granted.
-// ====================================================================
-//
-// SHA256/512 for ARMv8.
-//
-// Performance in cycles per processed byte and improvement coefficient
-// over code generated with "default" compiler:
-//
-// SHA256-hw SHA256(*) SHA512
-// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
-// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
-// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
-// Denver 2.01 10.5 (+26%) 6.70 (+8%)
-// X-Gene 20.0 (+100%) 12.8 (+300%(***))
-// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
-//
-// (*) Software SHA256 results are of lesser relevance, presented
-// mostly for informational purposes.
-// (**) The result is a trade-off: it's possible to improve it by
-// 10% (or by 1 cycle per round), but at the cost of 20% loss
-// on Cortex-A53 (or by 4 cycles per round).
-// (***) Super-impressive coefficients over gcc-generated code are
-// indication of some compiler "pathology", most notably code
-// generated with -mgeneral-regs-only is significanty faster
-// and the gap is only 40-90%.
-//
-// October 2016.
-//
-// Originally it was reckoned that it makes no sense to implement NEON
-// version of SHA256 for 64-bit processors. This is because performance
-// improvement on most wide-spread Cortex-A5x processors was observed
-// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
-// observed that 32-bit NEON SHA256 performs significantly better than
-// 64-bit scalar version on *some* of the more recent processors. As
-// result 64-bit NEON version of SHA256 was added to provide best
-// all-round performance. For example it executes ~30% faster on X-Gene
-// and Mongoose. [For reference, NEON version of SHA512 is bound to
-// deliver much less improvement, likely *negative* on Cortex-A5x.
-// Which is why NEON support is limited to SHA256.]
-
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#endif
-
-.text
-
-.extern OPENSSL_armcap_P
-.globl sha256_block_data_order
-.type sha256_block_data_order,%function
-.align 6
-sha256_block_data_order:
-#ifndef __KERNEL__
-# ifdef __ILP32__
- ldrsw x16,.LOPENSSL_armcap_P
-# else
- ldr x16,.LOPENSSL_armcap_P
-# endif
- adr x17,.LOPENSSL_armcap_P
- add x16,x16,x17
- ldr w16,[x16]
- tst w16,#ARMV8_SHA256
- b.ne .Lv8_entry
- tst w16,#ARMV7_NEON
- b.ne .Lneon_entry
-#endif
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
-
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#4*4
-
- ldp w20,w21,[x0] // load context
- ldp w22,w23,[x0,#2*4]
- ldp w24,w25,[x0,#4*4]
- add x2,x1,x2,lsl#6 // end of input
- ldp w26,w27,[x0,#6*4]
- adr x30,.LK256
- stp x0,x2,[x29,#96]
-
-.Loop:
- ldp w3,w4,[x1],#2*4
- ldr w19,[x30],#4 // *K++
- eor w28,w21,w22 // magic seed
- str x1,[x29,#112]
-#ifndef __AARCH64EB__
- rev w3,w3 // 0
-#endif
- ror w16,w24,#6
- add w27,w27,w19 // h+=K[i]
- eor w6,w24,w24,ror#14
- and w17,w25,w24
- bic w19,w26,w24
- add w27,w27,w3 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w20,w21 // a^b, b^c in next round
- eor w16,w16,w6,ror#11 // Sigma1(e)
- ror w6,w20,#2
- add w27,w27,w17 // h+=Ch(e,f,g)
- eor w17,w20,w20,ror#9
- add w27,w27,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w23,w23,w27 // d+=h
- eor w28,w28,w21 // Maj(a,b,c)
- eor w17,w6,w17,ror#13 // Sigma0(a)
- add w27,w27,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w27,w27,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w4,w4 // 1
-#endif
- ldp w5,w6,[x1],#2*4
- add w27,w27,w17 // h+=Sigma0(a)
- ror w16,w23,#6
- add w26,w26,w28 // h+=K[i]
- eor w7,w23,w23,ror#14
- and w17,w24,w23
- bic w28,w25,w23
- add w26,w26,w4 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w27,w20 // a^b, b^c in next round
- eor w16,w16,w7,ror#11 // Sigma1(e)
- ror w7,w27,#2
- add w26,w26,w17 // h+=Ch(e,f,g)
- eor w17,w27,w27,ror#9
- add w26,w26,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w22,w22,w26 // d+=h
- eor w19,w19,w20 // Maj(a,b,c)
- eor w17,w7,w17,ror#13 // Sigma0(a)
- add w26,w26,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w26,w26,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w5,w5 // 2
-#endif
- add w26,w26,w17 // h+=Sigma0(a)
- ror w16,w22,#6
- add w25,w25,w19 // h+=K[i]
- eor w8,w22,w22,ror#14
- and w17,w23,w22
- bic w19,w24,w22
- add w25,w25,w5 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w26,w27 // a^b, b^c in next round
- eor w16,w16,w8,ror#11 // Sigma1(e)
- ror w8,w26,#2
- add w25,w25,w17 // h+=Ch(e,f,g)
- eor w17,w26,w26,ror#9
- add w25,w25,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w21,w21,w25 // d+=h
- eor w28,w28,w27 // Maj(a,b,c)
- eor w17,w8,w17,ror#13 // Sigma0(a)
- add w25,w25,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w25,w25,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w6,w6 // 3
-#endif
- ldp w7,w8,[x1],#2*4
- add w25,w25,w17 // h+=Sigma0(a)
- ror w16,w21,#6
- add w24,w24,w28 // h+=K[i]
- eor w9,w21,w21,ror#14
- and w17,w22,w21
- bic w28,w23,w21
- add w24,w24,w6 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w25,w26 // a^b, b^c in next round
- eor w16,w16,w9,ror#11 // Sigma1(e)
- ror w9,w25,#2
- add w24,w24,w17 // h+=Ch(e,f,g)
- eor w17,w25,w25,ror#9
- add w24,w24,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w20,w20,w24 // d+=h
- eor w19,w19,w26 // Maj(a,b,c)
- eor w17,w9,w17,ror#13 // Sigma0(a)
- add w24,w24,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w24,w24,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w7,w7 // 4
-#endif
- add w24,w24,w17 // h+=Sigma0(a)
- ror w16,w20,#6
- add w23,w23,w19 // h+=K[i]
- eor w10,w20,w20,ror#14
- and w17,w21,w20
- bic w19,w22,w20
- add w23,w23,w7 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w24,w25 // a^b, b^c in next round
- eor w16,w16,w10,ror#11 // Sigma1(e)
- ror w10,w24,#2
- add w23,w23,w17 // h+=Ch(e,f,g)
- eor w17,w24,w24,ror#9
- add w23,w23,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w27,w27,w23 // d+=h
- eor w28,w28,w25 // Maj(a,b,c)
- eor w17,w10,w17,ror#13 // Sigma0(a)
- add w23,w23,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w23,w23,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w8,w8 // 5
-#endif
- ldp w9,w10,[x1],#2*4
- add w23,w23,w17 // h+=Sigma0(a)
- ror w16,w27,#6
- add w22,w22,w28 // h+=K[i]
- eor w11,w27,w27,ror#14
- and w17,w20,w27
- bic w28,w21,w27
- add w22,w22,w8 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w23,w24 // a^b, b^c in next round
- eor w16,w16,w11,ror#11 // Sigma1(e)
- ror w11,w23,#2
- add w22,w22,w17 // h+=Ch(e,f,g)
- eor w17,w23,w23,ror#9
- add w22,w22,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w26,w26,w22 // d+=h
- eor w19,w19,w24 // Maj(a,b,c)
- eor w17,w11,w17,ror#13 // Sigma0(a)
- add w22,w22,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w22,w22,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w9,w9 // 6
-#endif
- add w22,w22,w17 // h+=Sigma0(a)
- ror w16,w26,#6
- add w21,w21,w19 // h+=K[i]
- eor w12,w26,w26,ror#14
- and w17,w27,w26
- bic w19,w20,w26
- add w21,w21,w9 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w22,w23 // a^b, b^c in next round
- eor w16,w16,w12,ror#11 // Sigma1(e)
- ror w12,w22,#2
- add w21,w21,w17 // h+=Ch(e,f,g)
- eor w17,w22,w22,ror#9
- add w21,w21,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w25,w25,w21 // d+=h
- eor w28,w28,w23 // Maj(a,b,c)
- eor w17,w12,w17,ror#13 // Sigma0(a)
- add w21,w21,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w21,w21,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w10,w10 // 7
-#endif
- ldp w11,w12,[x1],#2*4
- add w21,w21,w17 // h+=Sigma0(a)
- ror w16,w25,#6
- add w20,w20,w28 // h+=K[i]
- eor w13,w25,w25,ror#14
- and w17,w26,w25
- bic w28,w27,w25
- add w20,w20,w10 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w21,w22 // a^b, b^c in next round
- eor w16,w16,w13,ror#11 // Sigma1(e)
- ror w13,w21,#2
- add w20,w20,w17 // h+=Ch(e,f,g)
- eor w17,w21,w21,ror#9
- add w20,w20,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w24,w24,w20 // d+=h
- eor w19,w19,w22 // Maj(a,b,c)
- eor w17,w13,w17,ror#13 // Sigma0(a)
- add w20,w20,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w20,w20,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w11,w11 // 8
-#endif
- add w20,w20,w17 // h+=Sigma0(a)
- ror w16,w24,#6
- add w27,w27,w19 // h+=K[i]
- eor w14,w24,w24,ror#14
- and w17,w25,w24
- bic w19,w26,w24
- add w27,w27,w11 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w20,w21 // a^b, b^c in next round
- eor w16,w16,w14,ror#11 // Sigma1(e)
- ror w14,w20,#2
- add w27,w27,w17 // h+=Ch(e,f,g)
- eor w17,w20,w20,ror#9
- add w27,w27,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w23,w23,w27 // d+=h
- eor w28,w28,w21 // Maj(a,b,c)
- eor w17,w14,w17,ror#13 // Sigma0(a)
- add w27,w27,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w27,w27,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w12,w12 // 9
-#endif
- ldp w13,w14,[x1],#2*4
- add w27,w27,w17 // h+=Sigma0(a)
- ror w16,w23,#6
- add w26,w26,w28 // h+=K[i]
- eor w15,w23,w23,ror#14
- and w17,w24,w23
- bic w28,w25,w23
- add w26,w26,w12 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w27,w20 // a^b, b^c in next round
- eor w16,w16,w15,ror#11 // Sigma1(e)
- ror w15,w27,#2
- add w26,w26,w17 // h+=Ch(e,f,g)
- eor w17,w27,w27,ror#9
- add w26,w26,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w22,w22,w26 // d+=h
- eor w19,w19,w20 // Maj(a,b,c)
- eor w17,w15,w17,ror#13 // Sigma0(a)
- add w26,w26,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w26,w26,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w13,w13 // 10
-#endif
- add w26,w26,w17 // h+=Sigma0(a)
- ror w16,w22,#6
- add w25,w25,w19 // h+=K[i]
- eor w0,w22,w22,ror#14
- and w17,w23,w22
- bic w19,w24,w22
- add w25,w25,w13 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w26,w27 // a^b, b^c in next round
- eor w16,w16,w0,ror#11 // Sigma1(e)
- ror w0,w26,#2
- add w25,w25,w17 // h+=Ch(e,f,g)
- eor w17,w26,w26,ror#9
- add w25,w25,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w21,w21,w25 // d+=h
- eor w28,w28,w27 // Maj(a,b,c)
- eor w17,w0,w17,ror#13 // Sigma0(a)
- add w25,w25,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w25,w25,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w14,w14 // 11
-#endif
- ldp w15,w0,[x1],#2*4
- add w25,w25,w17 // h+=Sigma0(a)
- str w6,[sp,#12]
- ror w16,w21,#6
- add w24,w24,w28 // h+=K[i]
- eor w6,w21,w21,ror#14
- and w17,w22,w21
- bic w28,w23,w21
- add w24,w24,w14 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w25,w26 // a^b, b^c in next round
- eor w16,w16,w6,ror#11 // Sigma1(e)
- ror w6,w25,#2
- add w24,w24,w17 // h+=Ch(e,f,g)
- eor w17,w25,w25,ror#9
- add w24,w24,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w20,w20,w24 // d+=h
- eor w19,w19,w26 // Maj(a,b,c)
- eor w17,w6,w17,ror#13 // Sigma0(a)
- add w24,w24,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w24,w24,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w15,w15 // 12
-#endif
- add w24,w24,w17 // h+=Sigma0(a)
- str w7,[sp,#0]
- ror w16,w20,#6
- add w23,w23,w19 // h+=K[i]
- eor w7,w20,w20,ror#14
- and w17,w21,w20
- bic w19,w22,w20
- add w23,w23,w15 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w24,w25 // a^b, b^c in next round
- eor w16,w16,w7,ror#11 // Sigma1(e)
- ror w7,w24,#2
- add w23,w23,w17 // h+=Ch(e,f,g)
- eor w17,w24,w24,ror#9
- add w23,w23,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w27,w27,w23 // d+=h
- eor w28,w28,w25 // Maj(a,b,c)
- eor w17,w7,w17,ror#13 // Sigma0(a)
- add w23,w23,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w23,w23,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w0,w0 // 13
-#endif
- ldp w1,w2,[x1]
- add w23,w23,w17 // h+=Sigma0(a)
- str w8,[sp,#4]
- ror w16,w27,#6
- add w22,w22,w28 // h+=K[i]
- eor w8,w27,w27,ror#14
- and w17,w20,w27
- bic w28,w21,w27
- add w22,w22,w0 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w23,w24 // a^b, b^c in next round
- eor w16,w16,w8,ror#11 // Sigma1(e)
- ror w8,w23,#2
- add w22,w22,w17 // h+=Ch(e,f,g)
- eor w17,w23,w23,ror#9
- add w22,w22,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w26,w26,w22 // d+=h
- eor w19,w19,w24 // Maj(a,b,c)
- eor w17,w8,w17,ror#13 // Sigma0(a)
- add w22,w22,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w22,w22,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w1,w1 // 14
-#endif
- ldr w6,[sp,#12]
- add w22,w22,w17 // h+=Sigma0(a)
- str w9,[sp,#8]
- ror w16,w26,#6
- add w21,w21,w19 // h+=K[i]
- eor w9,w26,w26,ror#14
- and w17,w27,w26
- bic w19,w20,w26
- add w21,w21,w1 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w22,w23 // a^b, b^c in next round
- eor w16,w16,w9,ror#11 // Sigma1(e)
- ror w9,w22,#2
- add w21,w21,w17 // h+=Ch(e,f,g)
- eor w17,w22,w22,ror#9
- add w21,w21,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w25,w25,w21 // d+=h
- eor w28,w28,w23 // Maj(a,b,c)
- eor w17,w9,w17,ror#13 // Sigma0(a)
- add w21,w21,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w21,w21,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w2,w2 // 15
-#endif
- ldr w7,[sp,#0]
- add w21,w21,w17 // h+=Sigma0(a)
- str w10,[sp,#12]
- ror w16,w25,#6
- add w20,w20,w28 // h+=K[i]
- ror w9,w4,#7
- and w17,w26,w25
- ror w8,w1,#17
- bic w28,w27,w25
- ror w10,w21,#2
- add w20,w20,w2 // h+=X[i]
- eor w16,w16,w25,ror#11
- eor w9,w9,w4,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w21,w22 // a^b, b^c in next round
- eor w16,w16,w25,ror#25 // Sigma1(e)
- eor w10,w10,w21,ror#13
- add w20,w20,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w8,w8,w1,ror#19
- eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
- add w20,w20,w16 // h+=Sigma1(e)
- eor w19,w19,w22 // Maj(a,b,c)
- eor w17,w10,w21,ror#22 // Sigma0(a)
- eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
- add w3,w3,w12
- add w24,w24,w20 // d+=h
- add w20,w20,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w3,w3,w9
- add w20,w20,w17 // h+=Sigma0(a)
- add w3,w3,w8
-.Loop_16_xx:
- ldr w8,[sp,#4]
- str w11,[sp,#0]
- ror w16,w24,#6
- add w27,w27,w19 // h+=K[i]
- ror w10,w5,#7
- and w17,w25,w24
- ror w9,w2,#17
- bic w19,w26,w24
- ror w11,w20,#2
- add w27,w27,w3 // h+=X[i]
- eor w16,w16,w24,ror#11
- eor w10,w10,w5,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w20,w21 // a^b, b^c in next round
- eor w16,w16,w24,ror#25 // Sigma1(e)
- eor w11,w11,w20,ror#13
- add w27,w27,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w9,w9,w2,ror#19
- eor w10,w10,w5,lsr#3 // sigma0(X[i+1])
- add w27,w27,w16 // h+=Sigma1(e)
- eor w28,w28,w21 // Maj(a,b,c)
- eor w17,w11,w20,ror#22 // Sigma0(a)
- eor w9,w9,w2,lsr#10 // sigma1(X[i+14])
- add w4,w4,w13
- add w23,w23,w27 // d+=h
- add w27,w27,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w4,w4,w10
- add w27,w27,w17 // h+=Sigma0(a)
- add w4,w4,w9
- ldr w9,[sp,#8]
- str w12,[sp,#4]
- ror w16,w23,#6
- add w26,w26,w28 // h+=K[i]
- ror w11,w6,#7
- and w17,w24,w23
- ror w10,w3,#17
- bic w28,w25,w23
- ror w12,w27,#2
- add w26,w26,w4 // h+=X[i]
- eor w16,w16,w23,ror#11
- eor w11,w11,w6,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w27,w20 // a^b, b^c in next round
- eor w16,w16,w23,ror#25 // Sigma1(e)
- eor w12,w12,w27,ror#13
- add w26,w26,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w10,w10,w3,ror#19
- eor w11,w11,w6,lsr#3 // sigma0(X[i+1])
- add w26,w26,w16 // h+=Sigma1(e)
- eor w19,w19,w20 // Maj(a,b,c)
- eor w17,w12,w27,ror#22 // Sigma0(a)
- eor w10,w10,w3,lsr#10 // sigma1(X[i+14])
- add w5,w5,w14
- add w22,w22,w26 // d+=h
- add w26,w26,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w5,w5,w11
- add w26,w26,w17 // h+=Sigma0(a)
- add w5,w5,w10
- ldr w10,[sp,#12]
- str w13,[sp,#8]
- ror w16,w22,#6
- add w25,w25,w19 // h+=K[i]
- ror w12,w7,#7
- and w17,w23,w22
- ror w11,w4,#17
- bic w19,w24,w22
- ror w13,w26,#2
- add w25,w25,w5 // h+=X[i]
- eor w16,w16,w22,ror#11
- eor w12,w12,w7,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w26,w27 // a^b, b^c in next round
- eor w16,w16,w22,ror#25 // Sigma1(e)
- eor w13,w13,w26,ror#13
- add w25,w25,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w11,w11,w4,ror#19
- eor w12,w12,w7,lsr#3 // sigma0(X[i+1])
- add w25,w25,w16 // h+=Sigma1(e)
- eor w28,w28,w27 // Maj(a,b,c)
- eor w17,w13,w26,ror#22 // Sigma0(a)
- eor w11,w11,w4,lsr#10 // sigma1(X[i+14])
- add w6,w6,w15
- add w21,w21,w25 // d+=h
- add w25,w25,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w6,w6,w12
- add w25,w25,w17 // h+=Sigma0(a)
- add w6,w6,w11
- ldr w11,[sp,#0]
- str w14,[sp,#12]
- ror w16,w21,#6
- add w24,w24,w28 // h+=K[i]
- ror w13,w8,#7
- and w17,w22,w21
- ror w12,w5,#17
- bic w28,w23,w21
- ror w14,w25,#2
- add w24,w24,w6 // h+=X[i]
- eor w16,w16,w21,ror#11
- eor w13,w13,w8,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w25,w26 // a^b, b^c in next round
- eor w16,w16,w21,ror#25 // Sigma1(e)
- eor w14,w14,w25,ror#13
- add w24,w24,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w12,w12,w5,ror#19
- eor w13,w13,w8,lsr#3 // sigma0(X[i+1])
- add w24,w24,w16 // h+=Sigma1(e)
- eor w19,w19,w26 // Maj(a,b,c)
- eor w17,w14,w25,ror#22 // Sigma0(a)
- eor w12,w12,w5,lsr#10 // sigma1(X[i+14])
- add w7,w7,w0
- add w20,w20,w24 // d+=h
- add w24,w24,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w7,w7,w13
- add w24,w24,w17 // h+=Sigma0(a)
- add w7,w7,w12
- ldr w12,[sp,#4]
- str w15,[sp,#0]
- ror w16,w20,#6
- add w23,w23,w19 // h+=K[i]
- ror w14,w9,#7
- and w17,w21,w20
- ror w13,w6,#17
- bic w19,w22,w20
- ror w15,w24,#2
- add w23,w23,w7 // h+=X[i]
- eor w16,w16,w20,ror#11
- eor w14,w14,w9,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w24,w25 // a^b, b^c in next round
- eor w16,w16,w20,ror#25 // Sigma1(e)
- eor w15,w15,w24,ror#13
- add w23,w23,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w13,w13,w6,ror#19
- eor w14,w14,w9,lsr#3 // sigma0(X[i+1])
- add w23,w23,w16 // h+=Sigma1(e)
- eor w28,w28,w25 // Maj(a,b,c)
- eor w17,w15,w24,ror#22 // Sigma0(a)
- eor w13,w13,w6,lsr#10 // sigma1(X[i+14])
- add w8,w8,w1
- add w27,w27,w23 // d+=h
- add w23,w23,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w8,w8,w14
- add w23,w23,w17 // h+=Sigma0(a)
- add w8,w8,w13
- ldr w13,[sp,#8]
- str w0,[sp,#4]
- ror w16,w27,#6
- add w22,w22,w28 // h+=K[i]
- ror w15,w10,#7
- and w17,w20,w27
- ror w14,w7,#17
- bic w28,w21,w27
- ror w0,w23,#2
- add w22,w22,w8 // h+=X[i]
- eor w16,w16,w27,ror#11
- eor w15,w15,w10,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w23,w24 // a^b, b^c in next round
- eor w16,w16,w27,ror#25 // Sigma1(e)
- eor w0,w0,w23,ror#13
- add w22,w22,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w14,w14,w7,ror#19
- eor w15,w15,w10,lsr#3 // sigma0(X[i+1])
- add w22,w22,w16 // h+=Sigma1(e)
- eor w19,w19,w24 // Maj(a,b,c)
- eor w17,w0,w23,ror#22 // Sigma0(a)
- eor w14,w14,w7,lsr#10 // sigma1(X[i+14])
- add w9,w9,w2
- add w26,w26,w22 // d+=h
- add w22,w22,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w9,w9,w15
- add w22,w22,w17 // h+=Sigma0(a)
- add w9,w9,w14
- ldr w14,[sp,#12]
- str w1,[sp,#8]
- ror w16,w26,#6
- add w21,w21,w19 // h+=K[i]
- ror w0,w11,#7
- and w17,w27,w26
- ror w15,w8,#17
- bic w19,w20,w26
- ror w1,w22,#2
- add w21,w21,w9 // h+=X[i]
- eor w16,w16,w26,ror#11
- eor w0,w0,w11,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w22,w23 // a^b, b^c in next round
- eor w16,w16,w26,ror#25 // Sigma1(e)
- eor w1,w1,w22,ror#13
- add w21,w21,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w15,w15,w8,ror#19
- eor w0,w0,w11,lsr#3 // sigma0(X[i+1])
- add w21,w21,w16 // h+=Sigma1(e)
- eor w28,w28,w23 // Maj(a,b,c)
- eor w17,w1,w22,ror#22 // Sigma0(a)
- eor w15,w15,w8,lsr#10 // sigma1(X[i+14])
- add w10,w10,w3
- add w25,w25,w21 // d+=h
- add w21,w21,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w10,w10,w0
- add w21,w21,w17 // h+=Sigma0(a)
- add w10,w10,w15
- ldr w15,[sp,#0]
- str w2,[sp,#12]
- ror w16,w25,#6
- add w20,w20,w28 // h+=K[i]
- ror w1,w12,#7
- and w17,w26,w25
- ror w0,w9,#17
- bic w28,w27,w25
- ror w2,w21,#2
- add w20,w20,w10 // h+=X[i]
- eor w16,w16,w25,ror#11
- eor w1,w1,w12,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w21,w22 // a^b, b^c in next round
- eor w16,w16,w25,ror#25 // Sigma1(e)
- eor w2,w2,w21,ror#13
- add w20,w20,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w0,w0,w9,ror#19
- eor w1,w1,w12,lsr#3 // sigma0(X[i+1])
- add w20,w20,w16 // h+=Sigma1(e)
- eor w19,w19,w22 // Maj(a,b,c)
- eor w17,w2,w21,ror#22 // Sigma0(a)
- eor w0,w0,w9,lsr#10 // sigma1(X[i+14])
- add w11,w11,w4
- add w24,w24,w20 // d+=h
- add w20,w20,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w11,w11,w1
- add w20,w20,w17 // h+=Sigma0(a)
- add w11,w11,w0
- ldr w0,[sp,#4]
- str w3,[sp,#0]
- ror w16,w24,#6
- add w27,w27,w19 // h+=K[i]
- ror w2,w13,#7
- and w17,w25,w24
- ror w1,w10,#17
- bic w19,w26,w24
- ror w3,w20,#2
- add w27,w27,w11 // h+=X[i]
- eor w16,w16,w24,ror#11
- eor w2,w2,w13,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w20,w21 // a^b, b^c in next round
- eor w16,w16,w24,ror#25 // Sigma1(e)
- eor w3,w3,w20,ror#13
- add w27,w27,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w1,w1,w10,ror#19
- eor w2,w2,w13,lsr#3 // sigma0(X[i+1])
- add w27,w27,w16 // h+=Sigma1(e)
- eor w28,w28,w21 // Maj(a,b,c)
- eor w17,w3,w20,ror#22 // Sigma0(a)
- eor w1,w1,w10,lsr#10 // sigma1(X[i+14])
- add w12,w12,w5
- add w23,w23,w27 // d+=h
- add w27,w27,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w12,w12,w2
- add w27,w27,w17 // h+=Sigma0(a)
- add w12,w12,w1
- ldr w1,[sp,#8]
- str w4,[sp,#4]
- ror w16,w23,#6
- add w26,w26,w28 // h+=K[i]
- ror w3,w14,#7
- and w17,w24,w23
- ror w2,w11,#17
- bic w28,w25,w23
- ror w4,w27,#2
- add w26,w26,w12 // h+=X[i]
- eor w16,w16,w23,ror#11
- eor w3,w3,w14,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w27,w20 // a^b, b^c in next round
- eor w16,w16,w23,ror#25 // Sigma1(e)
- eor w4,w4,w27,ror#13
- add w26,w26,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w2,w2,w11,ror#19
- eor w3,w3,w14,lsr#3 // sigma0(X[i+1])
- add w26,w26,w16 // h+=Sigma1(e)
- eor w19,w19,w20 // Maj(a,b,c)
- eor w17,w4,w27,ror#22 // Sigma0(a)
- eor w2,w2,w11,lsr#10 // sigma1(X[i+14])
- add w13,w13,w6
- add w22,w22,w26 // d+=h
- add w26,w26,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w13,w13,w3
- add w26,w26,w17 // h+=Sigma0(a)
- add w13,w13,w2
- ldr w2,[sp,#12]
- str w5,[sp,#8]
- ror w16,w22,#6
- add w25,w25,w19 // h+=K[i]
- ror w4,w15,#7
- and w17,w23,w22
- ror w3,w12,#17
- bic w19,w24,w22
- ror w5,w26,#2
- add w25,w25,w13 // h+=X[i]
- eor w16,w16,w22,ror#11
- eor w4,w4,w15,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w26,w27 // a^b, b^c in next round
- eor w16,w16,w22,ror#25 // Sigma1(e)
- eor w5,w5,w26,ror#13
- add w25,w25,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w3,w3,w12,ror#19
- eor w4,w4,w15,lsr#3 // sigma0(X[i+1])
- add w25,w25,w16 // h+=Sigma1(e)
- eor w28,w28,w27 // Maj(a,b,c)
- eor w17,w5,w26,ror#22 // Sigma0(a)
- eor w3,w3,w12,lsr#10 // sigma1(X[i+14])
- add w14,w14,w7
- add w21,w21,w25 // d+=h
- add w25,w25,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w14,w14,w4
- add w25,w25,w17 // h+=Sigma0(a)
- add w14,w14,w3
- ldr w3,[sp,#0]
- str w6,[sp,#12]
- ror w16,w21,#6
- add w24,w24,w28 // h+=K[i]
- ror w5,w0,#7
- and w17,w22,w21
- ror w4,w13,#17
- bic w28,w23,w21
- ror w6,w25,#2
- add w24,w24,w14 // h+=X[i]
- eor w16,w16,w21,ror#11
- eor w5,w5,w0,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w25,w26 // a^b, b^c in next round
- eor w16,w16,w21,ror#25 // Sigma1(e)
- eor w6,w6,w25,ror#13
- add w24,w24,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w4,w4,w13,ror#19
- eor w5,w5,w0,lsr#3 // sigma0(X[i+1])
- add w24,w24,w16 // h+=Sigma1(e)
- eor w19,w19,w26 // Maj(a,b,c)
- eor w17,w6,w25,ror#22 // Sigma0(a)
- eor w4,w4,w13,lsr#10 // sigma1(X[i+14])
- add w15,w15,w8
- add w20,w20,w24 // d+=h
- add w24,w24,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w15,w15,w5
- add w24,w24,w17 // h+=Sigma0(a)
- add w15,w15,w4
- ldr w4,[sp,#4]
- str w7,[sp,#0]
- ror w16,w20,#6
- add w23,w23,w19 // h+=K[i]
- ror w6,w1,#7
- and w17,w21,w20
- ror w5,w14,#17
- bic w19,w22,w20
- ror w7,w24,#2
- add w23,w23,w15 // h+=X[i]
- eor w16,w16,w20,ror#11
- eor w6,w6,w1,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w24,w25 // a^b, b^c in next round
- eor w16,w16,w20,ror#25 // Sigma1(e)
- eor w7,w7,w24,ror#13
- add w23,w23,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w5,w5,w14,ror#19
- eor w6,w6,w1,lsr#3 // sigma0(X[i+1])
- add w23,w23,w16 // h+=Sigma1(e)
- eor w28,w28,w25 // Maj(a,b,c)
- eor w17,w7,w24,ror#22 // Sigma0(a)
- eor w5,w5,w14,lsr#10 // sigma1(X[i+14])
- add w0,w0,w9
- add w27,w27,w23 // d+=h
- add w23,w23,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w0,w0,w6
- add w23,w23,w17 // h+=Sigma0(a)
- add w0,w0,w5
- ldr w5,[sp,#8]
- str w8,[sp,#4]
- ror w16,w27,#6
- add w22,w22,w28 // h+=K[i]
- ror w7,w2,#7
- and w17,w20,w27
- ror w6,w15,#17
- bic w28,w21,w27
- ror w8,w23,#2
- add w22,w22,w0 // h+=X[i]
- eor w16,w16,w27,ror#11
- eor w7,w7,w2,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w23,w24 // a^b, b^c in next round
- eor w16,w16,w27,ror#25 // Sigma1(e)
- eor w8,w8,w23,ror#13
- add w22,w22,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w6,w6,w15,ror#19
- eor w7,w7,w2,lsr#3 // sigma0(X[i+1])
- add w22,w22,w16 // h+=Sigma1(e)
- eor w19,w19,w24 // Maj(a,b,c)
- eor w17,w8,w23,ror#22 // Sigma0(a)
- eor w6,w6,w15,lsr#10 // sigma1(X[i+14])
- add w1,w1,w10
- add w26,w26,w22 // d+=h
- add w22,w22,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w1,w1,w7
- add w22,w22,w17 // h+=Sigma0(a)
- add w1,w1,w6
- ldr w6,[sp,#12]
- str w9,[sp,#8]
- ror w16,w26,#6
- add w21,w21,w19 // h+=K[i]
- ror w8,w3,#7
- and w17,w27,w26
- ror w7,w0,#17
- bic w19,w20,w26
- ror w9,w22,#2
- add w21,w21,w1 // h+=X[i]
- eor w16,w16,w26,ror#11
- eor w8,w8,w3,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w22,w23 // a^b, b^c in next round
- eor w16,w16,w26,ror#25 // Sigma1(e)
- eor w9,w9,w22,ror#13
- add w21,w21,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w7,w7,w0,ror#19
- eor w8,w8,w3,lsr#3 // sigma0(X[i+1])
- add w21,w21,w16 // h+=Sigma1(e)
- eor w28,w28,w23 // Maj(a,b,c)
- eor w17,w9,w22,ror#22 // Sigma0(a)
- eor w7,w7,w0,lsr#10 // sigma1(X[i+14])
- add w2,w2,w11
- add w25,w25,w21 // d+=h
- add w21,w21,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w2,w2,w8
- add w21,w21,w17 // h+=Sigma0(a)
- add w2,w2,w7
- ldr w7,[sp,#0]
- str w10,[sp,#12]
- ror w16,w25,#6
- add w20,w20,w28 // h+=K[i]
- ror w9,w4,#7
- and w17,w26,w25
- ror w8,w1,#17
- bic w28,w27,w25
- ror w10,w21,#2
- add w20,w20,w2 // h+=X[i]
- eor w16,w16,w25,ror#11
- eor w9,w9,w4,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w21,w22 // a^b, b^c in next round
- eor w16,w16,w25,ror#25 // Sigma1(e)
- eor w10,w10,w21,ror#13
- add w20,w20,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w8,w8,w1,ror#19
- eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
- add w20,w20,w16 // h+=Sigma1(e)
- eor w19,w19,w22 // Maj(a,b,c)
- eor w17,w10,w21,ror#22 // Sigma0(a)
- eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
- add w3,w3,w12
- add w24,w24,w20 // d+=h
- add w20,w20,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w3,w3,w9
- add w20,w20,w17 // h+=Sigma0(a)
- add w3,w3,w8
- cbnz w19,.Loop_16_xx
-
- ldp x0,x2,[x29,#96]
- ldr x1,[x29,#112]
- sub x30,x30,#260 // rewind
-
- ldp w3,w4,[x0]
- ldp w5,w6,[x0,#2*4]
- add x1,x1,#14*4 // advance input pointer
- ldp w7,w8,[x0,#4*4]
- add w20,w20,w3
- ldp w9,w10,[x0,#6*4]
- add w21,w21,w4
- add w22,w22,w5
- add w23,w23,w6
- stp w20,w21,[x0]
- add w24,w24,w7
- add w25,w25,w8
- stp w22,w23,[x0,#2*4]
- add w26,w26,w9
- add w27,w27,w10
- cmp x1,x2
- stp w24,w25,[x0,#4*4]
- stp w26,w27,[x0,#6*4]
- b.ne .Loop
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#4*4
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#128
- ret
-.size sha256_block_data_order,.-sha256_block_data_order
-
-.align 6
-.type .LK256,%object
-.LK256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
- .long 0 //terminator
-.size .LK256,.-.LK256
-#ifndef __KERNEL__
-.align 3
-.LOPENSSL_armcap_P:
-# ifdef __ILP32__
- .long OPENSSL_armcap_P-.
-# else
- .quad OPENSSL_armcap_P-.
-# endif
-#endif
-.asciz "SHA256 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
-.align 2
-#ifndef __KERNEL__
-.type sha256_block_armv8,%function
-.align 6
-sha256_block_armv8:
-.Lv8_entry:
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-
- ld1 {v0.4s,v1.4s},[x0]
- adr x3,.LK256
-
-.Loop_hw:
- ld1 {v4.16b-v7.16b},[x1],#64
- sub x2,x2,#1
- ld1 {v16.4s},[x3],#16
- rev32 v4.16b,v4.16b
- rev32 v5.16b,v5.16b
- rev32 v6.16b,v6.16b
- rev32 v7.16b,v7.16b
- orr v18.16b,v0.16b,v0.16b // offload
- orr v19.16b,v1.16b,v1.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v4.4s
- .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
- .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
- .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v5.4s
- .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
- .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
- .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v6.4s
- .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
- .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
- .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v7.4s
- .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
- .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
- .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v4.4s
- .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
- .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
- .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v5.4s
- .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
- .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
- .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v6.4s
- .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
- .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
- .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v7.4s
- .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
- .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
- .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v4.4s
- .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
- .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
- .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v5.4s
- .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
- .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
- .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v6.4s
- .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
- .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
- .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v7.4s
- .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
- .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
- .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v4.4s
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
- .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
-
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v5.4s
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
- .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
-
- ld1 {v17.4s},[x3]
- add v16.4s,v16.4s,v6.4s
- sub x3,x3,#64*4-16 // rewind
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
- .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
-
- add v17.4s,v17.4s,v7.4s
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
- .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
-
- add v0.4s,v0.4s,v18.4s
- add v1.4s,v1.4s,v19.4s
-
- cbnz x2,.Loop_hw
-
- st1 {v0.4s,v1.4s},[x0]
-
- ldr x29,[sp],#16
- ret
-.size sha256_block_armv8,.-sha256_block_armv8
-#endif
-#ifdef __KERNEL__
-.globl sha256_block_neon
-#endif
-.type sha256_block_neon,%function
-.align 4
-sha256_block_neon:
-.Lneon_entry:
- stp x29, x30, [sp, #-16]!
- mov x29, sp
- sub sp,sp,#16*4
-
- adr x16,.LK256
- add x2,x1,x2,lsl#6 // len to point at the end of inp
-
- ld1 {v0.16b},[x1], #16
- ld1 {v1.16b},[x1], #16
- ld1 {v2.16b},[x1], #16
- ld1 {v3.16b},[x1], #16
- ld1 {v4.4s},[x16], #16
- ld1 {v5.4s},[x16], #16
- ld1 {v6.4s},[x16], #16
- ld1 {v7.4s},[x16], #16
- rev32 v0.16b,v0.16b // yes, even on
- rev32 v1.16b,v1.16b // big-endian
- rev32 v2.16b,v2.16b
- rev32 v3.16b,v3.16b
- mov x17,sp
- add v4.4s,v4.4s,v0.4s
- add v5.4s,v5.4s,v1.4s
- add v6.4s,v6.4s,v2.4s
- st1 {v4.4s-v5.4s},[x17], #32
- add v7.4s,v7.4s,v3.4s
- st1 {v6.4s-v7.4s},[x17]
- sub x17,x17,#32
-
- ldp w3,w4,[x0]
- ldp w5,w6,[x0,#8]
- ldp w7,w8,[x0,#16]
- ldp w9,w10,[x0,#24]
- ldr w12,[sp,#0]
- mov w13,wzr
- eor w14,w4,w5
- mov w15,wzr
- b .L_00_48
-
-.align 4
-.L_00_48:
- ext v4.16b,v0.16b,v1.16b,#4
- add w10,w10,w12
- add w3,w3,w15
- and w12,w8,w7
- bic w15,w9,w7
- ext v7.16b,v2.16b,v3.16b,#4
- eor w11,w7,w7,ror#5
- add w3,w3,w13
- mov d19,v3.d[1]
- orr w12,w12,w15
- eor w11,w11,w7,ror#19
- ushr v6.4s,v4.4s,#7
- eor w15,w3,w3,ror#11
- ushr v5.4s,v4.4s,#3
- add w10,w10,w12
- add v0.4s,v0.4s,v7.4s
- ror w11,w11,#6
- sli v6.4s,v4.4s,#25
- eor w13,w3,w4
- eor w15,w15,w3,ror#20
- ushr v7.4s,v4.4s,#18
- add w10,w10,w11
- ldr w12,[sp,#4]
- and w14,w14,w13
- eor v5.16b,v5.16b,v6.16b
- ror w15,w15,#2
- add w6,w6,w10
- sli v7.4s,v4.4s,#14
- eor w14,w14,w4
- ushr v16.4s,v19.4s,#17
- add w9,w9,w12
- add w10,w10,w15
- and w12,w7,w6
- eor v5.16b,v5.16b,v7.16b
- bic w15,w8,w6
- eor w11,w6,w6,ror#5
- sli v16.4s,v19.4s,#15
- add w10,w10,w14
- orr w12,w12,w15
- ushr v17.4s,v19.4s,#10
- eor w11,w11,w6,ror#19
- eor w15,w10,w10,ror#11
- ushr v7.4s,v19.4s,#19
- add w9,w9,w12
- ror w11,w11,#6
- add v0.4s,v0.4s,v5.4s
- eor w14,w10,w3
- eor w15,w15,w10,ror#20
- sli v7.4s,v19.4s,#13
- add w9,w9,w11
- ldr w12,[sp,#8]
- and w13,w13,w14
- eor v17.16b,v17.16b,v16.16b
- ror w15,w15,#2
- add w5,w5,w9
- eor w13,w13,w3
- eor v17.16b,v17.16b,v7.16b
- add w8,w8,w12
- add w9,w9,w15
- and w12,w6,w5
- add v0.4s,v0.4s,v17.4s
- bic w15,w7,w5
- eor w11,w5,w5,ror#5
- add w9,w9,w13
- ushr v18.4s,v0.4s,#17
- orr w12,w12,w15
- ushr v19.4s,v0.4s,#10
- eor w11,w11,w5,ror#19
- eor w15,w9,w9,ror#11
- sli v18.4s,v0.4s,#15
- add w8,w8,w12
- ushr v17.4s,v0.4s,#19
- ror w11,w11,#6
- eor w13,w9,w10
- eor v19.16b,v19.16b,v18.16b
- eor w15,w15,w9,ror#20
- add w8,w8,w11
- sli v17.4s,v0.4s,#13
- ldr w12,[sp,#12]
- and w14,w14,w13
- ror w15,w15,#2
- ld1 {v4.4s},[x16], #16
- add w4,w4,w8
- eor v19.16b,v19.16b,v17.16b
- eor w14,w14,w10
- eor v17.16b,v17.16b,v17.16b
- add w7,w7,w12
- add w8,w8,w15
- and w12,w5,w4
- mov v17.d[1],v19.d[0]
- bic w15,w6,w4
- eor w11,w4,w4,ror#5
- add w8,w8,w14
- add v0.4s,v0.4s,v17.4s
- orr w12,w12,w15
- eor w11,w11,w4,ror#19
- eor w15,w8,w8,ror#11
- add v4.4s,v4.4s,v0.4s
- add w7,w7,w12
- ror w11,w11,#6
- eor w14,w8,w9
- eor w15,w15,w8,ror#20
- add w7,w7,w11
- ldr w12,[sp,#16]
- and w13,w13,w14
- ror w15,w15,#2
- add w3,w3,w7
- eor w13,w13,w9
- st1 {v4.4s},[x17], #16
- ext v4.16b,v1.16b,v2.16b,#4
- add w6,w6,w12
- add w7,w7,w15
- and w12,w4,w3
- bic w15,w5,w3
- ext v7.16b,v3.16b,v0.16b,#4
- eor w11,w3,w3,ror#5
- add w7,w7,w13
- mov d19,v0.d[1]
- orr w12,w12,w15
- eor w11,w11,w3,ror#19
- ushr v6.4s,v4.4s,#7
- eor w15,w7,w7,ror#11
- ushr v5.4s,v4.4s,#3
- add w6,w6,w12
- add v1.4s,v1.4s,v7.4s
- ror w11,w11,#6
- sli v6.4s,v4.4s,#25
- eor w13,w7,w8
- eor w15,w15,w7,ror#20
- ushr v7.4s,v4.4s,#18
- add w6,w6,w11
- ldr w12,[sp,#20]
- and w14,w14,w13
- eor v5.16b,v5.16b,v6.16b
- ror w15,w15,#2
- add w10,w10,w6
- sli v7.4s,v4.4s,#14
- eor w14,w14,w8
- ushr v16.4s,v19.4s,#17
- add w5,w5,w12
- add w6,w6,w15
- and w12,w3,w10
- eor v5.16b,v5.16b,v7.16b
- bic w15,w4,w10
- eor w11,w10,w10,ror#5
- sli v16.4s,v19.4s,#15
- add w6,w6,w14
- orr w12,w12,w15
- ushr v17.4s,v19.4s,#10
- eor w11,w11,w10,ror#19
- eor w15,w6,w6,ror#11
- ushr v7.4s,v19.4s,#19
- add w5,w5,w12
- ror w11,w11,#6
- add v1.4s,v1.4s,v5.4s
- eor w14,w6,w7
- eor w15,w15,w6,ror#20
- sli v7.4s,v19.4s,#13
- add w5,w5,w11
- ldr w12,[sp,#24]
- and w13,w13,w14
- eor v17.16b,v17.16b,v16.16b
- ror w15,w15,#2
- add w9,w9,w5
- eor w13,w13,w7
- eor v17.16b,v17.16b,v7.16b
- add w4,w4,w12
- add w5,w5,w15
- and w12,w10,w9
- add v1.4s,v1.4s,v17.4s
- bic w15,w3,w9
- eor w11,w9,w9,ror#5
- add w5,w5,w13
- ushr v18.4s,v1.4s,#17
- orr w12,w12,w15
- ushr v19.4s,v1.4s,#10
- eor w11,w11,w9,ror#19
- eor w15,w5,w5,ror#11
- sli v18.4s,v1.4s,#15
- add w4,w4,w12
- ushr v17.4s,v1.4s,#19
- ror w11,w11,#6
- eor w13,w5,w6
- eor v19.16b,v19.16b,v18.16b
- eor w15,w15,w5,ror#20
- add w4,w4,w11
- sli v17.4s,v1.4s,#13
- ldr w12,[sp,#28]
- and w14,w14,w13
- ror w15,w15,#2
- ld1 {v4.4s},[x16], #16
- add w8,w8,w4
- eor v19.16b,v19.16b,v17.16b
- eor w14,w14,w6
- eor v17.16b,v17.16b,v17.16b
- add w3,w3,w12
- add w4,w4,w15
- and w12,w9,w8
- mov v17.d[1],v19.d[0]
- bic w15,w10,w8
- eor w11,w8,w8,ror#5
- add w4,w4,w14
- add v1.4s,v1.4s,v17.4s
- orr w12,w12,w15
- eor w11,w11,w8,ror#19
- eor w15,w4,w4,ror#11
- add v4.4s,v4.4s,v1.4s
- add w3,w3,w12
- ror w11,w11,#6
- eor w14,w4,w5
- eor w15,w15,w4,ror#20
- add w3,w3,w11
- ldr w12,[sp,#32]
- and w13,w13,w14
- ror w15,w15,#2
- add w7,w7,w3
- eor w13,w13,w5
- st1 {v4.4s},[x17], #16
- ext v4.16b,v2.16b,v3.16b,#4
- add w10,w10,w12
- add w3,w3,w15
- and w12,w8,w7
- bic w15,w9,w7
- ext v7.16b,v0.16b,v1.16b,#4
- eor w11,w7,w7,ror#5
- add w3,w3,w13
- mov d19,v1.d[1]
- orr w12,w12,w15
- eor w11,w11,w7,ror#19
- ushr v6.4s,v4.4s,#7
- eor w15,w3,w3,ror#11
- ushr v5.4s,v4.4s,#3
- add w10,w10,w12
- add v2.4s,v2.4s,v7.4s
- ror w11,w11,#6
- sli v6.4s,v4.4s,#25
- eor w13,w3,w4
- eor w15,w15,w3,ror#20
- ushr v7.4s,v4.4s,#18
- add w10,w10,w11
- ldr w12,[sp,#36]
- and w14,w14,w13
- eor v5.16b,v5.16b,v6.16b
- ror w15,w15,#2
- add w6,w6,w10
- sli v7.4s,v4.4s,#14
- eor w14,w14,w4
- ushr v16.4s,v19.4s,#17
- add w9,w9,w12
- add w10,w10,w15
- and w12,w7,w6
- eor v5.16b,v5.16b,v7.16b
- bic w15,w8,w6
- eor w11,w6,w6,ror#5
- sli v16.4s,v19.4s,#15
- add w10,w10,w14
- orr w12,w12,w15
- ushr v17.4s,v19.4s,#10
- eor w11,w11,w6,ror#19
- eor w15,w10,w10,ror#11
- ushr v7.4s,v19.4s,#19
- add w9,w9,w12
- ror w11,w11,#6
- add v2.4s,v2.4s,v5.4s
- eor w14,w10,w3
- eor w15,w15,w10,ror#20
- sli v7.4s,v19.4s,#13
- add w9,w9,w11
- ldr w12,[sp,#40]
- and w13,w13,w14
- eor v17.16b,v17.16b,v16.16b
- ror w15,w15,#2
- add w5,w5,w9
- eor w13,w13,w3
- eor v17.16b,v17.16b,v7.16b
- add w8,w8,w12
- add w9,w9,w15
- and w12,w6,w5
- add v2.4s,v2.4s,v17.4s
- bic w15,w7,w5
- eor w11,w5,w5,ror#5
- add w9,w9,w13
- ushr v18.4s,v2.4s,#17
- orr w12,w12,w15
- ushr v19.4s,v2.4s,#10
- eor w11,w11,w5,ror#19
- eor w15,w9,w9,ror#11
- sli v18.4s,v2.4s,#15
- add w8,w8,w12
- ushr v17.4s,v2.4s,#19
- ror w11,w11,#6
- eor w13,w9,w10
- eor v19.16b,v19.16b,v18.16b
- eor w15,w15,w9,ror#20
- add w8,w8,w11
- sli v17.4s,v2.4s,#13
- ldr w12,[sp,#44]
- and w14,w14,w13
- ror w15,w15,#2
- ld1 {v4.4s},[x16], #16
- add w4,w4,w8
- eor v19.16b,v19.16b,v17.16b
- eor w14,w14,w10
- eor v17.16b,v17.16b,v17.16b
- add w7,w7,w12
- add w8,w8,w15
- and w12,w5,w4
- mov v17.d[1],v19.d[0]
- bic w15,w6,w4
- eor w11,w4,w4,ror#5
- add w8,w8,w14
- add v2.4s,v2.4s,v17.4s
- orr w12,w12,w15
- eor w11,w11,w4,ror#19
- eor w15,w8,w8,ror#11
- add v4.4s,v4.4s,v2.4s
- add w7,w7,w12
- ror w11,w11,#6
- eor w14,w8,w9
- eor w15,w15,w8,ror#20
- add w7,w7,w11
- ldr w12,[sp,#48]
- and w13,w13,w14
- ror w15,w15,#2
- add w3,w3,w7
- eor w13,w13,w9
- st1 {v4.4s},[x17], #16
- ext v4.16b,v3.16b,v0.16b,#4
- add w6,w6,w12
- add w7,w7,w15
- and w12,w4,w3
- bic w15,w5,w3
- ext v7.16b,v1.16b,v2.16b,#4
- eor w11,w3,w3,ror#5
- add w7,w7,w13
- mov d19,v2.d[1]
- orr w12,w12,w15
- eor w11,w11,w3,ror#19
- ushr v6.4s,v4.4s,#7
- eor w15,w7,w7,ror#11
- ushr v5.4s,v4.4s,#3
- add w6,w6,w12
- add v3.4s,v3.4s,v7.4s
- ror w11,w11,#6
- sli v6.4s,v4.4s,#25
- eor w13,w7,w8
- eor w15,w15,w7,ror#20
- ushr v7.4s,v4.4s,#18
- add w6,w6,w11
- ldr w12,[sp,#52]
- and w14,w14,w13
- eor v5.16b,v5.16b,v6.16b
- ror w15,w15,#2
- add w10,w10,w6
- sli v7.4s,v4.4s,#14
- eor w14,w14,w8
- ushr v16.4s,v19.4s,#17
- add w5,w5,w12
- add w6,w6,w15
- and w12,w3,w10
- eor v5.16b,v5.16b,v7.16b
- bic w15,w4,w10
- eor w11,w10,w10,ror#5
- sli v16.4s,v19.4s,#15
- add w6,w6,w14
- orr w12,w12,w15
- ushr v17.4s,v19.4s,#10
- eor w11,w11,w10,ror#19
- eor w15,w6,w6,ror#11
- ushr v7.4s,v19.4s,#19
- add w5,w5,w12
- ror w11,w11,#6
- add v3.4s,v3.4s,v5.4s
- eor w14,w6,w7
- eor w15,w15,w6,ror#20
- sli v7.4s,v19.4s,#13
- add w5,w5,w11
- ldr w12,[sp,#56]
- and w13,w13,w14
- eor v17.16b,v17.16b,v16.16b
- ror w15,w15,#2
- add w9,w9,w5
- eor w13,w13,w7
- eor v17.16b,v17.16b,v7.16b
- add w4,w4,w12
- add w5,w5,w15
- and w12,w10,w9
- add v3.4s,v3.4s,v17.4s
- bic w15,w3,w9
- eor w11,w9,w9,ror#5
- add w5,w5,w13
- ushr v18.4s,v3.4s,#17
- orr w12,w12,w15
- ushr v19.4s,v3.4s,#10
- eor w11,w11,w9,ror#19
- eor w15,w5,w5,ror#11
- sli v18.4s,v3.4s,#15
- add w4,w4,w12
- ushr v17.4s,v3.4s,#19
- ror w11,w11,#6
- eor w13,w5,w6
- eor v19.16b,v19.16b,v18.16b
- eor w15,w15,w5,ror#20
- add w4,w4,w11
- sli v17.4s,v3.4s,#13
- ldr w12,[sp,#60]
- and w14,w14,w13
- ror w15,w15,#2
- ld1 {v4.4s},[x16], #16
- add w8,w8,w4
- eor v19.16b,v19.16b,v17.16b
- eor w14,w14,w6
- eor v17.16b,v17.16b,v17.16b
- add w3,w3,w12
- add w4,w4,w15
- and w12,w9,w8
- mov v17.d[1],v19.d[0]
- bic w15,w10,w8
- eor w11,w8,w8,ror#5
- add w4,w4,w14
- add v3.4s,v3.4s,v17.4s
- orr w12,w12,w15
- eor w11,w11,w8,ror#19
- eor w15,w4,w4,ror#11
- add v4.4s,v4.4s,v3.4s
- add w3,w3,w12
- ror w11,w11,#6
- eor w14,w4,w5
- eor w15,w15,w4,ror#20
- add w3,w3,w11
- ldr w12,[x16]
- and w13,w13,w14
- ror w15,w15,#2
- add w7,w7,w3
- eor w13,w13,w5
- st1 {v4.4s},[x17], #16
- cmp w12,#0 // check for K256 terminator
- ldr w12,[sp,#0]
- sub x17,x17,#64
- bne .L_00_48
-
- sub x16,x16,#256 // rewind x16
- cmp x1,x2
- mov x17, #64
- csel x17, x17, xzr, eq
- sub x1,x1,x17 // avoid SEGV
- mov x17,sp
- add w10,w10,w12
- add w3,w3,w15
- and w12,w8,w7
- ld1 {v0.16b},[x1],#16
- bic w15,w9,w7
- eor w11,w7,w7,ror#5
- ld1 {v4.4s},[x16],#16
- add w3,w3,w13
- orr w12,w12,w15
- eor w11,w11,w7,ror#19
- eor w15,w3,w3,ror#11
- rev32 v0.16b,v0.16b
- add w10,w10,w12
- ror w11,w11,#6
- eor w13,w3,w4
- eor w15,w15,w3,ror#20
- add v4.4s,v4.4s,v0.4s
- add w10,w10,w11
- ldr w12,[sp,#4]
- and w14,w14,w13
- ror w15,w15,#2
- add w6,w6,w10
- eor w14,w14,w4
- add w9,w9,w12
- add w10,w10,w15
- and w12,w7,w6
- bic w15,w8,w6
- eor w11,w6,w6,ror#5
- add w10,w10,w14
- orr w12,w12,w15
- eor w11,w11,w6,ror#19
- eor w15,w10,w10,ror#11
- add w9,w9,w12
- ror w11,w11,#6
- eor w14,w10,w3
- eor w15,w15,w10,ror#20
- add w9,w9,w11
- ldr w12,[sp,#8]
- and w13,w13,w14
- ror w15,w15,#2
- add w5,w5,w9
- eor w13,w13,w3
- add w8,w8,w12
- add w9,w9,w15
- and w12,w6,w5
- bic w15,w7,w5
- eor w11,w5,w5,ror#5
- add w9,w9,w13
- orr w12,w12,w15
- eor w11,w11,w5,ror#19
- eor w15,w9,w9,ror#11
- add w8,w8,w12
- ror w11,w11,#6
- eor w13,w9,w10
- eor w15,w15,w9,ror#20
- add w8,w8,w11
- ldr w12,[sp,#12]
- and w14,w14,w13
- ror w15,w15,#2
- add w4,w4,w8
- eor w14,w14,w10
- add w7,w7,w12
- add w8,w8,w15
- and w12,w5,w4
- bic w15,w6,w4
- eor w11,w4,w4,ror#5
- add w8,w8,w14
- orr w12,w12,w15
- eor w11,w11,w4,ror#19
- eor w15,w8,w8,ror#11
- add w7,w7,w12
- ror w11,w11,#6
- eor w14,w8,w9
- eor w15,w15,w8,ror#20
- add w7,w7,w11
- ldr w12,[sp,#16]
- and w13,w13,w14
- ror w15,w15,#2
- add w3,w3,w7
- eor w13,w13,w9
- st1 {v4.4s},[x17], #16
- add w6,w6,w12
- add w7,w7,w15
- and w12,w4,w3
- ld1 {v1.16b},[x1],#16
- bic w15,w5,w3
- eor w11,w3,w3,ror#5
- ld1 {v4.4s},[x16],#16
- add w7,w7,w13
- orr w12,w12,w15
- eor w11,w11,w3,ror#19
- eor w15,w7,w7,ror#11
- rev32 v1.16b,v1.16b
- add w6,w6,w12
- ror w11,w11,#6
- eor w13,w7,w8
- eor w15,w15,w7,ror#20
- add v4.4s,v4.4s,v1.4s
- add w6,w6,w11
- ldr w12,[sp,#20]
- and w14,w14,w13
- ror w15,w15,#2
- add w10,w10,w6
- eor w14,w14,w8
- add w5,w5,w12
- add w6,w6,w15
- and w12,w3,w10
- bic w15,w4,w10
- eor w11,w10,w10,ror#5
- add w6,w6,w14
- orr w12,w12,w15
- eor w11,w11,w10,ror#19
- eor w15,w6,w6,ror#11
- add w5,w5,w12
- ror w11,w11,#6
- eor w14,w6,w7
- eor w15,w15,w6,ror#20
- add w5,w5,w11
- ldr w12,[sp,#24]
- and w13,w13,w14
- ror w15,w15,#2
- add w9,w9,w5
- eor w13,w13,w7
- add w4,w4,w12
- add w5,w5,w15
- and w12,w10,w9
- bic w15,w3,w9
- eor w11,w9,w9,ror#5
- add w5,w5,w13
- orr w12,w12,w15
- eor w11,w11,w9,ror#19
- eor w15,w5,w5,ror#11
- add w4,w4,w12
- ror w11,w11,#6
- eor w13,w5,w6
- eor w15,w15,w5,ror#20
- add w4,w4,w11
- ldr w12,[sp,#28]
- and w14,w14,w13
- ror w15,w15,#2
- add w8,w8,w4
- eor w14,w14,w6
- add w3,w3,w12
- add w4,w4,w15
- and w12,w9,w8
- bic w15,w10,w8
- eor w11,w8,w8,ror#5
- add w4,w4,w14
- orr w12,w12,w15
- eor w11,w11,w8,ror#19
- eor w15,w4,w4,ror#11
- add w3,w3,w12
- ror w11,w11,#6
- eor w14,w4,w5
- eor w15,w15,w4,ror#20
- add w3,w3,w11
- ldr w12,[sp,#32]
- and w13,w13,w14
- ror w15,w15,#2
- add w7,w7,w3
- eor w13,w13,w5
- st1 {v4.4s},[x17], #16
- add w10,w10,w12
- add w3,w3,w15
- and w12,w8,w7
- ld1 {v2.16b},[x1],#16
- bic w15,w9,w7
- eor w11,w7,w7,ror#5
- ld1 {v4.4s},[x16],#16
- add w3,w3,w13
- orr w12,w12,w15
- eor w11,w11,w7,ror#19
- eor w15,w3,w3,ror#11
- rev32 v2.16b,v2.16b
- add w10,w10,w12
- ror w11,w11,#6
- eor w13,w3,w4
- eor w15,w15,w3,ror#20
- add v4.4s,v4.4s,v2.4s
- add w10,w10,w11
- ldr w12,[sp,#36]
- and w14,w14,w13
- ror w15,w15,#2
- add w6,w6,w10
- eor w14,w14,w4
- add w9,w9,w12
- add w10,w10,w15
- and w12,w7,w6
- bic w15,w8,w6
- eor w11,w6,w6,ror#5
- add w10,w10,w14
- orr w12,w12,w15
- eor w11,w11,w6,ror#19
- eor w15,w10,w10,ror#11
- add w9,w9,w12
- ror w11,w11,#6
- eor w14,w10,w3
- eor w15,w15,w10,ror#20
- add w9,w9,w11
- ldr w12,[sp,#40]
- and w13,w13,w14
- ror w15,w15,#2
- add w5,w5,w9
- eor w13,w13,w3
- add w8,w8,w12
- add w9,w9,w15
- and w12,w6,w5
- bic w15,w7,w5
- eor w11,w5,w5,ror#5
- add w9,w9,w13
- orr w12,w12,w15
- eor w11,w11,w5,ror#19
- eor w15,w9,w9,ror#11
- add w8,w8,w12
- ror w11,w11,#6
- eor w13,w9,w10
- eor w15,w15,w9,ror#20
- add w8,w8,w11
- ldr w12,[sp,#44]
- and w14,w14,w13
- ror w15,w15,#2
- add w4,w4,w8
- eor w14,w14,w10
- add w7,w7,w12
- add w8,w8,w15
- and w12,w5,w4
- bic w15,w6,w4
- eor w11,w4,w4,ror#5
- add w8,w8,w14
- orr w12,w12,w15
- eor w11,w11,w4,ror#19
- eor w15,w8,w8,ror#11
- add w7,w7,w12
- ror w11,w11,#6
- eor w14,w8,w9
- eor w15,w15,w8,ror#20
- add w7,w7,w11
- ldr w12,[sp,#48]
- and w13,w13,w14
- ror w15,w15,#2
- add w3,w3,w7
- eor w13,w13,w9
- st1 {v4.4s},[x17], #16
- add w6,w6,w12
- add w7,w7,w15
- and w12,w4,w3
- ld1 {v3.16b},[x1],#16
- bic w15,w5,w3
- eor w11,w3,w3,ror#5
- ld1 {v4.4s},[x16],#16
- add w7,w7,w13
- orr w12,w12,w15
- eor w11,w11,w3,ror#19
- eor w15,w7,w7,ror#11
- rev32 v3.16b,v3.16b
- add w6,w6,w12
- ror w11,w11,#6
- eor w13,w7,w8
- eor w15,w15,w7,ror#20
- add v4.4s,v4.4s,v3.4s
- add w6,w6,w11
- ldr w12,[sp,#52]
- and w14,w14,w13
- ror w15,w15,#2
- add w10,w10,w6
- eor w14,w14,w8
- add w5,w5,w12
- add w6,w6,w15
- and w12,w3,w10
- bic w15,w4,w10
- eor w11,w10,w10,ror#5
- add w6,w6,w14
- orr w12,w12,w15
- eor w11,w11,w10,ror#19
- eor w15,w6,w6,ror#11
- add w5,w5,w12
- ror w11,w11,#6
- eor w14,w6,w7
- eor w15,w15,w6,ror#20
- add w5,w5,w11
- ldr w12,[sp,#56]
- and w13,w13,w14
- ror w15,w15,#2
- add w9,w9,w5
- eor w13,w13,w7
- add w4,w4,w12
- add w5,w5,w15
- and w12,w10,w9
- bic w15,w3,w9
- eor w11,w9,w9,ror#5
- add w5,w5,w13
- orr w12,w12,w15
- eor w11,w11,w9,ror#19
- eor w15,w5,w5,ror#11
- add w4,w4,w12
- ror w11,w11,#6
- eor w13,w5,w6
- eor w15,w15,w5,ror#20
- add w4,w4,w11
- ldr w12,[sp,#60]
- and w14,w14,w13
- ror w15,w15,#2
- add w8,w8,w4
- eor w14,w14,w6
- add w3,w3,w12
- add w4,w4,w15
- and w12,w9,w8
- bic w15,w10,w8
- eor w11,w8,w8,ror#5
- add w4,w4,w14
- orr w12,w12,w15
- eor w11,w11,w8,ror#19
- eor w15,w4,w4,ror#11
- add w3,w3,w12
- ror w11,w11,#6
- eor w14,w4,w5
- eor w15,w15,w4,ror#20
- add w3,w3,w11
- and w13,w13,w14
- ror w15,w15,#2
- add w7,w7,w3
- eor w13,w13,w5
- st1 {v4.4s},[x17], #16
- add w3,w3,w15 // h+=Sigma0(a) from the past
- ldp w11,w12,[x0,#0]
- add w3,w3,w13 // h+=Maj(a,b,c) from the past
- ldp w13,w14,[x0,#8]
- add w3,w3,w11 // accumulate
- add w4,w4,w12
- ldp w11,w12,[x0,#16]
- add w5,w5,w13
- add w6,w6,w14
- ldp w13,w14,[x0,#24]
- add w7,w7,w11
- add w8,w8,w12
- ldr w12,[sp,#0]
- stp w3,w4,[x0,#0]
- add w9,w9,w13
- mov w13,wzr
- stp w5,w6,[x0,#8]
- add w10,w10,w14
- stp w7,w8,[x0,#16]
- eor w14,w4,w5
- stp w9,w10,[x0,#24]
- mov w15,wzr
- mov x17,sp
- b.ne .L_00_48
-
- ldr x29,[x29]
- add sp,sp,#16*4+16
- ret
-.size sha256_block_neon,.-sha256_block_neon
-#ifndef __KERNEL__
-.comm OPENSSL_armcap_P,4,4
-#endif
diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c
index b064d925fe2a..9462f6088b3f 100644
--- a/arch/arm64/crypto/sha256-glue.c
+++ b/arch/arm64/crypto/sha256-glue.c
@@ -1,22 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux/arm64 port of the OpenSSL SHA256 implementation for AArch64
*
* Copyright (c) 2016 Linaro Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
*/
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/internal/hash.h>
-#include <crypto/sha.h>
+#include <crypto/internal/simd.h>
+#include <crypto/sha2.h>
#include <crypto/sha256_base.h>
-#include <linux/cryptohash.h>
#include <linux/types.h>
#include <linux/string.h>
@@ -31,57 +26,66 @@ asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
unsigned int num_blks);
EXPORT_SYMBOL(sha256_block_data_order);
+static void __sha256_block_data_order(struct sha256_state *sst, u8 const *src,
+ int blocks)
+{
+ sha256_block_data_order(sst->state, src, blocks);
+}
+
asmlinkage void sha256_block_neon(u32 *digest, const void *data,
unsigned int num_blks);
-static int sha256_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
+static void __sha256_block_neon(struct sha256_state *sst, u8 const *src,
+ int blocks)
+{
+ sha256_block_neon(sst->state, src, blocks);
+}
+
+static int crypto_sha256_arm64_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
{
return sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_block_data_order);
+ __sha256_block_data_order);
}
-static int sha256_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
+static int crypto_sha256_arm64_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
{
if (len)
sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_block_data_order);
- sha256_base_do_finalize(desc,
- (sha256_block_fn *)sha256_block_data_order);
+ __sha256_block_data_order);
+ sha256_base_do_finalize(desc, __sha256_block_data_order);
return sha256_base_finish(desc, out);
}
-static int sha256_final(struct shash_desc *desc, u8 *out)
+static int crypto_sha256_arm64_final(struct shash_desc *desc, u8 *out)
{
- return sha256_finup(desc, NULL, 0, out);
+ return crypto_sha256_arm64_finup(desc, NULL, 0, out);
}
static struct shash_alg algs[] = { {
.digestsize = SHA256_DIGEST_SIZE,
.init = sha256_base_init,
- .update = sha256_update,
- .final = sha256_final,
- .finup = sha256_finup,
+ .update = crypto_sha256_arm64_update,
+ .final = crypto_sha256_arm64_final,
+ .finup = crypto_sha256_arm64_finup,
.descsize = sizeof(struct sha256_state),
.base.cra_name = "sha256",
.base.cra_driver_name = "sha256-arm64",
- .base.cra_priority = 100,
- .base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .base.cra_priority = 125,
.base.cra_blocksize = SHA256_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
}, {
.digestsize = SHA224_DIGEST_SIZE,
.init = sha224_base_init,
- .update = sha256_update,
- .final = sha256_final,
- .finup = sha256_finup,
+ .update = crypto_sha256_arm64_update,
+ .final = crypto_sha256_arm64_final,
+ .finup = crypto_sha256_arm64_finup,
.descsize = sizeof(struct sha256_state),
.base.cra_name = "sha224",
.base.cra_driver_name = "sha224-arm64",
- .base.cra_priority = 100,
- .base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .base.cra_priority = 125,
.base.cra_blocksize = SHA224_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
} };
@@ -89,40 +93,47 @@ static struct shash_alg algs[] = { {
static int sha256_update_neon(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
- /*
- * Stacking and unstacking a substantial slice of the NEON register
- * file may significantly affect performance for small updates when
- * executing in interrupt context, so fall back to the scalar code
- * in that case.
- */
- if (!may_use_simd())
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ if (!crypto_simd_usable())
return sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_block_data_order);
+ __sha256_block_data_order);
+
+ while (len > 0) {
+ unsigned int chunk = len;
- kernel_neon_begin();
- sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_block_neon);
- kernel_neon_end();
+ /*
+ * Don't hog the CPU for the entire time it takes to process all
+ * input when running on a preemptible kernel, but process the
+ * data block by block instead.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPTION) &&
+ chunk + sctx->count % SHA256_BLOCK_SIZE > SHA256_BLOCK_SIZE)
+ chunk = SHA256_BLOCK_SIZE -
+ sctx->count % SHA256_BLOCK_SIZE;
+ kernel_neon_begin();
+ sha256_base_do_update(desc, data, chunk, __sha256_block_neon);
+ kernel_neon_end();
+ data += chunk;
+ len -= chunk;
+ }
return 0;
}
static int sha256_finup_neon(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
- if (!may_use_simd()) {
+ if (!crypto_simd_usable()) {
if (len)
sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_block_data_order);
- sha256_base_do_finalize(desc,
- (sha256_block_fn *)sha256_block_data_order);
+ __sha256_block_data_order);
+ sha256_base_do_finalize(desc, __sha256_block_data_order);
} else {
- kernel_neon_begin();
if (len)
- sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_block_neon);
- sha256_base_do_finalize(desc,
- (sha256_block_fn *)sha256_block_neon);
+ sha256_update_neon(desc, data, len);
+ kernel_neon_begin();
+ sha256_base_do_finalize(desc, __sha256_block_neon);
kernel_neon_end();
}
return sha256_base_finish(desc, out);
@@ -143,7 +154,6 @@ static struct shash_alg neon_algs[] = { {
.base.cra_name = "sha256",
.base.cra_driver_name = "sha256-arm64-neon",
.base.cra_priority = 150,
- .base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = SHA256_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
}, {
@@ -156,7 +166,6 @@ static struct shash_alg neon_algs[] = { {
.base.cra_name = "sha224",
.base.cra_driver_name = "sha224-arm64-neon",
.base.cra_priority = 150,
- .base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = SHA224_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
} };
@@ -167,7 +176,7 @@ static int __init sha256_mod_init(void)
if (ret)
return ret;
- if (elf_hwcap & HWCAP_ASIMD) {
+ if (cpu_have_named_feature(ASIMD)) {
ret = crypto_register_shashes(neon_algs, ARRAY_SIZE(neon_algs));
if (ret)
crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
@@ -177,7 +186,7 @@ static int __init sha256_mod_init(void)
static void __exit sha256_mod_fini(void)
{
- if (elf_hwcap & HWCAP_ASIMD)
+ if (cpu_have_named_feature(ASIMD))
crypto_unregister_shashes(neon_algs, ARRAY_SIZE(neon_algs));
crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
}
diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S
new file mode 100644
index 000000000000..9c77313f5a60
--- /dev/null
+++ b/arch/arm64/crypto/sha3-ce-core.S
@@ -0,0 +1,212 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+ .set .Lv\b\().2d, \b
+ .set .Lv\b\().16b, \b
+ .endr
+
+ /*
+ * ARMv8.2 Crypto Extensions instructions
+ */
+ .macro eor3, rd, rn, rm, ra
+ .inst 0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+ .endm
+
+ .macro rax1, rd, rn, rm
+ .inst 0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+ .endm
+
+ .macro bcax, rd, rn, rm, ra
+ .inst 0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+ .endm
+
+ .macro xar, rd, rn, rm, imm6
+ .inst 0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
+ .endm
+
+ /*
+ * int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
+ */
+ .text
+SYM_FUNC_START(sha3_ce_transform)
+ /* load state */
+ add x8, x0, #32
+ ld1 { v0.1d- v3.1d}, [x0]
+ ld1 { v4.1d- v7.1d}, [x8], #32
+ ld1 { v8.1d-v11.1d}, [x8], #32
+ ld1 {v12.1d-v15.1d}, [x8], #32
+ ld1 {v16.1d-v19.1d}, [x8], #32
+ ld1 {v20.1d-v23.1d}, [x8], #32
+ ld1 {v24.1d}, [x8]
+
+0: sub w2, w2, #1
+ mov w8, #24
+ adr_l x9, .Lsha3_rcon
+
+ /* load input */
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ ld1 {v29.8b-v31.8b}, [x1], #24
+ eor v0.8b, v0.8b, v25.8b
+ eor v1.8b, v1.8b, v26.8b
+ eor v2.8b, v2.8b, v27.8b
+ eor v3.8b, v3.8b, v28.8b
+ eor v4.8b, v4.8b, v29.8b
+ eor v5.8b, v5.8b, v30.8b
+ eor v6.8b, v6.8b, v31.8b
+
+ tbnz x3, #6, 2f // SHA3-512
+
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ ld1 {v29.8b-v30.8b}, [x1], #16
+ eor v7.8b, v7.8b, v25.8b
+ eor v8.8b, v8.8b, v26.8b
+ eor v9.8b, v9.8b, v27.8b
+ eor v10.8b, v10.8b, v28.8b
+ eor v11.8b, v11.8b, v29.8b
+ eor v12.8b, v12.8b, v30.8b
+
+ tbnz x3, #4, 1f // SHA3-384 or SHA3-224
+
+ // SHA3-256
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ eor v13.8b, v13.8b, v25.8b
+ eor v14.8b, v14.8b, v26.8b
+ eor v15.8b, v15.8b, v27.8b
+ eor v16.8b, v16.8b, v28.8b
+ b 3f
+
+1: tbz x3, #2, 3f // bit 2 cleared? SHA-384
+
+ // SHA3-224
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ ld1 {v29.8b}, [x1], #8
+ eor v13.8b, v13.8b, v25.8b
+ eor v14.8b, v14.8b, v26.8b
+ eor v15.8b, v15.8b, v27.8b
+ eor v16.8b, v16.8b, v28.8b
+ eor v17.8b, v17.8b, v29.8b
+ b 3f
+
+ // SHA3-512
+2: ld1 {v25.8b-v26.8b}, [x1], #16
+ eor v7.8b, v7.8b, v25.8b
+ eor v8.8b, v8.8b, v26.8b
+
+3: sub w8, w8, #1
+
+ eor3 v29.16b, v4.16b, v9.16b, v14.16b
+ eor3 v26.16b, v1.16b, v6.16b, v11.16b
+ eor3 v28.16b, v3.16b, v8.16b, v13.16b
+ eor3 v25.16b, v0.16b, v5.16b, v10.16b
+ eor3 v27.16b, v2.16b, v7.16b, v12.16b
+ eor3 v29.16b, v29.16b, v19.16b, v24.16b
+ eor3 v26.16b, v26.16b, v16.16b, v21.16b
+ eor3 v28.16b, v28.16b, v18.16b, v23.16b
+ eor3 v25.16b, v25.16b, v15.16b, v20.16b
+ eor3 v27.16b, v27.16b, v17.16b, v22.16b
+
+ rax1 v30.2d, v29.2d, v26.2d // bc[0]
+ rax1 v26.2d, v26.2d, v28.2d // bc[2]
+ rax1 v28.2d, v28.2d, v25.2d // bc[4]
+ rax1 v25.2d, v25.2d, v27.2d // bc[1]
+ rax1 v27.2d, v27.2d, v29.2d // bc[3]
+
+ eor v0.16b, v0.16b, v30.16b
+ xar v29.2d, v1.2d, v25.2d, (64 - 1)
+ xar v1.2d, v6.2d, v25.2d, (64 - 44)
+ xar v6.2d, v9.2d, v28.2d, (64 - 20)
+ xar v9.2d, v22.2d, v26.2d, (64 - 61)
+ xar v22.2d, v14.2d, v28.2d, (64 - 39)
+ xar v14.2d, v20.2d, v30.2d, (64 - 18)
+ xar v31.2d, v2.2d, v26.2d, (64 - 62)
+ xar v2.2d, v12.2d, v26.2d, (64 - 43)
+ xar v12.2d, v13.2d, v27.2d, (64 - 25)
+ xar v13.2d, v19.2d, v28.2d, (64 - 8)
+ xar v19.2d, v23.2d, v27.2d, (64 - 56)
+ xar v23.2d, v15.2d, v30.2d, (64 - 41)
+ xar v15.2d, v4.2d, v28.2d, (64 - 27)
+ xar v28.2d, v24.2d, v28.2d, (64 - 14)
+ xar v24.2d, v21.2d, v25.2d, (64 - 2)
+ xar v8.2d, v8.2d, v27.2d, (64 - 55)
+ xar v4.2d, v16.2d, v25.2d, (64 - 45)
+ xar v16.2d, v5.2d, v30.2d, (64 - 36)
+ xar v5.2d, v3.2d, v27.2d, (64 - 28)
+ xar v27.2d, v18.2d, v27.2d, (64 - 21)
+ xar v3.2d, v17.2d, v26.2d, (64 - 15)
+ xar v25.2d, v11.2d, v25.2d, (64 - 10)
+ xar v26.2d, v7.2d, v26.2d, (64 - 6)
+ xar v30.2d, v10.2d, v30.2d, (64 - 3)
+
+ bcax v20.16b, v31.16b, v22.16b, v8.16b
+ bcax v21.16b, v8.16b, v23.16b, v22.16b
+ bcax v22.16b, v22.16b, v24.16b, v23.16b
+ bcax v23.16b, v23.16b, v31.16b, v24.16b
+ bcax v24.16b, v24.16b, v8.16b, v31.16b
+
+ ld1r {v31.2d}, [x9], #8
+
+ bcax v17.16b, v25.16b, v19.16b, v3.16b
+ bcax v18.16b, v3.16b, v15.16b, v19.16b
+ bcax v19.16b, v19.16b, v16.16b, v15.16b
+ bcax v15.16b, v15.16b, v25.16b, v16.16b
+ bcax v16.16b, v16.16b, v3.16b, v25.16b
+
+ bcax v10.16b, v29.16b, v12.16b, v26.16b
+ bcax v11.16b, v26.16b, v13.16b, v12.16b
+ bcax v12.16b, v12.16b, v14.16b, v13.16b
+ bcax v13.16b, v13.16b, v29.16b, v14.16b
+ bcax v14.16b, v14.16b, v26.16b, v29.16b
+
+ bcax v7.16b, v30.16b, v9.16b, v4.16b
+ bcax v8.16b, v4.16b, v5.16b, v9.16b
+ bcax v9.16b, v9.16b, v6.16b, v5.16b
+ bcax v5.16b, v5.16b, v30.16b, v6.16b
+ bcax v6.16b, v6.16b, v4.16b, v30.16b
+
+ bcax v3.16b, v27.16b, v0.16b, v28.16b
+ bcax v4.16b, v28.16b, v1.16b, v0.16b
+ bcax v0.16b, v0.16b, v2.16b, v1.16b
+ bcax v1.16b, v1.16b, v27.16b, v2.16b
+ bcax v2.16b, v2.16b, v28.16b, v27.16b
+
+ eor v0.16b, v0.16b, v31.16b
+
+ cbnz w8, 3b
+ cond_yield 4f, x8, x9
+ cbnz w2, 0b
+
+ /* save state */
+4: st1 { v0.1d- v3.1d}, [x0], #32
+ st1 { v4.1d- v7.1d}, [x0], #32
+ st1 { v8.1d-v11.1d}, [x0], #32
+ st1 {v12.1d-v15.1d}, [x0], #32
+ st1 {v16.1d-v19.1d}, [x0], #32
+ st1 {v20.1d-v23.1d}, [x0], #32
+ st1 {v24.1d}, [x0]
+ mov w0, w2
+ ret
+SYM_FUNC_END(sha3_ce_transform)
+
+ .section ".rodata", "a"
+ .align 8
+.Lsha3_rcon:
+ .quad 0x0000000000000001, 0x0000000000008082, 0x800000000000808a
+ .quad 0x8000000080008000, 0x000000000000808b, 0x0000000080000001
+ .quad 0x8000000080008081, 0x8000000000008009, 0x000000000000008a
+ .quad 0x0000000000000088, 0x0000000080008009, 0x000000008000000a
+ .quad 0x000000008000808b, 0x800000000000008b, 0x8000000000008089
+ .quad 0x8000000000008003, 0x8000000000008002, 0x8000000000000080
+ .quad 0x000000000000800a, 0x800000008000000a, 0x8000000080008081
+ .quad 0x8000000000008080, 0x0000000080000001, 0x8000000080008008
diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c
new file mode 100644
index 000000000000..250e1377c481
--- /dev/null
+++ b/arch/arm64/crypto/sha3-ce-glue.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * sha3-ce-glue.c - core SHA-3 transform using v8.2 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/unaligned.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/sha3.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("sha3-224");
+MODULE_ALIAS_CRYPTO("sha3-256");
+MODULE_ALIAS_CRYPTO("sha3-384");
+MODULE_ALIAS_CRYPTO("sha3-512");
+
+asmlinkage int sha3_ce_transform(u64 *st, const u8 *data, int blocks,
+ int md_len);
+
+static int sha3_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct sha3_state *sctx = shash_desc_ctx(desc);
+ unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
+
+ if (!crypto_simd_usable())
+ return crypto_sha3_update(desc, data, len);
+
+ if ((sctx->partial + len) >= sctx->rsiz) {
+ int blocks;
+
+ if (sctx->partial) {
+ int p = sctx->rsiz - sctx->partial;
+
+ memcpy(sctx->buf + sctx->partial, data, p);
+ kernel_neon_begin();
+ sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size);
+ kernel_neon_end();
+
+ data += p;
+ len -= p;
+ sctx->partial = 0;
+ }
+
+ blocks = len / sctx->rsiz;
+ len %= sctx->rsiz;
+
+ while (blocks) {
+ int rem;
+
+ kernel_neon_begin();
+ rem = sha3_ce_transform(sctx->st, data, blocks,
+ digest_size);
+ kernel_neon_end();
+ data += (blocks - rem) * sctx->rsiz;
+ blocks = rem;
+ }
+ }
+
+ if (len) {
+ memcpy(sctx->buf + sctx->partial, data, len);
+ sctx->partial += len;
+ }
+ return 0;
+}
+
+static int sha3_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha3_state *sctx = shash_desc_ctx(desc);
+ unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
+ __le64 *digest = (__le64 *)out;
+ int i;
+
+ if (!crypto_simd_usable())
+ return crypto_sha3_final(desc, out);
+
+ sctx->buf[sctx->partial++] = 0x06;
+ memset(sctx->buf + sctx->partial, 0, sctx->rsiz - sctx->partial);
+ sctx->buf[sctx->rsiz - 1] |= 0x80;
+
+ kernel_neon_begin();
+ sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size);
+ kernel_neon_end();
+
+ for (i = 0; i < digest_size / 8; i++)
+ put_unaligned_le64(sctx->st[i], digest++);
+
+ if (digest_size & 4)
+ put_unaligned_le32(sctx->st[i], (__le32 *)digest);
+
+ memzero_explicit(sctx, sizeof(*sctx));
+ return 0;
+}
+
+static struct shash_alg algs[] = { {
+ .digestsize = SHA3_224_DIGEST_SIZE,
+ .init = crypto_sha3_init,
+ .update = sha3_update,
+ .final = sha3_final,
+ .descsize = sizeof(struct sha3_state),
+ .base.cra_name = "sha3-224",
+ .base.cra_driver_name = "sha3-224-ce",
+ .base.cra_blocksize = SHA3_224_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+ .base.cra_priority = 200,
+}, {
+ .digestsize = SHA3_256_DIGEST_SIZE,
+ .init = crypto_sha3_init,
+ .update = sha3_update,
+ .final = sha3_final,
+ .descsize = sizeof(struct sha3_state),
+ .base.cra_name = "sha3-256",
+ .base.cra_driver_name = "sha3-256-ce",
+ .base.cra_blocksize = SHA3_256_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+ .base.cra_priority = 200,
+}, {
+ .digestsize = SHA3_384_DIGEST_SIZE,
+ .init = crypto_sha3_init,
+ .update = sha3_update,
+ .final = sha3_final,
+ .descsize = sizeof(struct sha3_state),
+ .base.cra_name = "sha3-384",
+ .base.cra_driver_name = "sha3-384-ce",
+ .base.cra_blocksize = SHA3_384_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+ .base.cra_priority = 200,
+}, {
+ .digestsize = SHA3_512_DIGEST_SIZE,
+ .init = crypto_sha3_init,
+ .update = sha3_update,
+ .final = sha3_final,
+ .descsize = sizeof(struct sha3_state),
+ .base.cra_name = "sha3-512",
+ .base.cra_driver_name = "sha3-512-ce",
+ .base.cra_blocksize = SHA3_512_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+ .base.cra_priority = 200,
+} };
+
+static int __init sha3_neon_mod_init(void)
+{
+ return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit sha3_neon_mod_fini(void)
+{
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+
+module_cpu_feature_match(SHA3, sha3_neon_mod_init);
+module_exit(sha3_neon_mod_fini);
diff --git a/arch/arm64/crypto/sha512-armv8.pl b/arch/arm64/crypto/sha512-armv8.pl
index c55efb308544..35ec9ae99fe1 100644
--- a/arch/arm64/crypto/sha512-armv8.pl
+++ b/arch/arm64/crypto/sha512-armv8.pl
@@ -1,4 +1,14 @@
#! /usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
@@ -11,8 +21,6 @@
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
-#
-# Permission to use under GPLv2 terms is granted.
# ====================================================================
#
# SHA256/512 for ARMv8.
@@ -35,7 +43,7 @@
# on Cortex-A53 (or by 4 cycles per round).
# (***) Super-impressive coefficients over gcc-generated code are
# indication of some compiler "pathology", most notably code
-# generated with -mgeneral-regs-only is significanty faster
+# generated with -mgeneral-regs-only is significantly faster
# and the gap is only 40-90%.
#
# October 2016.
diff --git a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S
new file mode 100644
index 000000000000..b6a3a36e15f5
--- /dev/null
+++ b/arch/arm64/crypto/sha512-ce-core.S
@@ -0,0 +1,206 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
+ .set .Lq\b, \b
+ .set .Lv\b\().2d, \b
+ .endr
+
+ .macro sha512h, rd, rn, rm
+ .inst 0xce608000 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+ .endm
+
+ .macro sha512h2, rd, rn, rm
+ .inst 0xce608400 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+ .endm
+
+ .macro sha512su0, rd, rn
+ .inst 0xcec08000 | .L\rd | (.L\rn << 5)
+ .endm
+
+ .macro sha512su1, rd, rn, rm
+ .inst 0xce608800 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+ .endm
+
+ /*
+ * The SHA-512 round constants
+ */
+ .section ".rodata", "a"
+ .align 4
+.Lsha512_rcon:
+ .quad 0x428a2f98d728ae22, 0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538, 0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242, 0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235, 0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+ .quad 0x983e5152ee66dfab, 0xa831c66d2db43210
+ .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725
+ .quad 0x06ca6351e003826f, 0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+ .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6, 0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218, 0xd69906245565a910
+ .quad 0xf40e35855771202a, 0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec
+ .quad 0x90befffa23631e28, 0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b
+ .quad 0xca273eceea26619c, 0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae, 0x1b710b35131c471b
+ .quad 0x28db77f523047d84, 0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+
+ .macro dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4
+ .ifnb \rc1
+ ld1 {v\rc1\().2d}, [x4], #16
+ .endif
+ add v5.2d, v\rc0\().2d, v\in0\().2d
+ ext v6.16b, v\i2\().16b, v\i3\().16b, #8
+ ext v5.16b, v5.16b, v5.16b, #8
+ ext v7.16b, v\i1\().16b, v\i2\().16b, #8
+ add v\i3\().2d, v\i3\().2d, v5.2d
+ .ifnb \in1
+ ext v5.16b, v\in3\().16b, v\in4\().16b, #8
+ sha512su0 v\in0\().2d, v\in1\().2d
+ .endif
+ sha512h q\i3, q6, v7.2d
+ .ifnb \in1
+ sha512su1 v\in0\().2d, v\in2\().2d, v5.2d
+ .endif
+ add v\i4\().2d, v\i1\().2d, v\i3\().2d
+ sha512h2 q\i3, q\i1, v\i0\().2d
+ .endm
+
+ /*
+ * void sha512_ce_transform(struct sha512_state *sst, u8 const *src,
+ * int blocks)
+ */
+ .text
+SYM_FUNC_START(sha512_ce_transform)
+ /* load state */
+ ld1 {v8.2d-v11.2d}, [x0]
+
+ /* load first 4 round constants */
+ adr_l x3, .Lsha512_rcon
+ ld1 {v20.2d-v23.2d}, [x3], #64
+
+ /* load input */
+0: ld1 {v12.2d-v15.2d}, [x1], #64
+ ld1 {v16.2d-v19.2d}, [x1], #64
+ sub w2, w2, #1
+
+CPU_LE( rev64 v12.16b, v12.16b )
+CPU_LE( rev64 v13.16b, v13.16b )
+CPU_LE( rev64 v14.16b, v14.16b )
+CPU_LE( rev64 v15.16b, v15.16b )
+CPU_LE( rev64 v16.16b, v16.16b )
+CPU_LE( rev64 v17.16b, v17.16b )
+CPU_LE( rev64 v18.16b, v18.16b )
+CPU_LE( rev64 v19.16b, v19.16b )
+
+ mov x4, x3 // rc pointer
+
+ mov v0.16b, v8.16b
+ mov v1.16b, v9.16b
+ mov v2.16b, v10.16b
+ mov v3.16b, v11.16b
+
+ // v0 ab cd -- ef gh ab
+ // v1 cd -- ef gh ab cd
+ // v2 ef gh ab cd -- ef
+ // v3 gh ab cd -- ef gh
+ // v4 -- ef gh ab cd --
+
+ dround 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17
+ dround 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18
+ dround 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19
+ dround 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12
+ dround 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13
+
+ dround 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14
+ dround 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15
+ dround 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16
+ dround 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17
+ dround 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18
+
+ dround 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19
+ dround 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12
+ dround 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13
+ dround 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14
+ dround 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15
+
+ dround 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16
+ dround 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17
+ dround 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18
+ dround 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19
+ dround 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12
+
+ dround 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13
+ dround 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14
+ dround 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15
+ dround 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16
+ dround 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17
+
+ dround 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18
+ dround 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19
+ dround 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12
+ dround 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13
+ dround 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14
+
+ dround 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15
+ dround 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16
+ dround 2, 3, 1, 4, 0, 28, 24, 12
+ dround 4, 2, 0, 1, 3, 29, 25, 13
+ dround 1, 4, 3, 0, 2, 30, 26, 14
+
+ dround 0, 1, 2, 3, 4, 31, 27, 15
+ dround 3, 0, 4, 2, 1, 24, , 16
+ dround 2, 3, 1, 4, 0, 25, , 17
+ dround 4, 2, 0, 1, 3, 26, , 18
+ dround 1, 4, 3, 0, 2, 27, , 19
+
+ /* update state */
+ add v8.2d, v8.2d, v0.2d
+ add v9.2d, v9.2d, v1.2d
+ add v10.2d, v10.2d, v2.2d
+ add v11.2d, v11.2d, v3.2d
+
+ cond_yield 3f, x4, x5
+ /* handled all input blocks? */
+ cbnz w2, 0b
+
+ /* store new state */
+3: st1 {v8.2d-v11.2d}, [x0]
+ mov w0, w2
+ ret
+SYM_FUNC_END(sha512_ce_transform)
diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c
new file mode 100644
index 000000000000..94cb7580deb7
--- /dev/null
+++ b/arch/arm64/crypto/sha512-ce-glue.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * sha512-ce-glue.c - SHA-384/SHA-512 using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/unaligned.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/sha2.h>
+#include <crypto/sha512_base.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("sha384");
+MODULE_ALIAS_CRYPTO("sha512");
+
+asmlinkage int sha512_ce_transform(struct sha512_state *sst, u8 const *src,
+ int blocks);
+
+asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks);
+
+static void __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
+ int blocks)
+{
+ while (blocks) {
+ int rem;
+
+ kernel_neon_begin();
+ rem = sha512_ce_transform(sst, src, blocks);
+ kernel_neon_end();
+ src += (blocks - rem) * SHA512_BLOCK_SIZE;
+ blocks = rem;
+ }
+}
+
+static void __sha512_block_data_order(struct sha512_state *sst, u8 const *src,
+ int blocks)
+{
+ sha512_block_data_order(sst->state, src, blocks);
+}
+
+static int sha512_ce_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ sha512_block_fn *fn = crypto_simd_usable() ? __sha512_ce_transform
+ : __sha512_block_data_order;
+
+ sha512_base_do_update(desc, data, len, fn);
+ return 0;
+}
+
+static int sha512_ce_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ sha512_block_fn *fn = crypto_simd_usable() ? __sha512_ce_transform
+ : __sha512_block_data_order;
+
+ sha512_base_do_update(desc, data, len, fn);
+ sha512_base_do_finalize(desc, fn);
+ return sha512_base_finish(desc, out);
+}
+
+static int sha512_ce_final(struct shash_desc *desc, u8 *out)
+{
+ sha512_block_fn *fn = crypto_simd_usable() ? __sha512_ce_transform
+ : __sha512_block_data_order;
+
+ sha512_base_do_finalize(desc, fn);
+ return sha512_base_finish(desc, out);
+}
+
+static struct shash_alg algs[] = { {
+ .init = sha384_base_init,
+ .update = sha512_ce_update,
+ .final = sha512_ce_final,
+ .finup = sha512_ce_finup,
+ .descsize = sizeof(struct sha512_state),
+ .digestsize = SHA384_DIGEST_SIZE,
+ .base.cra_name = "sha384",
+ .base.cra_driver_name = "sha384-ce",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = SHA512_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+}, {
+ .init = sha512_base_init,
+ .update = sha512_ce_update,
+ .final = sha512_ce_final,
+ .finup = sha512_ce_finup,
+ .descsize = sizeof(struct sha512_state),
+ .digestsize = SHA512_DIGEST_SIZE,
+ .base.cra_name = "sha512",
+ .base.cra_driver_name = "sha512-ce",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = SHA512_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+} };
+
+static int __init sha512_ce_mod_init(void)
+{
+ return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit sha512_ce_mod_fini(void)
+{
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+
+module_cpu_feature_match(SHA512, sha512_ce_mod_init);
+module_exit(sha512_ce_mod_fini);
diff --git a/arch/arm64/crypto/sha512-core.S_shipped b/arch/arm64/crypto/sha512-core.S_shipped
deleted file mode 100644
index bd0f59f06c9d..000000000000
--- a/arch/arm64/crypto/sha512-core.S_shipped
+++ /dev/null
@@ -1,1085 +0,0 @@
-// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the OpenSSL license (the "License"). You may not use
-// this file except in compliance with the License. You can obtain a copy
-// in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-
-// ====================================================================
-// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-// project. The module is, however, dual licensed under OpenSSL and
-// CRYPTOGAMS licenses depending on where you obtain it. For further
-// details see http://www.openssl.org/~appro/cryptogams/.
-//
-// Permission to use under GPLv2 terms is granted.
-// ====================================================================
-//
-// SHA256/512 for ARMv8.
-//
-// Performance in cycles per processed byte and improvement coefficient
-// over code generated with "default" compiler:
-//
-// SHA256-hw SHA256(*) SHA512
-// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
-// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
-// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
-// Denver 2.01 10.5 (+26%) 6.70 (+8%)
-// X-Gene 20.0 (+100%) 12.8 (+300%(***))
-// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
-//
-// (*) Software SHA256 results are of lesser relevance, presented
-// mostly for informational purposes.
-// (**) The result is a trade-off: it's possible to improve it by
-// 10% (or by 1 cycle per round), but at the cost of 20% loss
-// on Cortex-A53 (or by 4 cycles per round).
-// (***) Super-impressive coefficients over gcc-generated code are
-// indication of some compiler "pathology", most notably code
-// generated with -mgeneral-regs-only is significanty faster
-// and the gap is only 40-90%.
-//
-// October 2016.
-//
-// Originally it was reckoned that it makes no sense to implement NEON
-// version of SHA256 for 64-bit processors. This is because performance
-// improvement on most wide-spread Cortex-A5x processors was observed
-// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
-// observed that 32-bit NEON SHA256 performs significantly better than
-// 64-bit scalar version on *some* of the more recent processors. As
-// result 64-bit NEON version of SHA256 was added to provide best
-// all-round performance. For example it executes ~30% faster on X-Gene
-// and Mongoose. [For reference, NEON version of SHA512 is bound to
-// deliver much less improvement, likely *negative* on Cortex-A5x.
-// Which is why NEON support is limited to SHA256.]
-
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#endif
-
-.text
-
-.extern OPENSSL_armcap_P
-.globl sha512_block_data_order
-.type sha512_block_data_order,%function
-.align 6
-sha512_block_data_order:
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
-
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#4*8
-
- ldp x20,x21,[x0] // load context
- ldp x22,x23,[x0,#2*8]
- ldp x24,x25,[x0,#4*8]
- add x2,x1,x2,lsl#7 // end of input
- ldp x26,x27,[x0,#6*8]
- adr x30,.LK512
- stp x0,x2,[x29,#96]
-
-.Loop:
- ldp x3,x4,[x1],#2*8
- ldr x19,[x30],#8 // *K++
- eor x28,x21,x22 // magic seed
- str x1,[x29,#112]
-#ifndef __AARCH64EB__
- rev x3,x3 // 0
-#endif
- ror x16,x24,#14
- add x27,x27,x19 // h+=K[i]
- eor x6,x24,x24,ror#23
- and x17,x25,x24
- bic x19,x26,x24
- add x27,x27,x3 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x20,x21 // a^b, b^c in next round
- eor x16,x16,x6,ror#18 // Sigma1(e)
- ror x6,x20,#28
- add x27,x27,x17 // h+=Ch(e,f,g)
- eor x17,x20,x20,ror#5
- add x27,x27,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x23,x23,x27 // d+=h
- eor x28,x28,x21 // Maj(a,b,c)
- eor x17,x6,x17,ror#34 // Sigma0(a)
- add x27,x27,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x27,x27,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x4,x4 // 1
-#endif
- ldp x5,x6,[x1],#2*8
- add x27,x27,x17 // h+=Sigma0(a)
- ror x16,x23,#14
- add x26,x26,x28 // h+=K[i]
- eor x7,x23,x23,ror#23
- and x17,x24,x23
- bic x28,x25,x23
- add x26,x26,x4 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x27,x20 // a^b, b^c in next round
- eor x16,x16,x7,ror#18 // Sigma1(e)
- ror x7,x27,#28
- add x26,x26,x17 // h+=Ch(e,f,g)
- eor x17,x27,x27,ror#5
- add x26,x26,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x22,x22,x26 // d+=h
- eor x19,x19,x20 // Maj(a,b,c)
- eor x17,x7,x17,ror#34 // Sigma0(a)
- add x26,x26,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x26,x26,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x5,x5 // 2
-#endif
- add x26,x26,x17 // h+=Sigma0(a)
- ror x16,x22,#14
- add x25,x25,x19 // h+=K[i]
- eor x8,x22,x22,ror#23
- and x17,x23,x22
- bic x19,x24,x22
- add x25,x25,x5 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x26,x27 // a^b, b^c in next round
- eor x16,x16,x8,ror#18 // Sigma1(e)
- ror x8,x26,#28
- add x25,x25,x17 // h+=Ch(e,f,g)
- eor x17,x26,x26,ror#5
- add x25,x25,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x21,x21,x25 // d+=h
- eor x28,x28,x27 // Maj(a,b,c)
- eor x17,x8,x17,ror#34 // Sigma0(a)
- add x25,x25,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x25,x25,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x6,x6 // 3
-#endif
- ldp x7,x8,[x1],#2*8
- add x25,x25,x17 // h+=Sigma0(a)
- ror x16,x21,#14
- add x24,x24,x28 // h+=K[i]
- eor x9,x21,x21,ror#23
- and x17,x22,x21
- bic x28,x23,x21
- add x24,x24,x6 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x25,x26 // a^b, b^c in next round
- eor x16,x16,x9,ror#18 // Sigma1(e)
- ror x9,x25,#28
- add x24,x24,x17 // h+=Ch(e,f,g)
- eor x17,x25,x25,ror#5
- add x24,x24,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x20,x20,x24 // d+=h
- eor x19,x19,x26 // Maj(a,b,c)
- eor x17,x9,x17,ror#34 // Sigma0(a)
- add x24,x24,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x24,x24,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x7,x7 // 4
-#endif
- add x24,x24,x17 // h+=Sigma0(a)
- ror x16,x20,#14
- add x23,x23,x19 // h+=K[i]
- eor x10,x20,x20,ror#23
- and x17,x21,x20
- bic x19,x22,x20
- add x23,x23,x7 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x24,x25 // a^b, b^c in next round
- eor x16,x16,x10,ror#18 // Sigma1(e)
- ror x10,x24,#28
- add x23,x23,x17 // h+=Ch(e,f,g)
- eor x17,x24,x24,ror#5
- add x23,x23,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x27,x27,x23 // d+=h
- eor x28,x28,x25 // Maj(a,b,c)
- eor x17,x10,x17,ror#34 // Sigma0(a)
- add x23,x23,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x23,x23,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x8,x8 // 5
-#endif
- ldp x9,x10,[x1],#2*8
- add x23,x23,x17 // h+=Sigma0(a)
- ror x16,x27,#14
- add x22,x22,x28 // h+=K[i]
- eor x11,x27,x27,ror#23
- and x17,x20,x27
- bic x28,x21,x27
- add x22,x22,x8 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x23,x24 // a^b, b^c in next round
- eor x16,x16,x11,ror#18 // Sigma1(e)
- ror x11,x23,#28
- add x22,x22,x17 // h+=Ch(e,f,g)
- eor x17,x23,x23,ror#5
- add x22,x22,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x26,x26,x22 // d+=h
- eor x19,x19,x24 // Maj(a,b,c)
- eor x17,x11,x17,ror#34 // Sigma0(a)
- add x22,x22,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x22,x22,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x9,x9 // 6
-#endif
- add x22,x22,x17 // h+=Sigma0(a)
- ror x16,x26,#14
- add x21,x21,x19 // h+=K[i]
- eor x12,x26,x26,ror#23
- and x17,x27,x26
- bic x19,x20,x26
- add x21,x21,x9 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x22,x23 // a^b, b^c in next round
- eor x16,x16,x12,ror#18 // Sigma1(e)
- ror x12,x22,#28
- add x21,x21,x17 // h+=Ch(e,f,g)
- eor x17,x22,x22,ror#5
- add x21,x21,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x25,x25,x21 // d+=h
- eor x28,x28,x23 // Maj(a,b,c)
- eor x17,x12,x17,ror#34 // Sigma0(a)
- add x21,x21,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x21,x21,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x10,x10 // 7
-#endif
- ldp x11,x12,[x1],#2*8
- add x21,x21,x17 // h+=Sigma0(a)
- ror x16,x25,#14
- add x20,x20,x28 // h+=K[i]
- eor x13,x25,x25,ror#23
- and x17,x26,x25
- bic x28,x27,x25
- add x20,x20,x10 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x21,x22 // a^b, b^c in next round
- eor x16,x16,x13,ror#18 // Sigma1(e)
- ror x13,x21,#28
- add x20,x20,x17 // h+=Ch(e,f,g)
- eor x17,x21,x21,ror#5
- add x20,x20,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x24,x24,x20 // d+=h
- eor x19,x19,x22 // Maj(a,b,c)
- eor x17,x13,x17,ror#34 // Sigma0(a)
- add x20,x20,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x20,x20,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x11,x11 // 8
-#endif
- add x20,x20,x17 // h+=Sigma0(a)
- ror x16,x24,#14
- add x27,x27,x19 // h+=K[i]
- eor x14,x24,x24,ror#23
- and x17,x25,x24
- bic x19,x26,x24
- add x27,x27,x11 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x20,x21 // a^b, b^c in next round
- eor x16,x16,x14,ror#18 // Sigma1(e)
- ror x14,x20,#28
- add x27,x27,x17 // h+=Ch(e,f,g)
- eor x17,x20,x20,ror#5
- add x27,x27,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x23,x23,x27 // d+=h
- eor x28,x28,x21 // Maj(a,b,c)
- eor x17,x14,x17,ror#34 // Sigma0(a)
- add x27,x27,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x27,x27,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x12,x12 // 9
-#endif
- ldp x13,x14,[x1],#2*8
- add x27,x27,x17 // h+=Sigma0(a)
- ror x16,x23,#14
- add x26,x26,x28 // h+=K[i]
- eor x15,x23,x23,ror#23
- and x17,x24,x23
- bic x28,x25,x23
- add x26,x26,x12 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x27,x20 // a^b, b^c in next round
- eor x16,x16,x15,ror#18 // Sigma1(e)
- ror x15,x27,#28
- add x26,x26,x17 // h+=Ch(e,f,g)
- eor x17,x27,x27,ror#5
- add x26,x26,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x22,x22,x26 // d+=h
- eor x19,x19,x20 // Maj(a,b,c)
- eor x17,x15,x17,ror#34 // Sigma0(a)
- add x26,x26,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x26,x26,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x13,x13 // 10
-#endif
- add x26,x26,x17 // h+=Sigma0(a)
- ror x16,x22,#14
- add x25,x25,x19 // h+=K[i]
- eor x0,x22,x22,ror#23
- and x17,x23,x22
- bic x19,x24,x22
- add x25,x25,x13 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x26,x27 // a^b, b^c in next round
- eor x16,x16,x0,ror#18 // Sigma1(e)
- ror x0,x26,#28
- add x25,x25,x17 // h+=Ch(e,f,g)
- eor x17,x26,x26,ror#5
- add x25,x25,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x21,x21,x25 // d+=h
- eor x28,x28,x27 // Maj(a,b,c)
- eor x17,x0,x17,ror#34 // Sigma0(a)
- add x25,x25,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x25,x25,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x14,x14 // 11
-#endif
- ldp x15,x0,[x1],#2*8
- add x25,x25,x17 // h+=Sigma0(a)
- str x6,[sp,#24]
- ror x16,x21,#14
- add x24,x24,x28 // h+=K[i]
- eor x6,x21,x21,ror#23
- and x17,x22,x21
- bic x28,x23,x21
- add x24,x24,x14 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x25,x26 // a^b, b^c in next round
- eor x16,x16,x6,ror#18 // Sigma1(e)
- ror x6,x25,#28
- add x24,x24,x17 // h+=Ch(e,f,g)
- eor x17,x25,x25,ror#5
- add x24,x24,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x20,x20,x24 // d+=h
- eor x19,x19,x26 // Maj(a,b,c)
- eor x17,x6,x17,ror#34 // Sigma0(a)
- add x24,x24,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x24,x24,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x15,x15 // 12
-#endif
- add x24,x24,x17 // h+=Sigma0(a)
- str x7,[sp,#0]
- ror x16,x20,#14
- add x23,x23,x19 // h+=K[i]
- eor x7,x20,x20,ror#23
- and x17,x21,x20
- bic x19,x22,x20
- add x23,x23,x15 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x24,x25 // a^b, b^c in next round
- eor x16,x16,x7,ror#18 // Sigma1(e)
- ror x7,x24,#28
- add x23,x23,x17 // h+=Ch(e,f,g)
- eor x17,x24,x24,ror#5
- add x23,x23,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x27,x27,x23 // d+=h
- eor x28,x28,x25 // Maj(a,b,c)
- eor x17,x7,x17,ror#34 // Sigma0(a)
- add x23,x23,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x23,x23,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x0,x0 // 13
-#endif
- ldp x1,x2,[x1]
- add x23,x23,x17 // h+=Sigma0(a)
- str x8,[sp,#8]
- ror x16,x27,#14
- add x22,x22,x28 // h+=K[i]
- eor x8,x27,x27,ror#23
- and x17,x20,x27
- bic x28,x21,x27
- add x22,x22,x0 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x23,x24 // a^b, b^c in next round
- eor x16,x16,x8,ror#18 // Sigma1(e)
- ror x8,x23,#28
- add x22,x22,x17 // h+=Ch(e,f,g)
- eor x17,x23,x23,ror#5
- add x22,x22,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x26,x26,x22 // d+=h
- eor x19,x19,x24 // Maj(a,b,c)
- eor x17,x8,x17,ror#34 // Sigma0(a)
- add x22,x22,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x22,x22,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x1,x1 // 14
-#endif
- ldr x6,[sp,#24]
- add x22,x22,x17 // h+=Sigma0(a)
- str x9,[sp,#16]
- ror x16,x26,#14
- add x21,x21,x19 // h+=K[i]
- eor x9,x26,x26,ror#23
- and x17,x27,x26
- bic x19,x20,x26
- add x21,x21,x1 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x22,x23 // a^b, b^c in next round
- eor x16,x16,x9,ror#18 // Sigma1(e)
- ror x9,x22,#28
- add x21,x21,x17 // h+=Ch(e,f,g)
- eor x17,x22,x22,ror#5
- add x21,x21,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x25,x25,x21 // d+=h
- eor x28,x28,x23 // Maj(a,b,c)
- eor x17,x9,x17,ror#34 // Sigma0(a)
- add x21,x21,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x21,x21,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x2,x2 // 15
-#endif
- ldr x7,[sp,#0]
- add x21,x21,x17 // h+=Sigma0(a)
- str x10,[sp,#24]
- ror x16,x25,#14
- add x20,x20,x28 // h+=K[i]
- ror x9,x4,#1
- and x17,x26,x25
- ror x8,x1,#19
- bic x28,x27,x25
- ror x10,x21,#28
- add x20,x20,x2 // h+=X[i]
- eor x16,x16,x25,ror#18
- eor x9,x9,x4,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x21,x22 // a^b, b^c in next round
- eor x16,x16,x25,ror#41 // Sigma1(e)
- eor x10,x10,x21,ror#34
- add x20,x20,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x8,x8,x1,ror#61
- eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
- add x20,x20,x16 // h+=Sigma1(e)
- eor x19,x19,x22 // Maj(a,b,c)
- eor x17,x10,x21,ror#39 // Sigma0(a)
- eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
- add x3,x3,x12
- add x24,x24,x20 // d+=h
- add x20,x20,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x3,x3,x9
- add x20,x20,x17 // h+=Sigma0(a)
- add x3,x3,x8
-.Loop_16_xx:
- ldr x8,[sp,#8]
- str x11,[sp,#0]
- ror x16,x24,#14
- add x27,x27,x19 // h+=K[i]
- ror x10,x5,#1
- and x17,x25,x24
- ror x9,x2,#19
- bic x19,x26,x24
- ror x11,x20,#28
- add x27,x27,x3 // h+=X[i]
- eor x16,x16,x24,ror#18
- eor x10,x10,x5,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x20,x21 // a^b, b^c in next round
- eor x16,x16,x24,ror#41 // Sigma1(e)
- eor x11,x11,x20,ror#34
- add x27,x27,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x9,x9,x2,ror#61
- eor x10,x10,x5,lsr#7 // sigma0(X[i+1])
- add x27,x27,x16 // h+=Sigma1(e)
- eor x28,x28,x21 // Maj(a,b,c)
- eor x17,x11,x20,ror#39 // Sigma0(a)
- eor x9,x9,x2,lsr#6 // sigma1(X[i+14])
- add x4,x4,x13
- add x23,x23,x27 // d+=h
- add x27,x27,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x4,x4,x10
- add x27,x27,x17 // h+=Sigma0(a)
- add x4,x4,x9
- ldr x9,[sp,#16]
- str x12,[sp,#8]
- ror x16,x23,#14
- add x26,x26,x28 // h+=K[i]
- ror x11,x6,#1
- and x17,x24,x23
- ror x10,x3,#19
- bic x28,x25,x23
- ror x12,x27,#28
- add x26,x26,x4 // h+=X[i]
- eor x16,x16,x23,ror#18
- eor x11,x11,x6,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x27,x20 // a^b, b^c in next round
- eor x16,x16,x23,ror#41 // Sigma1(e)
- eor x12,x12,x27,ror#34
- add x26,x26,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x10,x10,x3,ror#61
- eor x11,x11,x6,lsr#7 // sigma0(X[i+1])
- add x26,x26,x16 // h+=Sigma1(e)
- eor x19,x19,x20 // Maj(a,b,c)
- eor x17,x12,x27,ror#39 // Sigma0(a)
- eor x10,x10,x3,lsr#6 // sigma1(X[i+14])
- add x5,x5,x14
- add x22,x22,x26 // d+=h
- add x26,x26,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x5,x5,x11
- add x26,x26,x17 // h+=Sigma0(a)
- add x5,x5,x10
- ldr x10,[sp,#24]
- str x13,[sp,#16]
- ror x16,x22,#14
- add x25,x25,x19 // h+=K[i]
- ror x12,x7,#1
- and x17,x23,x22
- ror x11,x4,#19
- bic x19,x24,x22
- ror x13,x26,#28
- add x25,x25,x5 // h+=X[i]
- eor x16,x16,x22,ror#18
- eor x12,x12,x7,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x26,x27 // a^b, b^c in next round
- eor x16,x16,x22,ror#41 // Sigma1(e)
- eor x13,x13,x26,ror#34
- add x25,x25,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x11,x11,x4,ror#61
- eor x12,x12,x7,lsr#7 // sigma0(X[i+1])
- add x25,x25,x16 // h+=Sigma1(e)
- eor x28,x28,x27 // Maj(a,b,c)
- eor x17,x13,x26,ror#39 // Sigma0(a)
- eor x11,x11,x4,lsr#6 // sigma1(X[i+14])
- add x6,x6,x15
- add x21,x21,x25 // d+=h
- add x25,x25,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x6,x6,x12
- add x25,x25,x17 // h+=Sigma0(a)
- add x6,x6,x11
- ldr x11,[sp,#0]
- str x14,[sp,#24]
- ror x16,x21,#14
- add x24,x24,x28 // h+=K[i]
- ror x13,x8,#1
- and x17,x22,x21
- ror x12,x5,#19
- bic x28,x23,x21
- ror x14,x25,#28
- add x24,x24,x6 // h+=X[i]
- eor x16,x16,x21,ror#18
- eor x13,x13,x8,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x25,x26 // a^b, b^c in next round
- eor x16,x16,x21,ror#41 // Sigma1(e)
- eor x14,x14,x25,ror#34
- add x24,x24,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x12,x12,x5,ror#61
- eor x13,x13,x8,lsr#7 // sigma0(X[i+1])
- add x24,x24,x16 // h+=Sigma1(e)
- eor x19,x19,x26 // Maj(a,b,c)
- eor x17,x14,x25,ror#39 // Sigma0(a)
- eor x12,x12,x5,lsr#6 // sigma1(X[i+14])
- add x7,x7,x0
- add x20,x20,x24 // d+=h
- add x24,x24,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x7,x7,x13
- add x24,x24,x17 // h+=Sigma0(a)
- add x7,x7,x12
- ldr x12,[sp,#8]
- str x15,[sp,#0]
- ror x16,x20,#14
- add x23,x23,x19 // h+=K[i]
- ror x14,x9,#1
- and x17,x21,x20
- ror x13,x6,#19
- bic x19,x22,x20
- ror x15,x24,#28
- add x23,x23,x7 // h+=X[i]
- eor x16,x16,x20,ror#18
- eor x14,x14,x9,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x24,x25 // a^b, b^c in next round
- eor x16,x16,x20,ror#41 // Sigma1(e)
- eor x15,x15,x24,ror#34
- add x23,x23,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x13,x13,x6,ror#61
- eor x14,x14,x9,lsr#7 // sigma0(X[i+1])
- add x23,x23,x16 // h+=Sigma1(e)
- eor x28,x28,x25 // Maj(a,b,c)
- eor x17,x15,x24,ror#39 // Sigma0(a)
- eor x13,x13,x6,lsr#6 // sigma1(X[i+14])
- add x8,x8,x1
- add x27,x27,x23 // d+=h
- add x23,x23,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x8,x8,x14
- add x23,x23,x17 // h+=Sigma0(a)
- add x8,x8,x13
- ldr x13,[sp,#16]
- str x0,[sp,#8]
- ror x16,x27,#14
- add x22,x22,x28 // h+=K[i]
- ror x15,x10,#1
- and x17,x20,x27
- ror x14,x7,#19
- bic x28,x21,x27
- ror x0,x23,#28
- add x22,x22,x8 // h+=X[i]
- eor x16,x16,x27,ror#18
- eor x15,x15,x10,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x23,x24 // a^b, b^c in next round
- eor x16,x16,x27,ror#41 // Sigma1(e)
- eor x0,x0,x23,ror#34
- add x22,x22,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x14,x14,x7,ror#61
- eor x15,x15,x10,lsr#7 // sigma0(X[i+1])
- add x22,x22,x16 // h+=Sigma1(e)
- eor x19,x19,x24 // Maj(a,b,c)
- eor x17,x0,x23,ror#39 // Sigma0(a)
- eor x14,x14,x7,lsr#6 // sigma1(X[i+14])
- add x9,x9,x2
- add x26,x26,x22 // d+=h
- add x22,x22,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x9,x9,x15
- add x22,x22,x17 // h+=Sigma0(a)
- add x9,x9,x14
- ldr x14,[sp,#24]
- str x1,[sp,#16]
- ror x16,x26,#14
- add x21,x21,x19 // h+=K[i]
- ror x0,x11,#1
- and x17,x27,x26
- ror x15,x8,#19
- bic x19,x20,x26
- ror x1,x22,#28
- add x21,x21,x9 // h+=X[i]
- eor x16,x16,x26,ror#18
- eor x0,x0,x11,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x22,x23 // a^b, b^c in next round
- eor x16,x16,x26,ror#41 // Sigma1(e)
- eor x1,x1,x22,ror#34
- add x21,x21,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x15,x15,x8,ror#61
- eor x0,x0,x11,lsr#7 // sigma0(X[i+1])
- add x21,x21,x16 // h+=Sigma1(e)
- eor x28,x28,x23 // Maj(a,b,c)
- eor x17,x1,x22,ror#39 // Sigma0(a)
- eor x15,x15,x8,lsr#6 // sigma1(X[i+14])
- add x10,x10,x3
- add x25,x25,x21 // d+=h
- add x21,x21,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x10,x10,x0
- add x21,x21,x17 // h+=Sigma0(a)
- add x10,x10,x15
- ldr x15,[sp,#0]
- str x2,[sp,#24]
- ror x16,x25,#14
- add x20,x20,x28 // h+=K[i]
- ror x1,x12,#1
- and x17,x26,x25
- ror x0,x9,#19
- bic x28,x27,x25
- ror x2,x21,#28
- add x20,x20,x10 // h+=X[i]
- eor x16,x16,x25,ror#18
- eor x1,x1,x12,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x21,x22 // a^b, b^c in next round
- eor x16,x16,x25,ror#41 // Sigma1(e)
- eor x2,x2,x21,ror#34
- add x20,x20,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x0,x0,x9,ror#61
- eor x1,x1,x12,lsr#7 // sigma0(X[i+1])
- add x20,x20,x16 // h+=Sigma1(e)
- eor x19,x19,x22 // Maj(a,b,c)
- eor x17,x2,x21,ror#39 // Sigma0(a)
- eor x0,x0,x9,lsr#6 // sigma1(X[i+14])
- add x11,x11,x4
- add x24,x24,x20 // d+=h
- add x20,x20,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x11,x11,x1
- add x20,x20,x17 // h+=Sigma0(a)
- add x11,x11,x0
- ldr x0,[sp,#8]
- str x3,[sp,#0]
- ror x16,x24,#14
- add x27,x27,x19 // h+=K[i]
- ror x2,x13,#1
- and x17,x25,x24
- ror x1,x10,#19
- bic x19,x26,x24
- ror x3,x20,#28
- add x27,x27,x11 // h+=X[i]
- eor x16,x16,x24,ror#18
- eor x2,x2,x13,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x20,x21 // a^b, b^c in next round
- eor x16,x16,x24,ror#41 // Sigma1(e)
- eor x3,x3,x20,ror#34
- add x27,x27,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x1,x1,x10,ror#61
- eor x2,x2,x13,lsr#7 // sigma0(X[i+1])
- add x27,x27,x16 // h+=Sigma1(e)
- eor x28,x28,x21 // Maj(a,b,c)
- eor x17,x3,x20,ror#39 // Sigma0(a)
- eor x1,x1,x10,lsr#6 // sigma1(X[i+14])
- add x12,x12,x5
- add x23,x23,x27 // d+=h
- add x27,x27,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x12,x12,x2
- add x27,x27,x17 // h+=Sigma0(a)
- add x12,x12,x1
- ldr x1,[sp,#16]
- str x4,[sp,#8]
- ror x16,x23,#14
- add x26,x26,x28 // h+=K[i]
- ror x3,x14,#1
- and x17,x24,x23
- ror x2,x11,#19
- bic x28,x25,x23
- ror x4,x27,#28
- add x26,x26,x12 // h+=X[i]
- eor x16,x16,x23,ror#18
- eor x3,x3,x14,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x27,x20 // a^b, b^c in next round
- eor x16,x16,x23,ror#41 // Sigma1(e)
- eor x4,x4,x27,ror#34
- add x26,x26,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x2,x2,x11,ror#61
- eor x3,x3,x14,lsr#7 // sigma0(X[i+1])
- add x26,x26,x16 // h+=Sigma1(e)
- eor x19,x19,x20 // Maj(a,b,c)
- eor x17,x4,x27,ror#39 // Sigma0(a)
- eor x2,x2,x11,lsr#6 // sigma1(X[i+14])
- add x13,x13,x6
- add x22,x22,x26 // d+=h
- add x26,x26,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x13,x13,x3
- add x26,x26,x17 // h+=Sigma0(a)
- add x13,x13,x2
- ldr x2,[sp,#24]
- str x5,[sp,#16]
- ror x16,x22,#14
- add x25,x25,x19 // h+=K[i]
- ror x4,x15,#1
- and x17,x23,x22
- ror x3,x12,#19
- bic x19,x24,x22
- ror x5,x26,#28
- add x25,x25,x13 // h+=X[i]
- eor x16,x16,x22,ror#18
- eor x4,x4,x15,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x26,x27 // a^b, b^c in next round
- eor x16,x16,x22,ror#41 // Sigma1(e)
- eor x5,x5,x26,ror#34
- add x25,x25,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x3,x3,x12,ror#61
- eor x4,x4,x15,lsr#7 // sigma0(X[i+1])
- add x25,x25,x16 // h+=Sigma1(e)
- eor x28,x28,x27 // Maj(a,b,c)
- eor x17,x5,x26,ror#39 // Sigma0(a)
- eor x3,x3,x12,lsr#6 // sigma1(X[i+14])
- add x14,x14,x7
- add x21,x21,x25 // d+=h
- add x25,x25,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x14,x14,x4
- add x25,x25,x17 // h+=Sigma0(a)
- add x14,x14,x3
- ldr x3,[sp,#0]
- str x6,[sp,#24]
- ror x16,x21,#14
- add x24,x24,x28 // h+=K[i]
- ror x5,x0,#1
- and x17,x22,x21
- ror x4,x13,#19
- bic x28,x23,x21
- ror x6,x25,#28
- add x24,x24,x14 // h+=X[i]
- eor x16,x16,x21,ror#18
- eor x5,x5,x0,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x25,x26 // a^b, b^c in next round
- eor x16,x16,x21,ror#41 // Sigma1(e)
- eor x6,x6,x25,ror#34
- add x24,x24,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x4,x4,x13,ror#61
- eor x5,x5,x0,lsr#7 // sigma0(X[i+1])
- add x24,x24,x16 // h+=Sigma1(e)
- eor x19,x19,x26 // Maj(a,b,c)
- eor x17,x6,x25,ror#39 // Sigma0(a)
- eor x4,x4,x13,lsr#6 // sigma1(X[i+14])
- add x15,x15,x8
- add x20,x20,x24 // d+=h
- add x24,x24,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x15,x15,x5
- add x24,x24,x17 // h+=Sigma0(a)
- add x15,x15,x4
- ldr x4,[sp,#8]
- str x7,[sp,#0]
- ror x16,x20,#14
- add x23,x23,x19 // h+=K[i]
- ror x6,x1,#1
- and x17,x21,x20
- ror x5,x14,#19
- bic x19,x22,x20
- ror x7,x24,#28
- add x23,x23,x15 // h+=X[i]
- eor x16,x16,x20,ror#18
- eor x6,x6,x1,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x24,x25 // a^b, b^c in next round
- eor x16,x16,x20,ror#41 // Sigma1(e)
- eor x7,x7,x24,ror#34
- add x23,x23,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x5,x5,x14,ror#61
- eor x6,x6,x1,lsr#7 // sigma0(X[i+1])
- add x23,x23,x16 // h+=Sigma1(e)
- eor x28,x28,x25 // Maj(a,b,c)
- eor x17,x7,x24,ror#39 // Sigma0(a)
- eor x5,x5,x14,lsr#6 // sigma1(X[i+14])
- add x0,x0,x9
- add x27,x27,x23 // d+=h
- add x23,x23,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x0,x0,x6
- add x23,x23,x17 // h+=Sigma0(a)
- add x0,x0,x5
- ldr x5,[sp,#16]
- str x8,[sp,#8]
- ror x16,x27,#14
- add x22,x22,x28 // h+=K[i]
- ror x7,x2,#1
- and x17,x20,x27
- ror x6,x15,#19
- bic x28,x21,x27
- ror x8,x23,#28
- add x22,x22,x0 // h+=X[i]
- eor x16,x16,x27,ror#18
- eor x7,x7,x2,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x23,x24 // a^b, b^c in next round
- eor x16,x16,x27,ror#41 // Sigma1(e)
- eor x8,x8,x23,ror#34
- add x22,x22,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x6,x6,x15,ror#61
- eor x7,x7,x2,lsr#7 // sigma0(X[i+1])
- add x22,x22,x16 // h+=Sigma1(e)
- eor x19,x19,x24 // Maj(a,b,c)
- eor x17,x8,x23,ror#39 // Sigma0(a)
- eor x6,x6,x15,lsr#6 // sigma1(X[i+14])
- add x1,x1,x10
- add x26,x26,x22 // d+=h
- add x22,x22,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x1,x1,x7
- add x22,x22,x17 // h+=Sigma0(a)
- add x1,x1,x6
- ldr x6,[sp,#24]
- str x9,[sp,#16]
- ror x16,x26,#14
- add x21,x21,x19 // h+=K[i]
- ror x8,x3,#1
- and x17,x27,x26
- ror x7,x0,#19
- bic x19,x20,x26
- ror x9,x22,#28
- add x21,x21,x1 // h+=X[i]
- eor x16,x16,x26,ror#18
- eor x8,x8,x3,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x22,x23 // a^b, b^c in next round
- eor x16,x16,x26,ror#41 // Sigma1(e)
- eor x9,x9,x22,ror#34
- add x21,x21,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x7,x7,x0,ror#61
- eor x8,x8,x3,lsr#7 // sigma0(X[i+1])
- add x21,x21,x16 // h+=Sigma1(e)
- eor x28,x28,x23 // Maj(a,b,c)
- eor x17,x9,x22,ror#39 // Sigma0(a)
- eor x7,x7,x0,lsr#6 // sigma1(X[i+14])
- add x2,x2,x11
- add x25,x25,x21 // d+=h
- add x21,x21,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x2,x2,x8
- add x21,x21,x17 // h+=Sigma0(a)
- add x2,x2,x7
- ldr x7,[sp,#0]
- str x10,[sp,#24]
- ror x16,x25,#14
- add x20,x20,x28 // h+=K[i]
- ror x9,x4,#1
- and x17,x26,x25
- ror x8,x1,#19
- bic x28,x27,x25
- ror x10,x21,#28
- add x20,x20,x2 // h+=X[i]
- eor x16,x16,x25,ror#18
- eor x9,x9,x4,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x21,x22 // a^b, b^c in next round
- eor x16,x16,x25,ror#41 // Sigma1(e)
- eor x10,x10,x21,ror#34
- add x20,x20,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x8,x8,x1,ror#61
- eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
- add x20,x20,x16 // h+=Sigma1(e)
- eor x19,x19,x22 // Maj(a,b,c)
- eor x17,x10,x21,ror#39 // Sigma0(a)
- eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
- add x3,x3,x12
- add x24,x24,x20 // d+=h
- add x20,x20,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x3,x3,x9
- add x20,x20,x17 // h+=Sigma0(a)
- add x3,x3,x8
- cbnz x19,.Loop_16_xx
-
- ldp x0,x2,[x29,#96]
- ldr x1,[x29,#112]
- sub x30,x30,#648 // rewind
-
- ldp x3,x4,[x0]
- ldp x5,x6,[x0,#2*8]
- add x1,x1,#14*8 // advance input pointer
- ldp x7,x8,[x0,#4*8]
- add x20,x20,x3
- ldp x9,x10,[x0,#6*8]
- add x21,x21,x4
- add x22,x22,x5
- add x23,x23,x6
- stp x20,x21,[x0]
- add x24,x24,x7
- add x25,x25,x8
- stp x22,x23,[x0,#2*8]
- add x26,x26,x9
- add x27,x27,x10
- cmp x1,x2
- stp x24,x25,[x0,#4*8]
- stp x26,x27,[x0,#6*8]
- b.ne .Loop
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#4*8
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#128
- ret
-.size sha512_block_data_order,.-sha512_block_data_order
-
-.align 6
-.type .LK512,%object
-.LK512:
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
- .quad 0 // terminator
-.size .LK512,.-.LK512
-#ifndef __KERNEL__
-.align 3
-.LOPENSSL_armcap_P:
-# ifdef __ILP32__
- .long OPENSSL_armcap_P-.
-# else
- .quad OPENSSL_armcap_P-.
-# endif
-#endif
-.asciz "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
-.align 2
-#ifndef __KERNEL__
-.comm OPENSSL_armcap_P,4,4
-#endif
diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c
index aff35c9992a4..2acff1c7df5d 100644
--- a/arch/arm64/crypto/sha512-glue.c
+++ b/arch/arm64/crypto/sha512-glue.c
@@ -1,20 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux/arm64 port of the OpenSSL SHA512 implementation for AArch64
*
* Copyright (c) 2016 Linaro Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
*/
#include <crypto/internal/hash.h>
-#include <linux/cryptohash.h>
#include <linux/types.h>
#include <linux/string.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <crypto/sha512_base.h>
#include <asm/neon.h>
@@ -25,14 +19,21 @@ MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("sha384");
MODULE_ALIAS_CRYPTO("sha512");
-asmlinkage void sha512_block_data_order(u32 *digest, const void *data,
+asmlinkage void sha512_block_data_order(u64 *digest, const void *data,
unsigned int num_blks);
+EXPORT_SYMBOL(sha512_block_data_order);
+
+static void __sha512_block_data_order(struct sha512_state *sst, u8 const *src,
+ int blocks)
+{
+ sha512_block_data_order(sst->state, src, blocks);
+}
static int sha512_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
return sha512_base_do_update(desc, data, len,
- (sha512_block_fn *)sha512_block_data_order);
+ __sha512_block_data_order);
}
static int sha512_finup(struct shash_desc *desc, const u8 *data,
@@ -40,9 +41,8 @@ static int sha512_finup(struct shash_desc *desc, const u8 *data,
{
if (len)
sha512_base_do_update(desc, data, len,
- (sha512_block_fn *)sha512_block_data_order);
- sha512_base_do_finalize(desc,
- (sha512_block_fn *)sha512_block_data_order);
+ __sha512_block_data_order);
+ sha512_base_do_finalize(desc, __sha512_block_data_order);
return sha512_base_finish(desc, out);
}
@@ -62,7 +62,6 @@ static struct shash_alg algs[] = { {
.base.cra_name = "sha512",
.base.cra_driver_name = "sha512-arm64",
.base.cra_priority = 150,
- .base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = SHA512_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
}, {
@@ -75,7 +74,6 @@ static struct shash_alg algs[] = { {
.base.cra_name = "sha384",
.base.cra_driver_name = "sha384-arm64",
.base.cra_priority = 150,
- .base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = SHA384_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
} };
diff --git a/arch/arm64/crypto/sm3-ce-core.S b/arch/arm64/crypto/sm3-ce-core.S
new file mode 100644
index 000000000000..ca70cfacd0d0
--- /dev/null
+++ b/arch/arm64/crypto/sm3-ce-core.S
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * sm3-ce-core.S - SM3 secure hash using ARMv8.2 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <asm/assembler.h>
+
+ .irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+ .set .Lv\b\().4s, \b
+ .endr
+
+ .macro sm3partw1, rd, rn, rm
+ .inst 0xce60c000 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+ .endm
+
+ .macro sm3partw2, rd, rn, rm
+ .inst 0xce60c400 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+ .endm
+
+ .macro sm3ss1, rd, rn, rm, ra
+ .inst 0xce400000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+ .endm
+
+ .macro sm3tt1a, rd, rn, rm, imm2
+ .inst 0xce408000 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
+ .endm
+
+ .macro sm3tt1b, rd, rn, rm, imm2
+ .inst 0xce408400 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
+ .endm
+
+ .macro sm3tt2a, rd, rn, rm, imm2
+ .inst 0xce408800 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
+ .endm
+
+ .macro sm3tt2b, rd, rn, rm, imm2
+ .inst 0xce408c00 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
+ .endm
+
+ .macro round, ab, s0, t0, t1, i
+ sm3ss1 v5.4s, v8.4s, \t0\().4s, v9.4s
+ shl \t1\().4s, \t0\().4s, #1
+ sri \t1\().4s, \t0\().4s, #31
+ sm3tt1\ab v8.4s, v5.4s, v10.4s, \i
+ sm3tt2\ab v9.4s, v5.4s, \s0\().4s, \i
+ .endm
+
+ .macro qround, ab, s0, s1, s2, s3, s4
+ .ifnb \s4
+ ext \s4\().16b, \s1\().16b, \s2\().16b, #12
+ ext v6.16b, \s0\().16b, \s1\().16b, #12
+ ext v7.16b, \s2\().16b, \s3\().16b, #8
+ sm3partw1 \s4\().4s, \s0\().4s, \s3\().4s
+ .endif
+
+ eor v10.16b, \s0\().16b, \s1\().16b
+
+ round \ab, \s0, v11, v12, 0
+ round \ab, \s0, v12, v11, 1
+ round \ab, \s0, v11, v12, 2
+ round \ab, \s0, v12, v11, 3
+
+ .ifnb \s4
+ sm3partw2 \s4\().4s, v7.4s, v6.4s
+ .endif
+ .endm
+
+ /*
+ * void sm3_ce_transform(struct sm3_state *sst, u8 const *src,
+ * int blocks)
+ */
+ .text
+SYM_TYPED_FUNC_START(sm3_ce_transform)
+ /* load state */
+ ld1 {v8.4s-v9.4s}, [x0]
+ rev64 v8.4s, v8.4s
+ rev64 v9.4s, v9.4s
+ ext v8.16b, v8.16b, v8.16b, #8
+ ext v9.16b, v9.16b, v9.16b, #8
+
+ adr_l x8, .Lt
+ ldp s13, s14, [x8]
+
+ /* load input */
+0: ld1 {v0.16b-v3.16b}, [x1], #64
+ sub w2, w2, #1
+
+ mov v15.16b, v8.16b
+ mov v16.16b, v9.16b
+
+CPU_LE( rev32 v0.16b, v0.16b )
+CPU_LE( rev32 v1.16b, v1.16b )
+CPU_LE( rev32 v2.16b, v2.16b )
+CPU_LE( rev32 v3.16b, v3.16b )
+
+ ext v11.16b, v13.16b, v13.16b, #4
+
+ qround a, v0, v1, v2, v3, v4
+ qround a, v1, v2, v3, v4, v0
+ qround a, v2, v3, v4, v0, v1
+ qround a, v3, v4, v0, v1, v2
+
+ ext v11.16b, v14.16b, v14.16b, #4
+
+ qround b, v4, v0, v1, v2, v3
+ qround b, v0, v1, v2, v3, v4
+ qround b, v1, v2, v3, v4, v0
+ qround b, v2, v3, v4, v0, v1
+ qround b, v3, v4, v0, v1, v2
+ qround b, v4, v0, v1, v2, v3
+ qround b, v0, v1, v2, v3, v4
+ qround b, v1, v2, v3, v4, v0
+ qround b, v2, v3, v4, v0, v1
+ qround b, v3, v4
+ qround b, v4, v0
+ qround b, v0, v1
+
+ eor v8.16b, v8.16b, v15.16b
+ eor v9.16b, v9.16b, v16.16b
+
+ /* handled all input blocks? */
+ cbnz w2, 0b
+
+ /* save state */
+ rev64 v8.4s, v8.4s
+ rev64 v9.4s, v9.4s
+ ext v8.16b, v8.16b, v8.16b, #8
+ ext v9.16b, v9.16b, v9.16b, #8
+ st1 {v8.4s-v9.4s}, [x0]
+ ret
+SYM_FUNC_END(sm3_ce_transform)
+
+ .section ".rodata", "a"
+ .align 3
+.Lt: .word 0x79cc4519, 0x9d8a7a87
diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c
new file mode 100644
index 000000000000..ee98954ae8ca
--- /dev/null
+++ b/arch/arm64/crypto/sm3-ce-glue.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * sm3-ce-glue.c - SM3 secure hash using ARMv8.2 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/unaligned.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/sm3.h>
+#include <crypto/sm3_base.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("SM3 secure hash using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+asmlinkage void sm3_ce_transform(struct sm3_state *sst, u8 const *src,
+ int blocks);
+
+static int sm3_ce_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ if (!crypto_simd_usable()) {
+ sm3_update(shash_desc_ctx(desc), data, len);
+ return 0;
+ }
+
+ kernel_neon_begin();
+ sm3_base_do_update(desc, data, len, sm3_ce_transform);
+ kernel_neon_end();
+
+ return 0;
+}
+
+static int sm3_ce_final(struct shash_desc *desc, u8 *out)
+{
+ if (!crypto_simd_usable()) {
+ sm3_final(shash_desc_ctx(desc), out);
+ return 0;
+ }
+
+ kernel_neon_begin();
+ sm3_base_do_finalize(desc, sm3_ce_transform);
+ kernel_neon_end();
+
+ return sm3_base_finish(desc, out);
+}
+
+static int sm3_ce_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ if (!crypto_simd_usable()) {
+ struct sm3_state *sctx = shash_desc_ctx(desc);
+
+ if (len)
+ sm3_update(sctx, data, len);
+ sm3_final(sctx, out);
+ return 0;
+ }
+
+ kernel_neon_begin();
+ if (len)
+ sm3_base_do_update(desc, data, len, sm3_ce_transform);
+ sm3_base_do_finalize(desc, sm3_ce_transform);
+ kernel_neon_end();
+
+ return sm3_base_finish(desc, out);
+}
+
+static struct shash_alg sm3_alg = {
+ .digestsize = SM3_DIGEST_SIZE,
+ .init = sm3_base_init,
+ .update = sm3_ce_update,
+ .final = sm3_ce_final,
+ .finup = sm3_ce_finup,
+ .descsize = sizeof(struct sm3_state),
+ .base.cra_name = "sm3",
+ .base.cra_driver_name = "sm3-ce",
+ .base.cra_blocksize = SM3_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+ .base.cra_priority = 200,
+};
+
+static int __init sm3_ce_mod_init(void)
+{
+ return crypto_register_shash(&sm3_alg);
+}
+
+static void __exit sm3_ce_mod_fini(void)
+{
+ crypto_unregister_shash(&sm3_alg);
+}
+
+module_cpu_feature_match(SM3, sm3_ce_mod_init);
+module_exit(sm3_ce_mod_fini);
diff --git a/arch/arm64/crypto/sm4-ce-cipher-core.S b/arch/arm64/crypto/sm4-ce-cipher-core.S
new file mode 100644
index 000000000000..4ac6cfbc5797
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce-cipher-core.S
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8
+ .set .Lv\b\().4s, \b
+ .endr
+
+ .macro sm4e, rd, rn
+ .inst 0xcec08400 | .L\rd | (.L\rn << 5)
+ .endm
+
+ /*
+ * void sm4_ce_do_crypt(const u32 *rk, u32 *out, const u32 *in);
+ */
+ .text
+SYM_FUNC_START(sm4_ce_do_crypt)
+ ld1 {v8.4s}, [x2]
+ ld1 {v0.4s-v3.4s}, [x0], #64
+CPU_LE( rev32 v8.16b, v8.16b )
+ ld1 {v4.4s-v7.4s}, [x0]
+ sm4e v8.4s, v0.4s
+ sm4e v8.4s, v1.4s
+ sm4e v8.4s, v2.4s
+ sm4e v8.4s, v3.4s
+ sm4e v8.4s, v4.4s
+ sm4e v8.4s, v5.4s
+ sm4e v8.4s, v6.4s
+ sm4e v8.4s, v7.4s
+ rev64 v8.4s, v8.4s
+ ext v8.16b, v8.16b, v8.16b, #8
+CPU_LE( rev32 v8.16b, v8.16b )
+ st1 {v8.4s}, [x1]
+ ret
+SYM_FUNC_END(sm4_ce_do_crypt)
diff --git a/arch/arm64/crypto/sm4-ce-cipher-glue.c b/arch/arm64/crypto/sm4-ce-cipher-glue.c
new file mode 100644
index 000000000000..76a34ef4abbb
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce-cipher-glue.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/sm4.h>
+#include <crypto/internal/simd.h>
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/types.h>
+
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("sm4-ce");
+MODULE_DESCRIPTION("SM4 symmetric cipher using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+asmlinkage void sm4_ce_do_crypt(const u32 *rk, void *out, const void *in);
+
+static int sm4_ce_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ return sm4_expandkey(ctx, key, key_len);
+}
+
+static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ if (!crypto_simd_usable()) {
+ sm4_crypt_block(ctx->rkey_enc, out, in);
+ } else {
+ kernel_neon_begin();
+ sm4_ce_do_crypt(ctx->rkey_enc, out, in);
+ kernel_neon_end();
+ }
+}
+
+static void sm4_ce_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ if (!crypto_simd_usable()) {
+ sm4_crypt_block(ctx->rkey_dec, out, in);
+ } else {
+ kernel_neon_begin();
+ sm4_ce_do_crypt(ctx->rkey_dec, out, in);
+ kernel_neon_end();
+ }
+}
+
+static struct crypto_alg sm4_ce_alg = {
+ .cra_name = "sm4",
+ .cra_driver_name = "sm4-ce",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_u.cipher = {
+ .cia_min_keysize = SM4_KEY_SIZE,
+ .cia_max_keysize = SM4_KEY_SIZE,
+ .cia_setkey = sm4_ce_setkey,
+ .cia_encrypt = sm4_ce_encrypt,
+ .cia_decrypt = sm4_ce_decrypt
+ }
+};
+
+static int __init sm4_ce_mod_init(void)
+{
+ return crypto_register_alg(&sm4_ce_alg);
+}
+
+static void __exit sm4_ce_mod_fini(void)
+{
+ crypto_unregister_alg(&sm4_ce_alg);
+}
+
+module_cpu_feature_match(SM4, sm4_ce_mod_init);
+module_exit(sm4_ce_mod_fini);
diff --git a/arch/arm64/crypto/sm4-ce-core.S b/arch/arm64/crypto/sm4-ce-core.S
new file mode 100644
index 000000000000..934e0f093279
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce-core.S
@@ -0,0 +1,660 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2022, Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+.arch armv8-a+crypto
+
+.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 20, 24, 25, 26, 27, 28, 29, 30, 31
+ .set .Lv\b\().4s, \b
+.endr
+
+.macro sm4e, vd, vn
+ .inst 0xcec08400 | (.L\vn << 5) | .L\vd
+.endm
+
+.macro sm4ekey, vd, vn, vm
+ .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
+.endm
+
+/* Register macros */
+
+#define RTMP0 v16
+#define RTMP1 v17
+#define RTMP2 v18
+#define RTMP3 v19
+
+#define RIV v20
+
+/* Helper macros. */
+
+#define PREPARE \
+ ld1 {v24.16b-v27.16b}, [x0], #64; \
+ ld1 {v28.16b-v31.16b}, [x0];
+
+#define SM4_CRYPT_BLK(b0) \
+ rev32 b0.16b, b0.16b; \
+ sm4e b0.4s, v24.4s; \
+ sm4e b0.4s, v25.4s; \
+ sm4e b0.4s, v26.4s; \
+ sm4e b0.4s, v27.4s; \
+ sm4e b0.4s, v28.4s; \
+ sm4e b0.4s, v29.4s; \
+ sm4e b0.4s, v30.4s; \
+ sm4e b0.4s, v31.4s; \
+ rev64 b0.4s, b0.4s; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ rev32 b0.16b, b0.16b;
+
+#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ sm4e b0.4s, v24.4s; \
+ sm4e b1.4s, v24.4s; \
+ sm4e b2.4s, v24.4s; \
+ sm4e b3.4s, v24.4s; \
+ sm4e b0.4s, v25.4s; \
+ sm4e b1.4s, v25.4s; \
+ sm4e b2.4s, v25.4s; \
+ sm4e b3.4s, v25.4s; \
+ sm4e b0.4s, v26.4s; \
+ sm4e b1.4s, v26.4s; \
+ sm4e b2.4s, v26.4s; \
+ sm4e b3.4s, v26.4s; \
+ sm4e b0.4s, v27.4s; \
+ sm4e b1.4s, v27.4s; \
+ sm4e b2.4s, v27.4s; \
+ sm4e b3.4s, v27.4s; \
+ sm4e b0.4s, v28.4s; \
+ sm4e b1.4s, v28.4s; \
+ sm4e b2.4s, v28.4s; \
+ sm4e b3.4s, v28.4s; \
+ sm4e b0.4s, v29.4s; \
+ sm4e b1.4s, v29.4s; \
+ sm4e b2.4s, v29.4s; \
+ sm4e b3.4s, v29.4s; \
+ sm4e b0.4s, v30.4s; \
+ sm4e b1.4s, v30.4s; \
+ sm4e b2.4s, v30.4s; \
+ sm4e b3.4s, v30.4s; \
+ sm4e b0.4s, v31.4s; \
+ sm4e b1.4s, v31.4s; \
+ sm4e b2.4s, v31.4s; \
+ sm4e b3.4s, v31.4s; \
+ rev64 b0.4s, b0.4s; \
+ rev64 b1.4s, b1.4s; \
+ rev64 b2.4s, b2.4s; \
+ rev64 b3.4s, b3.4s; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ ext b1.16b, b1.16b, b1.16b, #8; \
+ ext b2.16b, b2.16b, b2.16b, #8; \
+ ext b3.16b, b3.16b, b3.16b, #8; \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b;
+
+#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ rev32 b4.16b, b4.16b; \
+ rev32 b5.16b, b5.16b; \
+ rev32 b6.16b, b6.16b; \
+ rev32 b7.16b, b7.16b; \
+ sm4e b0.4s, v24.4s; \
+ sm4e b1.4s, v24.4s; \
+ sm4e b2.4s, v24.4s; \
+ sm4e b3.4s, v24.4s; \
+ sm4e b4.4s, v24.4s; \
+ sm4e b5.4s, v24.4s; \
+ sm4e b6.4s, v24.4s; \
+ sm4e b7.4s, v24.4s; \
+ sm4e b0.4s, v25.4s; \
+ sm4e b1.4s, v25.4s; \
+ sm4e b2.4s, v25.4s; \
+ sm4e b3.4s, v25.4s; \
+ sm4e b4.4s, v25.4s; \
+ sm4e b5.4s, v25.4s; \
+ sm4e b6.4s, v25.4s; \
+ sm4e b7.4s, v25.4s; \
+ sm4e b0.4s, v26.4s; \
+ sm4e b1.4s, v26.4s; \
+ sm4e b2.4s, v26.4s; \
+ sm4e b3.4s, v26.4s; \
+ sm4e b4.4s, v26.4s; \
+ sm4e b5.4s, v26.4s; \
+ sm4e b6.4s, v26.4s; \
+ sm4e b7.4s, v26.4s; \
+ sm4e b0.4s, v27.4s; \
+ sm4e b1.4s, v27.4s; \
+ sm4e b2.4s, v27.4s; \
+ sm4e b3.4s, v27.4s; \
+ sm4e b4.4s, v27.4s; \
+ sm4e b5.4s, v27.4s; \
+ sm4e b6.4s, v27.4s; \
+ sm4e b7.4s, v27.4s; \
+ sm4e b0.4s, v28.4s; \
+ sm4e b1.4s, v28.4s; \
+ sm4e b2.4s, v28.4s; \
+ sm4e b3.4s, v28.4s; \
+ sm4e b4.4s, v28.4s; \
+ sm4e b5.4s, v28.4s; \
+ sm4e b6.4s, v28.4s; \
+ sm4e b7.4s, v28.4s; \
+ sm4e b0.4s, v29.4s; \
+ sm4e b1.4s, v29.4s; \
+ sm4e b2.4s, v29.4s; \
+ sm4e b3.4s, v29.4s; \
+ sm4e b4.4s, v29.4s; \
+ sm4e b5.4s, v29.4s; \
+ sm4e b6.4s, v29.4s; \
+ sm4e b7.4s, v29.4s; \
+ sm4e b0.4s, v30.4s; \
+ sm4e b1.4s, v30.4s; \
+ sm4e b2.4s, v30.4s; \
+ sm4e b3.4s, v30.4s; \
+ sm4e b4.4s, v30.4s; \
+ sm4e b5.4s, v30.4s; \
+ sm4e b6.4s, v30.4s; \
+ sm4e b7.4s, v30.4s; \
+ sm4e b0.4s, v31.4s; \
+ sm4e b1.4s, v31.4s; \
+ sm4e b2.4s, v31.4s; \
+ sm4e b3.4s, v31.4s; \
+ sm4e b4.4s, v31.4s; \
+ sm4e b5.4s, v31.4s; \
+ sm4e b6.4s, v31.4s; \
+ sm4e b7.4s, v31.4s; \
+ rev64 b0.4s, b0.4s; \
+ rev64 b1.4s, b1.4s; \
+ rev64 b2.4s, b2.4s; \
+ rev64 b3.4s, b3.4s; \
+ rev64 b4.4s, b4.4s; \
+ rev64 b5.4s, b5.4s; \
+ rev64 b6.4s, b6.4s; \
+ rev64 b7.4s, b7.4s; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ ext b1.16b, b1.16b, b1.16b, #8; \
+ ext b2.16b, b2.16b, b2.16b, #8; \
+ ext b3.16b, b3.16b, b3.16b, #8; \
+ ext b4.16b, b4.16b, b4.16b, #8; \
+ ext b5.16b, b5.16b, b5.16b, #8; \
+ ext b6.16b, b6.16b, b6.16b, #8; \
+ ext b7.16b, b7.16b, b7.16b, #8; \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ rev32 b4.16b, b4.16b; \
+ rev32 b5.16b, b5.16b; \
+ rev32 b6.16b, b6.16b; \
+ rev32 b7.16b, b7.16b;
+
+
+.align 3
+SYM_FUNC_START(sm4_ce_expand_key)
+ /* input:
+ * x0: 128-bit key
+ * x1: rkey_enc
+ * x2: rkey_dec
+ * x3: fk array
+ * x4: ck array
+ */
+ ld1 {v0.16b}, [x0];
+ rev32 v0.16b, v0.16b;
+ ld1 {v1.16b}, [x3];
+ /* load ck */
+ ld1 {v24.16b-v27.16b}, [x4], #64;
+ ld1 {v28.16b-v31.16b}, [x4];
+
+ /* input ^ fk */
+ eor v0.16b, v0.16b, v1.16b;
+
+ sm4ekey v0.4s, v0.4s, v24.4s;
+ sm4ekey v1.4s, v0.4s, v25.4s;
+ sm4ekey v2.4s, v1.4s, v26.4s;
+ sm4ekey v3.4s, v2.4s, v27.4s;
+ sm4ekey v4.4s, v3.4s, v28.4s;
+ sm4ekey v5.4s, v4.4s, v29.4s;
+ sm4ekey v6.4s, v5.4s, v30.4s;
+ sm4ekey v7.4s, v6.4s, v31.4s;
+
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ st1 {v4.16b-v7.16b}, [x1];
+ rev64 v7.4s, v7.4s;
+ rev64 v6.4s, v6.4s;
+ rev64 v5.4s, v5.4s;
+ rev64 v4.4s, v4.4s;
+ rev64 v3.4s, v3.4s;
+ rev64 v2.4s, v2.4s;
+ rev64 v1.4s, v1.4s;
+ rev64 v0.4s, v0.4s;
+ ext v7.16b, v7.16b, v7.16b, #8;
+ ext v6.16b, v6.16b, v6.16b, #8;
+ ext v5.16b, v5.16b, v5.16b, #8;
+ ext v4.16b, v4.16b, v4.16b, #8;
+ ext v3.16b, v3.16b, v3.16b, #8;
+ ext v2.16b, v2.16b, v2.16b, #8;
+ ext v1.16b, v1.16b, v1.16b, #8;
+ ext v0.16b, v0.16b, v0.16b, #8;
+ st1 {v7.16b}, [x2], #16;
+ st1 {v6.16b}, [x2], #16;
+ st1 {v5.16b}, [x2], #16;
+ st1 {v4.16b}, [x2], #16;
+ st1 {v3.16b}, [x2], #16;
+ st1 {v2.16b}, [x2], #16;
+ st1 {v1.16b}, [x2], #16;
+ st1 {v0.16b}, [x2];
+
+ ret;
+SYM_FUNC_END(sm4_ce_expand_key)
+
+.align 3
+SYM_FUNC_START(sm4_ce_crypt_block)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ */
+ PREPARE;
+
+ ld1 {v0.16b}, [x2];
+ SM4_CRYPT_BLK(v0);
+ st1 {v0.16b}, [x1];
+
+ ret;
+SYM_FUNC_END(sm4_ce_crypt_block)
+
+.align 3
+SYM_FUNC_START(sm4_ce_crypt)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * w3: nblocks
+ */
+ PREPARE;
+
+.Lcrypt_loop_blk:
+ sub w3, w3, #8;
+ tbnz w3, #31, .Lcrypt_tail8;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ ld1 {v4.16b-v7.16b}, [x2], #64;
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ cbz w3, .Lcrypt_end;
+ b .Lcrypt_loop_blk;
+
+.Lcrypt_tail8:
+ add w3, w3, #8;
+ cmp w3, #4;
+ blt .Lcrypt_tail4;
+
+ sub w3, w3, #4;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ SM4_CRYPT_BLK4(v0, v1, v2, v3);
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ cbz w3, .Lcrypt_end;
+
+.Lcrypt_tail4:
+ sub w3, w3, #1;
+
+ ld1 {v0.16b}, [x2], #16;
+ SM4_CRYPT_BLK(v0);
+ st1 {v0.16b}, [x1], #16;
+
+ cbnz w3, .Lcrypt_tail4;
+
+.Lcrypt_end:
+ ret;
+SYM_FUNC_END(sm4_ce_crypt)
+
+.align 3
+SYM_FUNC_START(sm4_ce_cbc_enc)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks
+ */
+ PREPARE;
+
+ ld1 {RIV.16b}, [x3];
+
+.Lcbc_enc_loop:
+ sub w4, w4, #1;
+
+ ld1 {RTMP0.16b}, [x2], #16;
+ eor RIV.16b, RIV.16b, RTMP0.16b;
+
+ SM4_CRYPT_BLK(RIV);
+
+ st1 {RIV.16b}, [x1], #16;
+
+ cbnz w4, .Lcbc_enc_loop;
+
+ /* store new IV */
+ st1 {RIV.16b}, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_ce_cbc_enc)
+
+.align 3
+SYM_FUNC_START(sm4_ce_cbc_dec)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks
+ */
+ PREPARE;
+
+ ld1 {RIV.16b}, [x3];
+
+.Lcbc_loop_blk:
+ sub w4, w4, #8;
+ tbnz w4, #31, .Lcbc_tail8;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ ld1 {v4.16b-v7.16b}, [x2];
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ sub x2, x2, #64;
+ eor v0.16b, v0.16b, RIV.16b;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v1.16b, v1.16b, RTMP0.16b;
+ eor v2.16b, v2.16b, RTMP1.16b;
+ eor v3.16b, v3.16b, RTMP2.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ eor v4.16b, v4.16b, RTMP3.16b;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v5.16b, v5.16b, RTMP0.16b;
+ eor v6.16b, v6.16b, RTMP1.16b;
+ eor v7.16b, v7.16b, RTMP2.16b;
+
+ mov RIV.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ cbz w4, .Lcbc_end;
+ b .Lcbc_loop_blk;
+
+.Lcbc_tail8:
+ add w4, w4, #8;
+ cmp w4, #4;
+ blt .Lcbc_tail4;
+
+ sub w4, w4, #4;
+
+ ld1 {v0.16b-v3.16b}, [x2];
+
+ SM4_CRYPT_BLK4(v0, v1, v2, v3);
+
+ eor v0.16b, v0.16b, RIV.16b;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v1.16b, v1.16b, RTMP0.16b;
+ eor v2.16b, v2.16b, RTMP1.16b;
+ eor v3.16b, v3.16b, RTMP2.16b;
+
+ mov RIV.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ cbz w4, .Lcbc_end;
+
+.Lcbc_tail4:
+ sub w4, w4, #1;
+
+ ld1 {v0.16b}, [x2];
+
+ SM4_CRYPT_BLK(v0);
+
+ eor v0.16b, v0.16b, RIV.16b;
+ ld1 {RIV.16b}, [x2], #16;
+ st1 {v0.16b}, [x1], #16;
+
+ cbnz w4, .Lcbc_tail4;
+
+.Lcbc_end:
+ /* store new IV */
+ st1 {RIV.16b}, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_ce_cbc_dec)
+
+.align 3
+SYM_FUNC_START(sm4_ce_cfb_enc)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks
+ */
+ PREPARE;
+
+ ld1 {RIV.16b}, [x3];
+
+.Lcfb_enc_loop:
+ sub w4, w4, #1;
+
+ SM4_CRYPT_BLK(RIV);
+
+ ld1 {RTMP0.16b}, [x2], #16;
+ eor RIV.16b, RIV.16b, RTMP0.16b;
+ st1 {RIV.16b}, [x1], #16;
+
+ cbnz w4, .Lcfb_enc_loop;
+
+ /* store new IV */
+ st1 {RIV.16b}, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_ce_cfb_enc)
+
+.align 3
+SYM_FUNC_START(sm4_ce_cfb_dec)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks
+ */
+ PREPARE;
+
+ ld1 {v0.16b}, [x3];
+
+.Lcfb_loop_blk:
+ sub w4, w4, #8;
+ tbnz w4, #31, .Lcfb_tail8;
+
+ ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48;
+ ld1 {v4.16b-v7.16b}, [x2];
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ sub x2, x2, #48;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v4.16b, v4.16b, RTMP0.16b;
+ eor v5.16b, v5.16b, RTMP1.16b;
+ eor v6.16b, v6.16b, RTMP2.16b;
+ eor v7.16b, v7.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ mov v0.16b, RTMP3.16b;
+
+ cbz w4, .Lcfb_end;
+ b .Lcfb_loop_blk;
+
+.Lcfb_tail8:
+ add w4, w4, #8;
+ cmp w4, #4;
+ blt .Lcfb_tail4;
+
+ sub w4, w4, #4;
+
+ ld1 {v1.16b, v2.16b, v3.16b}, [x2];
+
+ SM4_CRYPT_BLK4(v0, v1, v2, v3);
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ mov v0.16b, RTMP3.16b;
+
+ cbz w4, .Lcfb_end;
+
+.Lcfb_tail4:
+ sub w4, w4, #1;
+
+ SM4_CRYPT_BLK(v0);
+
+ ld1 {RTMP0.16b}, [x2], #16;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ st1 {v0.16b}, [x1], #16;
+
+ mov v0.16b, RTMP0.16b;
+
+ cbnz w4, .Lcfb_tail4;
+
+.Lcfb_end:
+ /* store new IV */
+ st1 {v0.16b}, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_ce_cfb_dec)
+
+.align 3
+SYM_FUNC_START(sm4_ce_ctr_enc)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: ctr (big endian, 128 bit)
+ * w4: nblocks
+ */
+ PREPARE;
+
+ ldp x7, x8, [x3];
+ rev x7, x7;
+ rev x8, x8;
+
+.Lctr_loop_blk:
+ sub w4, w4, #8;
+ tbnz w4, #31, .Lctr_tail8;
+
+#define inc_le128(vctr) \
+ mov vctr.d[1], x8; \
+ mov vctr.d[0], x7; \
+ adds x8, x8, #1; \
+ adc x7, x7, xzr; \
+ rev64 vctr.16b, vctr.16b;
+
+ /* construct CTRs */
+ inc_le128(v0); /* +0 */
+ inc_le128(v1); /* +1 */
+ inc_le128(v2); /* +2 */
+ inc_le128(v3); /* +3 */
+ inc_le128(v4); /* +4 */
+ inc_le128(v5); /* +5 */
+ inc_le128(v6); /* +6 */
+ inc_le128(v7); /* +7 */
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v4.16b, v4.16b, RTMP0.16b;
+ eor v5.16b, v5.16b, RTMP1.16b;
+ eor v6.16b, v6.16b, RTMP2.16b;
+ eor v7.16b, v7.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ cbz w4, .Lctr_end;
+ b .Lctr_loop_blk;
+
+.Lctr_tail8:
+ add w4, w4, #8;
+ cmp w4, #4;
+ blt .Lctr_tail4;
+
+ sub w4, w4, #4;
+
+ /* construct CTRs */
+ inc_le128(v0); /* +0 */
+ inc_le128(v1); /* +1 */
+ inc_le128(v2); /* +2 */
+ inc_le128(v3); /* +3 */
+
+ SM4_CRYPT_BLK4(v0, v1, v2, v3);
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ cbz w4, .Lctr_end;
+
+.Lctr_tail4:
+ sub w4, w4, #1;
+
+ /* construct CTRs */
+ inc_le128(v0);
+
+ SM4_CRYPT_BLK(v0);
+
+ ld1 {RTMP0.16b}, [x2], #16;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ st1 {v0.16b}, [x1], #16;
+
+ cbnz w4, .Lctr_tail4;
+
+.Lctr_end:
+ /* store new CTR */
+ rev x7, x7;
+ rev x8, x8;
+ stp x7, x8, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_ce_ctr_enc)
diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c
new file mode 100644
index 000000000000..496d55c0d01a
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce-glue.c
@@ -0,0 +1,372 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, using ARMv8 Crypto Extensions
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2022, Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/sm4.h>
+
+#define BYTES2BLKS(nbytes) ((nbytes) >> 4)
+
+asmlinkage void sm4_ce_expand_key(const u8 *key, u32 *rkey_enc, u32 *rkey_dec,
+ const u32 *fk, const u32 *ck);
+asmlinkage void sm4_ce_crypt_block(const u32 *rkey, u8 *dst, const u8 *src);
+asmlinkage void sm4_ce_crypt(const u32 *rkey, u8 *dst, const u8 *src,
+ unsigned int nblks);
+asmlinkage void sm4_ce_cbc_enc(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+asmlinkage void sm4_ce_cbc_dec(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+asmlinkage void sm4_ce_cfb_enc(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+asmlinkage void sm4_ce_cfb_dec(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+asmlinkage void sm4_ce_ctr_enc(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+
+static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ if (key_len != SM4_KEY_SIZE)
+ return -EINVAL;
+
+ sm4_ce_expand_key(key, ctx->rkey_enc, ctx->rkey_dec,
+ crypto_sm4_fk, crypto_sm4_ck);
+ return 0;
+}
+
+static int sm4_ecb_do_crypt(struct skcipher_request *req, const u32 *rkey)
+{
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_crypt(rkey, dst, src, nblks);
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_ecb_do_crypt(req, ctx->rkey_enc);
+}
+
+static int sm4_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_ecb_do_crypt(req, ctx->rkey_dec);
+}
+
+static int sm4_cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_cbc_enc(ctx->rkey_enc, dst, src, walk.iv, nblks);
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_cbc_dec(ctx->rkey_dec, dst, src, walk.iv, nblks);
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_cfb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_cfb_enc(ctx->rkey_enc, dst, src, walk.iv, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_cfb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_cfb_dec(ctx->rkey_enc, dst, src, walk.iv, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_ctr_crypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_ctr_enc(ctx->rkey_enc, dst, src, walk.iv, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_inc(walk.iv, SM4_BLOCK_SIZE);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static struct skcipher_alg sm4_algs[] = {
+ {
+ .base = {
+ .cra_name = "ecb(sm4)",
+ .cra_driver_name = "ecb-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_ecb_encrypt,
+ .decrypt = sm4_ecb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "cbc(sm4)",
+ .cra_driver_name = "cbc-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_cbc_encrypt,
+ .decrypt = sm4_cbc_decrypt,
+ }, {
+ .base = {
+ .cra_name = "cfb(sm4)",
+ .cra_driver_name = "cfb-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_cfb_encrypt,
+ .decrypt = sm4_cfb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "ctr(sm4)",
+ .cra_driver_name = "ctr-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_ctr_crypt,
+ .decrypt = sm4_ctr_crypt,
+ }
+};
+
+static int __init sm4_init(void)
+{
+ return crypto_register_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
+}
+
+static void __exit sm4_exit(void)
+{
+ crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
+}
+
+module_cpu_feature_match(SM4, sm4_init);
+module_exit(sm4_exit);
+
+MODULE_DESCRIPTION("SM4 ECB/CBC/CFB/CTR using ARMv8 Crypto Extensions");
+MODULE_ALIAS_CRYPTO("sm4-ce");
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("ecb(sm4)");
+MODULE_ALIAS_CRYPTO("cbc(sm4)");
+MODULE_ALIAS_CRYPTO("cfb(sm4)");
+MODULE_ALIAS_CRYPTO("ctr(sm4)");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/crypto/sm4-neon-core.S b/arch/arm64/crypto/sm4-neon-core.S
new file mode 100644
index 000000000000..3d5256b354d2
--- /dev/null
+++ b/arch/arm64/crypto/sm4-neon-core.S
@@ -0,0 +1,487 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm for ARMv8 NEON
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2022, Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/* Register macros */
+
+#define RTMP0 v8
+#define RTMP1 v9
+#define RTMP2 v10
+#define RTMP3 v11
+
+#define RX0 v12
+#define RX1 v13
+#define RKEY v14
+#define RIV v15
+
+/* Helper macros. */
+
+#define PREPARE \
+ adr_l x5, crypto_sm4_sbox; \
+ ld1 {v16.16b-v19.16b}, [x5], #64; \
+ ld1 {v20.16b-v23.16b}, [x5], #64; \
+ ld1 {v24.16b-v27.16b}, [x5], #64; \
+ ld1 {v28.16b-v31.16b}, [x5];
+
+#define transpose_4x4(s0, s1, s2, s3) \
+ zip1 RTMP0.4s, s0.4s, s1.4s; \
+ zip1 RTMP1.4s, s2.4s, s3.4s; \
+ zip2 RTMP2.4s, s0.4s, s1.4s; \
+ zip2 RTMP3.4s, s2.4s, s3.4s; \
+ zip1 s0.2d, RTMP0.2d, RTMP1.2d; \
+ zip2 s1.2d, RTMP0.2d, RTMP1.2d; \
+ zip1 s2.2d, RTMP2.2d, RTMP3.2d; \
+ zip2 s3.2d, RTMP2.2d, RTMP3.2d;
+
+#define rotate_clockwise_90(s0, s1, s2, s3) \
+ zip1 RTMP0.4s, s1.4s, s0.4s; \
+ zip2 RTMP1.4s, s1.4s, s0.4s; \
+ zip1 RTMP2.4s, s3.4s, s2.4s; \
+ zip2 RTMP3.4s, s3.4s, s2.4s; \
+ zip1 s0.2d, RTMP2.2d, RTMP0.2d; \
+ zip2 s1.2d, RTMP2.2d, RTMP0.2d; \
+ zip1 s2.2d, RTMP3.2d, RTMP1.2d; \
+ zip2 s3.2d, RTMP3.2d, RTMP1.2d;
+
+#define ROUND4(round, s0, s1, s2, s3) \
+ dup RX0.4s, RKEY.s[round]; \
+ /* rk ^ s1 ^ s2 ^ s3 */ \
+ eor RTMP1.16b, s2.16b, s3.16b; \
+ eor RX0.16b, RX0.16b, s1.16b; \
+ eor RX0.16b, RX0.16b, RTMP1.16b; \
+ \
+ /* sbox, non-linear part */ \
+ movi RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \
+ tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
+ \
+ /* linear part */ \
+ shl RTMP1.4s, RTMP0.4s, #8; \
+ shl RTMP2.4s, RTMP0.4s, #16; \
+ shl RTMP3.4s, RTMP0.4s, #24; \
+ sri RTMP1.4s, RTMP0.4s, #(32-8); \
+ sri RTMP2.4s, RTMP0.4s, #(32-16); \
+ sri RTMP3.4s, RTMP0.4s, #(32-24); \
+ /* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */ \
+ eor RTMP1.16b, RTMP1.16b, RTMP0.16b; \
+ eor RTMP1.16b, RTMP1.16b, RTMP2.16b; \
+ /* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */ \
+ eor RTMP3.16b, RTMP3.16b, RTMP0.16b; \
+ shl RTMP2.4s, RTMP1.4s, 2; \
+ sri RTMP2.4s, RTMP1.4s, #(32-2); \
+ eor RTMP3.16b, RTMP3.16b, RTMP2.16b; \
+ /* s0 ^= RTMP3 */ \
+ eor s0.16b, s0.16b, RTMP3.16b;
+
+#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ \
+ transpose_4x4(b0, b1, b2, b3); \
+ \
+ mov x6, 8; \
+4: \
+ ld1 {RKEY.4s}, [x0], #16; \
+ subs x6, x6, #1; \
+ \
+ ROUND4(0, b0, b1, b2, b3); \
+ ROUND4(1, b1, b2, b3, b0); \
+ ROUND4(2, b2, b3, b0, b1); \
+ ROUND4(3, b3, b0, b1, b2); \
+ \
+ bne 4b; \
+ \
+ rotate_clockwise_90(b0, b1, b2, b3); \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ \
+ /* repoint to rkey */ \
+ sub x0, x0, #128;
+
+#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3) \
+ /* rk ^ s1 ^ s2 ^ s3 */ \
+ dup RX0.4s, RKEY.s[round]; \
+ eor RTMP0.16b, s2.16b, s3.16b; \
+ mov RX1.16b, RX0.16b; \
+ eor RTMP1.16b, t2.16b, t3.16b; \
+ eor RX0.16b, RX0.16b, s1.16b; \
+ eor RX1.16b, RX1.16b, t1.16b; \
+ eor RX0.16b, RX0.16b, RTMP0.16b; \
+ eor RX1.16b, RX1.16b, RTMP1.16b; \
+ \
+ /* sbox, non-linear part */ \
+ movi RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \
+ tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
+ tbl RTMP1.16b, {v16.16b-v19.16b}, RX1.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ sub RX1.16b, RX1.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
+ tbx RTMP1.16b, {v20.16b-v23.16b}, RX1.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ sub RX1.16b, RX1.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
+ tbx RTMP1.16b, {v24.16b-v27.16b}, RX1.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ sub RX1.16b, RX1.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
+ tbx RTMP1.16b, {v28.16b-v31.16b}, RX1.16b; \
+ \
+ /* linear part */ \
+ shl RX0.4s, RTMP0.4s, #8; \
+ shl RX1.4s, RTMP1.4s, #8; \
+ shl RTMP2.4s, RTMP0.4s, #16; \
+ shl RTMP3.4s, RTMP1.4s, #16; \
+ sri RX0.4s, RTMP0.4s, #(32 - 8); \
+ sri RX1.4s, RTMP1.4s, #(32 - 8); \
+ sri RTMP2.4s, RTMP0.4s, #(32 - 16); \
+ sri RTMP3.4s, RTMP1.4s, #(32 - 16); \
+ /* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */ \
+ eor RX0.16b, RX0.16b, RTMP0.16b; \
+ eor RX1.16b, RX1.16b, RTMP1.16b; \
+ eor RX0.16b, RX0.16b, RTMP2.16b; \
+ eor RX1.16b, RX1.16b, RTMP3.16b; \
+ /* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */ \
+ shl RTMP2.4s, RTMP0.4s, #24; \
+ shl RTMP3.4s, RTMP1.4s, #24; \
+ sri RTMP2.4s, RTMP0.4s, #(32 - 24); \
+ sri RTMP3.4s, RTMP1.4s, #(32 - 24); \
+ eor RTMP0.16b, RTMP0.16b, RTMP2.16b; \
+ eor RTMP1.16b, RTMP1.16b, RTMP3.16b; \
+ shl RTMP2.4s, RX0.4s, #2; \
+ shl RTMP3.4s, RX1.4s, #2; \
+ sri RTMP2.4s, RX0.4s, #(32 - 2); \
+ sri RTMP3.4s, RX1.4s, #(32 - 2); \
+ eor RTMP0.16b, RTMP0.16b, RTMP2.16b; \
+ eor RTMP1.16b, RTMP1.16b, RTMP3.16b; \
+ /* s0/t0 ^= RTMP0/1 */ \
+ eor s0.16b, s0.16b, RTMP0.16b; \
+ eor t0.16b, t0.16b, RTMP1.16b;
+
+#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ rev32 b4.16b, b4.16b; \
+ rev32 b5.16b, b5.16b; \
+ rev32 b6.16b, b6.16b; \
+ rev32 b7.16b, b7.16b; \
+ \
+ transpose_4x4(b0, b1, b2, b3); \
+ transpose_4x4(b4, b5, b6, b7); \
+ \
+ mov x6, 8; \
+8: \
+ ld1 {RKEY.4s}, [x0], #16; \
+ subs x6, x6, #1; \
+ \
+ ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7); \
+ ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4); \
+ ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5); \
+ ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6); \
+ \
+ bne 8b; \
+ \
+ rotate_clockwise_90(b0, b1, b2, b3); \
+ rotate_clockwise_90(b4, b5, b6, b7); \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ rev32 b4.16b, b4.16b; \
+ rev32 b5.16b, b5.16b; \
+ rev32 b6.16b, b6.16b; \
+ rev32 b7.16b, b7.16b; \
+ \
+ /* repoint to rkey */ \
+ sub x0, x0, #128;
+
+
+.align 3
+SYM_FUNC_START_LOCAL(__sm4_neon_crypt_blk1_4)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * w3: num blocks (1..4)
+ */
+ PREPARE;
+
+ ld1 {v0.16b}, [x2], #16;
+ mov v1.16b, v0.16b;
+ mov v2.16b, v0.16b;
+ mov v3.16b, v0.16b;
+ cmp w3, #2;
+ blt .Lblk4_load_input_done;
+ ld1 {v1.16b}, [x2], #16;
+ beq .Lblk4_load_input_done;
+ ld1 {v2.16b}, [x2], #16;
+ cmp w3, #3;
+ beq .Lblk4_load_input_done;
+ ld1 {v3.16b}, [x2];
+
+.Lblk4_load_input_done:
+ SM4_CRYPT_BLK4(v0, v1, v2, v3);
+
+ st1 {v0.16b}, [x1], #16;
+ cmp w3, #2;
+ blt .Lblk4_store_output_done;
+ st1 {v1.16b}, [x1], #16;
+ beq .Lblk4_store_output_done;
+ st1 {v2.16b}, [x1], #16;
+ cmp w3, #3;
+ beq .Lblk4_store_output_done;
+ st1 {v3.16b}, [x1];
+
+.Lblk4_store_output_done:
+ ret;
+SYM_FUNC_END(__sm4_neon_crypt_blk1_4)
+
+.align 3
+SYM_FUNC_START(sm4_neon_crypt_blk1_8)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * w3: num blocks (1..8)
+ */
+ cmp w3, #5;
+ blt __sm4_neon_crypt_blk1_4;
+
+ PREPARE;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ ld1 {v4.16b}, [x2], #16;
+ mov v5.16b, v4.16b;
+ mov v6.16b, v4.16b;
+ mov v7.16b, v4.16b;
+ beq .Lblk8_load_input_done;
+ ld1 {v5.16b}, [x2], #16;
+ cmp w3, #7;
+ blt .Lblk8_load_input_done;
+ ld1 {v6.16b}, [x2], #16;
+ beq .Lblk8_load_input_done;
+ ld1 {v7.16b}, [x2];
+
+.Lblk8_load_input_done:
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ cmp w3, #6;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ st1 {v4.16b}, [x1], #16;
+ blt .Lblk8_store_output_done;
+ st1 {v5.16b}, [x1], #16;
+ beq .Lblk8_store_output_done;
+ st1 {v6.16b}, [x1], #16;
+ cmp w3, #7;
+ beq .Lblk8_store_output_done;
+ st1 {v7.16b}, [x1];
+
+.Lblk8_store_output_done:
+ ret;
+SYM_FUNC_END(sm4_neon_crypt_blk1_8)
+
+.align 3
+SYM_FUNC_START(sm4_neon_crypt_blk8)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * w3: nblocks (multiples of 8)
+ */
+ PREPARE;
+
+.Lcrypt_loop_blk:
+ subs w3, w3, #8;
+ bmi .Lcrypt_end;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ ld1 {v4.16b-v7.16b}, [x2], #64;
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ b .Lcrypt_loop_blk;
+
+.Lcrypt_end:
+ ret;
+SYM_FUNC_END(sm4_neon_crypt_blk8)
+
+.align 3
+SYM_FUNC_START(sm4_neon_cbc_dec_blk8)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks (multiples of 8)
+ */
+ PREPARE;
+
+ ld1 {RIV.16b}, [x3];
+
+.Lcbc_loop_blk:
+ subs w4, w4, #8;
+ bmi .Lcbc_end;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ ld1 {v4.16b-v7.16b}, [x2];
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ sub x2, x2, #64;
+ eor v0.16b, v0.16b, RIV.16b;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v1.16b, v1.16b, RTMP0.16b;
+ eor v2.16b, v2.16b, RTMP1.16b;
+ eor v3.16b, v3.16b, RTMP2.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ eor v4.16b, v4.16b, RTMP3.16b;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v5.16b, v5.16b, RTMP0.16b;
+ eor v6.16b, v6.16b, RTMP1.16b;
+ eor v7.16b, v7.16b, RTMP2.16b;
+
+ mov RIV.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ b .Lcbc_loop_blk;
+
+.Lcbc_end:
+ /* store new IV */
+ st1 {RIV.16b}, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_neon_cbc_dec_blk8)
+
+.align 3
+SYM_FUNC_START(sm4_neon_cfb_dec_blk8)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks (multiples of 8)
+ */
+ PREPARE;
+
+ ld1 {v0.16b}, [x3];
+
+.Lcfb_loop_blk:
+ subs w4, w4, #8;
+ bmi .Lcfb_end;
+
+ ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48;
+ ld1 {v4.16b-v7.16b}, [x2];
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ sub x2, x2, #48;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v4.16b, v4.16b, RTMP0.16b;
+ eor v5.16b, v5.16b, RTMP1.16b;
+ eor v6.16b, v6.16b, RTMP2.16b;
+ eor v7.16b, v7.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ mov v0.16b, RTMP3.16b;
+
+ b .Lcfb_loop_blk;
+
+.Lcfb_end:
+ /* store new IV */
+ st1 {v0.16b}, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_neon_cfb_dec_blk8)
+
+.align 3
+SYM_FUNC_START(sm4_neon_ctr_enc_blk8)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: ctr (big endian, 128 bit)
+ * w4: nblocks (multiples of 8)
+ */
+ PREPARE;
+
+ ldp x7, x8, [x3];
+ rev x7, x7;
+ rev x8, x8;
+
+.Lctr_loop_blk:
+ subs w4, w4, #8;
+ bmi .Lctr_end;
+
+#define inc_le128(vctr) \
+ mov vctr.d[1], x8; \
+ mov vctr.d[0], x7; \
+ adds x8, x8, #1; \
+ adc x7, x7, xzr; \
+ rev64 vctr.16b, vctr.16b;
+
+ /* construct CTRs */
+ inc_le128(v0); /* +0 */
+ inc_le128(v1); /* +1 */
+ inc_le128(v2); /* +2 */
+ inc_le128(v3); /* +3 */
+ inc_le128(v4); /* +4 */
+ inc_le128(v5); /* +5 */
+ inc_le128(v6); /* +6 */
+ inc_le128(v7); /* +7 */
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v4.16b, v4.16b, RTMP0.16b;
+ eor v5.16b, v5.16b, RTMP1.16b;
+ eor v6.16b, v6.16b, RTMP2.16b;
+ eor v7.16b, v7.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ b .Lctr_loop_blk;
+
+.Lctr_end:
+ /* store new CTR */
+ rev x7, x7;
+ rev x8, x8;
+ stp x7, x8, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_neon_ctr_enc_blk8)
diff --git a/arch/arm64/crypto/sm4-neon-glue.c b/arch/arm64/crypto/sm4-neon-glue.c
new file mode 100644
index 000000000000..03a6a6866a31
--- /dev/null
+++ b/arch/arm64/crypto/sm4-neon-glue.c
@@ -0,0 +1,442 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, using ARMv8 NEON
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2022, Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/sm4.h>
+
+#define BYTES2BLKS(nbytes) ((nbytes) >> 4)
+#define BYTES2BLK8(nbytes) (((nbytes) >> 4) & ~(8 - 1))
+
+asmlinkage void sm4_neon_crypt_blk1_8(const u32 *rkey, u8 *dst, const u8 *src,
+ unsigned int nblks);
+asmlinkage void sm4_neon_crypt_blk8(const u32 *rkey, u8 *dst, const u8 *src,
+ unsigned int nblks);
+asmlinkage void sm4_neon_cbc_dec_blk8(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+asmlinkage void sm4_neon_cfb_dec_blk8(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+asmlinkage void sm4_neon_ctr_enc_blk8(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+
+static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_expandkey(ctx, key, key_len);
+}
+
+static int sm4_ecb_do_crypt(struct skcipher_request *req, const u32 *rkey)
+{
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLK8(nbytes);
+ if (nblks) {
+ sm4_neon_crypt_blk8(rkey, dst, src, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_neon_crypt_blk1_8(rkey, dst, src, nblks);
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_ecb_do_crypt(req, ctx->rkey_enc);
+}
+
+static int sm4_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_ecb_do_crypt(req, ctx->rkey_dec);
+}
+
+static int sm4_cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *iv = walk.iv;
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ crypto_xor_cpy(dst, src, iv, SM4_BLOCK_SIZE);
+ sm4_crypt_block(ctx->rkey_enc, dst, dst);
+ iv = dst;
+ src += SM4_BLOCK_SIZE;
+ dst += SM4_BLOCK_SIZE;
+ nbytes -= SM4_BLOCK_SIZE;
+ }
+ if (iv != walk.iv)
+ memcpy(walk.iv, iv, SM4_BLOCK_SIZE);
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLK8(nbytes);
+ if (nblks) {
+ sm4_neon_cbc_dec_blk8(ctx->rkey_dec, dst, src,
+ walk.iv, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ u8 keystream[SM4_BLOCK_SIZE * 8];
+ u8 iv[SM4_BLOCK_SIZE];
+ int i;
+
+ sm4_neon_crypt_blk1_8(ctx->rkey_dec, keystream,
+ src, nblks);
+
+ src += ((int)nblks - 2) * SM4_BLOCK_SIZE;
+ dst += (nblks - 1) * SM4_BLOCK_SIZE;
+ memcpy(iv, src + SM4_BLOCK_SIZE, SM4_BLOCK_SIZE);
+
+ for (i = nblks - 1; i > 0; i--) {
+ crypto_xor_cpy(dst, src,
+ &keystream[i * SM4_BLOCK_SIZE],
+ SM4_BLOCK_SIZE);
+ src -= SM4_BLOCK_SIZE;
+ dst -= SM4_BLOCK_SIZE;
+ }
+ crypto_xor_cpy(dst, walk.iv,
+ keystream, SM4_BLOCK_SIZE);
+ memcpy(walk.iv, iv, SM4_BLOCK_SIZE);
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_cfb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+ const u8 *iv = walk.iv;
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ sm4_crypt_block(ctx->rkey_enc, keystream, iv);
+ crypto_xor_cpy(dst, src, keystream, SM4_BLOCK_SIZE);
+ iv = dst;
+ src += SM4_BLOCK_SIZE;
+ dst += SM4_BLOCK_SIZE;
+ nbytes -= SM4_BLOCK_SIZE;
+ }
+ if (iv != walk.iv)
+ memcpy(walk.iv, iv, SM4_BLOCK_SIZE);
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_cfb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLK8(nbytes);
+ if (nblks) {
+ sm4_neon_cfb_dec_blk8(ctx->rkey_enc, dst, src,
+ walk.iv, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ u8 keystream[SM4_BLOCK_SIZE * 8];
+
+ memcpy(keystream, walk.iv, SM4_BLOCK_SIZE);
+ if (nblks > 1)
+ memcpy(&keystream[SM4_BLOCK_SIZE], src,
+ (nblks - 1) * SM4_BLOCK_SIZE);
+ memcpy(walk.iv, src + (nblks - 1) * SM4_BLOCK_SIZE,
+ SM4_BLOCK_SIZE);
+
+ sm4_neon_crypt_blk1_8(ctx->rkey_enc, keystream,
+ keystream, nblks);
+
+ crypto_xor_cpy(dst, src, keystream,
+ nblks * SM4_BLOCK_SIZE);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_ctr_crypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLK8(nbytes);
+ if (nblks) {
+ sm4_neon_ctr_enc_blk8(ctx->rkey_enc, dst, src,
+ walk.iv, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ u8 keystream[SM4_BLOCK_SIZE * 8];
+ int i;
+
+ for (i = 0; i < nblks; i++) {
+ memcpy(&keystream[i * SM4_BLOCK_SIZE],
+ walk.iv, SM4_BLOCK_SIZE);
+ crypto_inc(walk.iv, SM4_BLOCK_SIZE);
+ }
+ sm4_neon_crypt_blk1_8(ctx->rkey_enc, keystream,
+ keystream, nblks);
+
+ crypto_xor_cpy(dst, src, keystream,
+ nblks * SM4_BLOCK_SIZE);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_inc(walk.iv, SM4_BLOCK_SIZE);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static struct skcipher_alg sm4_algs[] = {
+ {
+ .base = {
+ .cra_name = "ecb(sm4)",
+ .cra_driver_name = "ecb-sm4-neon",
+ .cra_priority = 200,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_ecb_encrypt,
+ .decrypt = sm4_ecb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "cbc(sm4)",
+ .cra_driver_name = "cbc-sm4-neon",
+ .cra_priority = 200,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_cbc_encrypt,
+ .decrypt = sm4_cbc_decrypt,
+ }, {
+ .base = {
+ .cra_name = "cfb(sm4)",
+ .cra_driver_name = "cfb-sm4-neon",
+ .cra_priority = 200,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_cfb_encrypt,
+ .decrypt = sm4_cfb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "ctr(sm4)",
+ .cra_driver_name = "ctr-sm4-neon",
+ .cra_priority = 200,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_ctr_crypt,
+ .decrypt = sm4_ctr_crypt,
+ }
+};
+
+static int __init sm4_init(void)
+{
+ return crypto_register_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
+}
+
+static void __exit sm4_exit(void)
+{
+ crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
+}
+
+module_init(sm4_init);
+module_exit(sm4_exit);
+
+MODULE_DESCRIPTION("SM4 ECB/CBC/CFB/CTR using ARMv8 NEON");
+MODULE_ALIAS_CRYPTO("sm4-neon");
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("ecb(sm4)");
+MODULE_ALIAS_CRYPTO("cbc(sm4)");
+MODULE_ALIAS_CRYPTO("cfb(sm4)");
+MODULE_ALIAS_CRYPTO("ctr(sm4)");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_LICENSE("GPL v2");