aboutsummaryrefslogtreecommitdiffstats
path: root/arch/arm/crypto
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--arch/arm/crypto/.gitignore2
-rw-r--r--arch/arm/crypto/Kconfig231
-rw-r--r--arch/arm/crypto/Makefile14
-rw-r--r--arch/arm/crypto/aes-ce-core.S32
-rw-r--r--arch/arm/crypto/aes-cipher-core.S42
-rw-r--r--arch/arm/crypto/aes-neonbs-core.S149
-rw-r--r--arch/arm/crypto/aes-neonbs-glue.c103
-rw-r--r--arch/arm/crypto/blake2b-neon-core.S347
-rw-r--r--arch/arm/crypto/blake2b-neon-glue.c105
-rw-r--r--arch/arm/crypto/blake2s-core.S306
-rw-r--r--arch/arm/crypto/blake2s-glue.c7
-rw-r--r--arch/arm/crypto/chacha-glue.c49
-rw-r--r--arch/arm/crypto/chacha-neon-core.S97
-rw-r--r--arch/arm/crypto/chacha-scalar-core.S43
-rw-r--r--arch/arm/crypto/crc32-ce-core.S2
-rw-r--r--arch/arm/crypto/curve25519-core.S2
-rw-r--r--arch/arm/crypto/curve25519-glue.c9
-rw-r--r--arch/arm/crypto/ghash-ce-core.S5
-rw-r--r--arch/arm/crypto/ghash-ce-glue.c51
-rw-r--r--arch/arm/crypto/nhpoly1305-neon-glue.c2
-rw-r--r--arch/arm/crypto/poly1305-core.S_shipped1158
-rw-r--r--arch/arm/crypto/poly1305-glue.c18
-rw-r--r--arch/arm/crypto/sha1-armv4-large.S2
-rw-r--r--arch/arm/crypto/sha1-ce-glue.c2
-rw-r--r--arch/arm/crypto/sha1.h2
-rw-r--r--arch/arm/crypto/sha1_glue.c3
-rw-r--r--arch/arm/crypto/sha1_neon_glue.c3
-rw-r--r--arch/arm/crypto/sha2-ce-glue.c2
-rw-r--r--arch/arm/crypto/sha256-armv4.pl6
-rw-r--r--arch/arm/crypto/sha256-core.S_shipped2816
-rw-r--r--arch/arm/crypto/sha256_glue.c3
-rw-r--r--arch/arm/crypto/sha256_neon_glue.c3
-rw-r--r--arch/arm/crypto/sha512-armv4.pl8
-rw-r--r--arch/arm/crypto/sha512-core.S_shipped1869
-rw-r--r--arch/arm/crypto/sha512-glue.c2
-rw-r--r--arch/arm/crypto/sha512-neon-glue.c2
36 files changed, 1301 insertions, 6196 deletions
diff --git a/arch/arm/crypto/.gitignore b/arch/arm/crypto/.gitignore
index 31e1f538df7d..8d7f4bcaec2c 100644
--- a/arch/arm/crypto/.gitignore
+++ b/arch/arm/crypto/.gitignore
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
aesbs-core.S
sha256-core.S
sha512-core.S
+poly1305-core.S
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 2674de6ada1f..3858c4d4cb98 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -1,73 +1,156 @@
# SPDX-License-Identifier: GPL-2.0
-menuconfig ARM_CRYPTO
- bool "ARM Accelerated Cryptographic Algorithms"
- depends on ARM
+menu "Accelerated Cryptographic Algorithms for CPU (arm)"
+
+config CRYPTO_CURVE25519_NEON
+ tristate "Public key crypto: Curve25519 (NEON)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_LIB_CURVE25519_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+ help
+ Curve25519 algorithm
+
+ Architecture: arm with
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_GHASH_ARM_CE
+ tristate "Hash functions: GHASH (PMULL/NEON/ARMv8 Crypto Extensions)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_HASH
+ select CRYPTO_CRYPTD
+ select CRYPTO_GF128MUL
+ help
+ GCM GHASH function (NIST SP800-38D)
+
+ Architecture: arm using
+ - PMULL (Polynomial Multiply Long) instructions
+ - NEON (Advanced SIMD) extensions
+ - ARMv8 Crypto Extensions
+
+ Use an implementation of GHASH (used by the GCM AEAD chaining mode)
+ that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
+ that is part of the ARMv8 Crypto Extensions, or a slower variant that
+ uses the vmull.p8 instruction that is part of the basic NEON ISA.
+
+config CRYPTO_NHPOLY1305_NEON
+ tristate "Hash functions: NHPoly1305 (NEON)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_NHPOLY1305
help
- Say Y here to choose from a selection of cryptographic algorithms
- implemented using ARM specific CPU features or instructions.
+ NHPoly1305 hash function (Adiantum)
-if ARM_CRYPTO
+ Architecture: arm using:
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_POLY1305_ARM
+ tristate "Hash functions: Poly1305 (NEON)"
+ select CRYPTO_HASH
+ select CRYPTO_ARCH_HAVE_LIB_POLY1305
+ help
+ Poly1305 authenticator algorithm (RFC7539)
+
+ Architecture: arm optionally using
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_BLAKE2S_ARM
+ bool "Hash functions: BLAKE2s"
+ select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
+ help
+ BLAKE2s cryptographic hash function (RFC 7693)
+
+ Architecture: arm
+
+ This is faster than the generic implementations of BLAKE2s and
+ BLAKE2b, but slower than the NEON implementation of BLAKE2b.
+ There is no NEON implementation of BLAKE2s, since NEON doesn't
+ really help with it.
+
+config CRYPTO_BLAKE2B_NEON
+ tristate "Hash functions: BLAKE2b (NEON)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_BLAKE2B
+ help
+ BLAKE2b cryptographic hash function (RFC 7693)
+
+ Architecture: arm using
+ - NEON (Advanced SIMD) extensions
+
+ BLAKE2b digest algorithm optimized with ARM NEON instructions.
+ On ARM processors that have NEON support but not the ARMv8
+ Crypto Extensions, typically this BLAKE2b implementation is
+ much faster than the SHA-2 family and slightly faster than
+ SHA-1.
config CRYPTO_SHA1_ARM
- tristate "SHA1 digest algorithm (ARM-asm)"
+ tristate "Hash functions: SHA-1"
select CRYPTO_SHA1
select CRYPTO_HASH
help
- SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented
- using optimized ARM assembler.
+ SHA-1 secure hash algorithm (FIPS 180)
+
+ Architecture: arm
config CRYPTO_SHA1_ARM_NEON
- tristate "SHA1 digest algorithm (ARM NEON)"
+ tristate "Hash functions: SHA-1 (NEON)"
depends on KERNEL_MODE_NEON
select CRYPTO_SHA1_ARM
select CRYPTO_SHA1
select CRYPTO_HASH
help
- SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented
- using optimized ARM NEON assembly, when NEON instructions are
- available.
+ SHA-1 secure hash algorithm (FIPS 180)
+
+ Architecture: arm using
+ - NEON (Advanced SIMD) extensions
config CRYPTO_SHA1_ARM_CE
- tristate "SHA1 digest algorithm (ARM v8 Crypto Extensions)"
- depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800)
+ tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)"
+ depends on KERNEL_MODE_NEON
select CRYPTO_SHA1_ARM
select CRYPTO_HASH
help
- SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented
- using special ARMv8 Crypto Extensions.
+ SHA-1 secure hash algorithm (FIPS 180)
+
+ Architecture: arm using ARMv8 Crypto Extensions
config CRYPTO_SHA2_ARM_CE
- tristate "SHA-224/256 digest algorithm (ARM v8 Crypto Extensions)"
- depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800)
+ tristate "Hash functions: SHA-224 and SHA-256 (ARMv8 Crypto Extensions)"
+ depends on KERNEL_MODE_NEON
select CRYPTO_SHA256_ARM
select CRYPTO_HASH
help
- SHA-256 secure hash standard (DFIPS 180-2) implemented
- using special ARMv8 Crypto Extensions.
+ SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
+
+ Architecture: arm using
+ - ARMv8 Crypto Extensions
config CRYPTO_SHA256_ARM
- tristate "SHA-224/256 digest algorithm (ARM-asm and NEON)"
+ tristate "Hash functions: SHA-224 and SHA-256 (NEON)"
select CRYPTO_HASH
depends on !CPU_V7M
help
- SHA-256 secure hash standard (DFIPS 180-2) implemented
- using optimized ARM assembler and NEON, when available.
+ SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
+
+ Architecture: arm using
+ - NEON (Advanced SIMD) extensions
config CRYPTO_SHA512_ARM
- tristate "SHA-384/512 digest algorithm (ARM-asm and NEON)"
+ tristate "Hash functions: SHA-384 and SHA-512 (NEON)"
select CRYPTO_HASH
depends on !CPU_V7M
help
- SHA-512 secure hash standard (DFIPS 180-2) implemented
- using optimized ARM assembler and NEON, when available.
+ SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
+
+ Architecture: arm using
+ - NEON (Advanced SIMD) extensions
config CRYPTO_AES_ARM
- tristate "Scalar AES cipher for ARM"
+ tristate "Ciphers: AES"
select CRYPTO_ALGAPI
select CRYPTO_AES
help
- Use optimized AES assembler routines for ARM platforms.
+ Block ciphers: AES cipher algorithms (FIPS-197)
+
+ Architecture: arm
On ARM processors without the Crypto Extensions, this is the
fastest AES implementation for single blocks. For multiple
@@ -79,14 +162,21 @@ config CRYPTO_AES_ARM
such attacks very difficult.
config CRYPTO_AES_ARM_BS
- tristate "Bit sliced AES using NEON instructions"
+ tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (bit-sliced NEON)"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_LIB_AES
+ select CRYPTO_AES
+ select CRYPTO_CBC
select CRYPTO_SIMD
help
- Use a faster and more secure NEON based implementation of AES in CBC,
- CTR and XTS modes
+ Length-preserving ciphers: AES cipher algorithms (FIPS-197)
+ with block cipher modes:
+ - ECB (Electronic Codebook) mode (NIST SP800-38A)
+ - CBC (Cipher Block Chaining) mode (NIST SP800-38A)
+ - CTR (Counter) mode (NIST SP800-38A)
+ - XTS (XOR Encrypt XOR with ciphertext stealing) mode (NIST SP800-38E
+ and IEEE 1619)
Bit sliced AES gives around 45% speedup on Cortex-A15 for CTR mode
and for XTS mode encryption, CBC and XTS mode decryption speedup is
@@ -95,58 +185,59 @@ config CRYPTO_AES_ARM_BS
believed to be invulnerable to cache timing attacks.
config CRYPTO_AES_ARM_CE
- tristate "Accelerated AES using ARMv8 Crypto Extensions"
- depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800)
+ tristate "Ciphers: AES, modes: ECB/CBC/CTS/CTR/XTS (ARMv8 Crypto Extensions)"
+ depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_LIB_AES
select CRYPTO_SIMD
help
- Use an implementation of AES in CBC, CTR and XTS modes that uses
- ARMv8 Crypto Extensions
+ Length-preserving ciphers: AES cipher algorithms (FIPS-197)
+ with block cipher modes:
+ - ECB (Electronic Codebook) mode (NIST SP800-38A)
+ - CBC (Cipher Block Chaining) mode (NIST SP800-38A)
+ - CTR (Counter) mode (NIST SP800-38A)
+ - CTS (Cipher Text Stealing) mode (NIST SP800-38A)
+ - XTS (XOR Encrypt XOR with ciphertext stealing) mode (NIST SP800-38E
+ and IEEE 1619)
+
+ Architecture: arm using:
+ - ARMv8 Crypto Extensions
-config CRYPTO_GHASH_ARM_CE
- tristate "PMULL-accelerated GHASH using NEON/ARMv8 Crypto Extensions"
- depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800)
- select CRYPTO_HASH
- select CRYPTO_CRYPTD
- select CRYPTO_GF128MUL
+config CRYPTO_CHACHA20_NEON
+ tristate "Ciphers: ChaCha20, XChaCha20, XChaCha12 (NEON)"
+ select CRYPTO_SKCIPHER
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
help
- Use an implementation of GHASH (used by the GCM AEAD chaining mode)
- that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
- that is part of the ARMv8 Crypto Extensions, or a slower variant that
- uses the vmull.p8 instruction that is part of the basic NEON ISA.
+ Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
+ stream cipher algorithms
-config CRYPTO_CRCT10DIF_ARM_CE
- tristate "CRCT10DIF digest algorithm using PMULL instructions"
- depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800)
- depends on CRC_T10DIF
- select CRYPTO_HASH
+ Architecture: arm using:
+ - NEON (Advanced SIMD) extensions
config CRYPTO_CRC32_ARM_CE
- tristate "CRC32(C) digest algorithm using CRC and/or PMULL instructions"
- depends on KERNEL_MODE_NEON && (CC_IS_CLANG || GCC_VERSION >= 40800)
+ tristate "CRC32C and CRC32"
+ depends on KERNEL_MODE_NEON
depends on CRC32
select CRYPTO_HASH
+ help
+ CRC32c CRC algorithm with the iSCSI polynomial (RFC 3385 and RFC 3720)
+ and CRC32 CRC algorithm (IEEE 802.3)
-config CRYPTO_CHACHA20_NEON
- tristate "NEON and scalar accelerated ChaCha stream cipher algorithms"
- select CRYPTO_SKCIPHER
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
+ Architecture: arm using:
+ - CRC and/or PMULL instructions
-config CRYPTO_POLY1305_ARM
- tristate "Accelerated scalar and SIMD Poly1305 hash implementations"
- select CRYPTO_HASH
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
+ Drivers: crc32-arm-ce and crc32c-arm-ce
-config CRYPTO_NHPOLY1305_NEON
- tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
+config CRYPTO_CRCT10DIF_ARM_CE
+ tristate "CRCT10DIF"
depends on KERNEL_MODE_NEON
- select CRYPTO_NHPOLY1305
+ depends on CRC_T10DIF
+ select CRYPTO_HASH
+ help
+ CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF)
-config CRYPTO_CURVE25519_NEON
- tristate "NEON accelerated Curve25519 scalar multiplication library"
- depends on KERNEL_MODE_NEON
- select CRYPTO_LIB_CURVE25519_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+ Architecture: arm using:
+ - PMULL (Polynomial Multiply Long) instructions
+
+endmenu
-endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index b745c17d356f..971e74546fb1 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -9,6 +9,8 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
+obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o
+obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
@@ -29,6 +31,8 @@ sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
sha512-arm-y := sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
+libblake2s-arm-y:= blake2s-core.o blake2s-glue.o
+blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o
sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o
aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
@@ -41,20 +45,12 @@ poly1305-arm-y := poly1305-core.o poly1305-glue.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
curve25519-neon-y := curve25519-core.o curve25519-glue.o
-ifdef REGENERATE_ARM_CRYPTO
quiet_cmd_perl = PERL $@
cmd_perl = $(PERL) $(<) > $(@)
-$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv4.pl
+$(obj)/%-core.S: $(src)/%-armv4.pl
$(call cmd,perl)
-$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
- $(call cmd,perl)
-
-$(src)/sha512-core.S_shipped: $(src)/sha512-armv4.pl
- $(call cmd,perl)
-endif
-
clean-files += poly1305-core.S sha256-core.S sha512-core.S
# massage the perlasm code a bit so we only get the NEON routine if we need it
diff --git a/arch/arm/crypto/aes-ce-core.S b/arch/arm/crypto/aes-ce-core.S
index 4d1707388d94..312428d83eed 100644
--- a/arch/arm/crypto/aes-ce-core.S
+++ b/arch/arm/crypto/aes-ce-core.S
@@ -386,20 +386,32 @@ ENTRY(ce_aes_ctr_encrypt)
.Lctrloop4x:
subs r4, r4, #4
bmi .Lctr1x
- add r6, r6, #1
+
+ /*
+ * NOTE: the sequence below has been carefully tweaked to avoid
+ * a silicon erratum that exists in Cortex-A57 (#1742098) and
+ * Cortex-A72 (#1655431) cores, where AESE/AESMC instruction pairs
+ * may produce an incorrect result if they take their input from a
+ * register of which a single 32-bit lane has been updated the last
+ * time it was modified. To work around this, the lanes of registers
+ * q0-q3 below are not manipulated individually, and the different
+ * counter values are prepared by successive manipulations of q7.
+ */
+ add ip, r6, #1
vmov q0, q7
+ rev ip, ip
+ add lr, r6, #2
+ vmov s31, ip @ set lane 3 of q1 via q7
+ add ip, r6, #3
+ rev lr, lr
vmov q1, q7
- rev ip, r6
- add r6, r6, #1
+ vmov s31, lr @ set lane 3 of q2 via q7
+ rev ip, ip
vmov q2, q7
- vmov s7, ip
- rev ip, r6
- add r6, r6, #1
+ vmov s31, ip @ set lane 3 of q3 via q7
+ add r6, r6, #4
vmov q3, q7
- vmov s11, ip
- rev ip, r6
- add r6, r6, #1
- vmov s15, ip
+
vld1.8 {q4-q5}, [r1]!
vld1.8 {q6}, [r1]!
vld1.8 {q15}, [r1]!
diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S
index 472e56d09eea..1da3f41359aa 100644
--- a/arch/arm/crypto/aes-cipher-core.S
+++ b/arch/arm/crypto/aes-cipher-core.S
@@ -99,28 +99,6 @@
__hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op, \oldcpsr
.endm
- .macro __rev, out, in
- .if __LINUX_ARM_ARCH__ < 6
- lsl t0, \in, #24
- and t1, \in, #0xff00
- and t2, \in, #0xff0000
- orr \out, t0, \in, lsr #24
- orr \out, \out, t1, lsl #8
- orr \out, \out, t2, lsr #8
- .else
- rev \out, \in
- .endif
- .endm
-
- .macro __adrl, out, sym, c
- .if __LINUX_ARM_ARCH__ < 7
- ldr\c \out, =\sym
- .else
- movw\c \out, #:lower16:\sym
- movt\c \out, #:upper16:\sym
- .endif
- .endm
-
.macro do_crypt, round, ttab, ltab, bsz
push {r3-r11, lr}
@@ -133,10 +111,10 @@
ldr r7, [in, #12]
#ifdef CONFIG_CPU_BIG_ENDIAN
- __rev r4, r4
- __rev r5, r5
- __rev r6, r6
- __rev r7, r7
+ rev_l r4, t0
+ rev_l r5, t0
+ rev_l r6, t0
+ rev_l r7, t0
#endif
eor r4, r4, r8
@@ -144,7 +122,7 @@
eor r6, r6, r10
eor r7, r7, r11
- __adrl ttab, \ttab
+ mov_l ttab, \ttab
/*
* Disable interrupts and prefetch the 1024-byte 'ft' or 'it' table into
* L1 cache, assuming cacheline size >= 32. This is a hardening measure
@@ -180,7 +158,7 @@
2: .ifb \ltab
add ttab, ttab, #1
.else
- __adrl ttab, \ltab
+ mov_l ttab, \ltab
// Prefetch inverse S-box for final round; see explanation above
.set i, 0
.rept 256 / 64
@@ -194,10 +172,10 @@
\round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b, rounds
#ifdef CONFIG_CPU_BIG_ENDIAN
- __rev r4, r4
- __rev r5, r5
- __rev r6, r6
- __rev r7, r7
+ rev_l r4, t0
+ rev_l r5, t0
+ rev_l r6, t0
+ rev_l r7, t0
#endif
ldr out, [sp]
diff --git a/arch/arm/crypto/aes-neonbs-core.S b/arch/arm/crypto/aes-neonbs-core.S
index cfaed4e67535..7b61032f29fa 100644
--- a/arch/arm/crypto/aes-neonbs-core.S
+++ b/arch/arm/crypto/aes-neonbs-core.S
@@ -77,11 +77,6 @@
vldr \out\()h, \sym + 8
.endm
- .macro __adr, reg, lbl
- adr \reg, \lbl
-THUMB( orr \reg, \reg, #1 )
- .endm
-
.macro in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
veor \b2, \b2, \b1
veor \b5, \b5, \b6
@@ -629,11 +624,11 @@ ENDPROC(aesbs_decrypt8)
push {r4-r6, lr}
ldr r5, [sp, #16] // number of blocks
-99: __adr ip, 0f
+99: adr ip, 0f
and lr, r5, #7
cmp r5, #8
sub ip, ip, lr, lsl #2
- bxlt ip // computed goto if blocks < 8
+ movlt pc, ip // computed goto if blocks < 8
vld1.8 {q0}, [r1]!
vld1.8 {q1}, [r1]!
@@ -648,11 +643,11 @@ ENDPROC(aesbs_decrypt8)
mov rounds, r3
bl \do8
- __adr ip, 1f
+ adr ip, 1f
and lr, r5, #7
cmp r5, #8
sub ip, ip, lr, lsl #2
- bxlt ip // computed goto if blocks < 8
+ movlt pc, ip // computed goto if blocks < 8
vst1.8 {\o0}, [r0]!
vst1.8 {\o1}, [r0]!
@@ -689,12 +684,12 @@ ENTRY(aesbs_cbc_decrypt)
push {r4-r6, lr}
ldm ip, {r5-r6} // load args 4-5
-99: __adr ip, 0f
+99: adr ip, 0f
and lr, r5, #7
cmp r5, #8
sub ip, ip, lr, lsl #2
mov lr, r1
- bxlt ip // computed goto if blocks < 8
+ movlt pc, ip // computed goto if blocks < 8
vld1.8 {q0}, [lr]!
vld1.8 {q1}, [lr]!
@@ -718,11 +713,11 @@ ENTRY(aesbs_cbc_decrypt)
vmov q14, q8
vmov q15, q8
- __adr ip, 1f
+ adr ip, 1f
and lr, r5, #7
cmp r5, #8
sub ip, ip, lr, lsl #2
- bxlt ip // computed goto if blocks < 8
+ movlt pc, ip // computed goto if blocks < 8
vld1.8 {q9}, [r1]!
vld1.8 {q10}, [r1]!
@@ -733,9 +728,9 @@ ENTRY(aesbs_cbc_decrypt)
vld1.8 {q15}, [r1]!
W(nop)
-1: __adr ip, 2f
+1: adr ip, 2f
sub ip, ip, lr, lsl #3
- bxlt ip // computed goto if blocks < 8
+ movlt pc, ip // computed goto if blocks < 8
veor q0, q0, q8
vst1.8 {q0}, [r0]!
@@ -763,29 +758,24 @@ ENTRY(aesbs_cbc_decrypt)
ENDPROC(aesbs_cbc_decrypt)
.macro next_ctr, q
- vmov.32 \q\()h[1], r10
+ vmov \q\()h, r9, r10
adds r10, r10, #1
- vmov.32 \q\()h[0], r9
adcs r9, r9, #0
- vmov.32 \q\()l[1], r8
+ vmov \q\()l, r7, r8
adcs r8, r8, #0
- vmov.32 \q\()l[0], r7
adc r7, r7, #0
vrev32.8 \q, \q
.endm
/*
* aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
- * int rounds, int blocks, u8 ctr[], u8 final[])
+ * int rounds, int bytes, u8 ctr[])
*/
ENTRY(aesbs_ctr_encrypt)
mov ip, sp
push {r4-r10, lr}
- ldm ip, {r5-r7} // load args 4-6
- teq r7, #0
- addne r5, r5, #1 // one extra block if final != 0
-
+ ldm ip, {r5, r6} // load args 4-5
vld1.8 {q0}, [r6] // load counter
vrev32.8 q1, q0
vmov r9, r10, d3
@@ -797,20 +787,19 @@ ENTRY(aesbs_ctr_encrypt)
adc r7, r7, #0
99: vmov q1, q0
+ sub lr, r5, #1
vmov q2, q0
+ adr ip, 0f
vmov q3, q0
+ and lr, lr, #112
vmov q4, q0
+ cmp r5, #112
vmov q5, q0
+ sub ip, ip, lr, lsl #1
vmov q6, q0
+ add ip, ip, lr, lsr #2
vmov q7, q0
-
- __adr ip, 0f
- sub lr, r5, #1
- and lr, lr, #7
- cmp r5, #8
- sub ip, ip, lr, lsl #5
- sub ip, ip, lr, lsl #2
- bxlt ip // computed goto if blocks < 8
+ movle pc, ip // computed goto if bytes < 112
next_ctr q1
next_ctr q2
@@ -824,13 +813,15 @@ ENTRY(aesbs_ctr_encrypt)
mov rounds, r3
bl aesbs_encrypt8
- __adr ip, 1f
- and lr, r5, #7
- cmp r5, #8
- movgt r4, #0
- ldrle r4, [sp, #40] // load final in the last round
- sub ip, ip, lr, lsl #2
- bxlt ip // computed goto if blocks < 8
+ adr ip, 1f
+ sub lr, r5, #1
+ cmp r5, #128
+ bic lr, lr, #15
+ ands r4, r5, #15 // preserves C flag
+ teqcs r5, r5 // set Z flag if not last iteration
+ sub ip, ip, lr, lsr #2
+ rsb r4, r4, #16
+ movcc pc, ip // computed goto if bytes < 128
vld1.8 {q8}, [r1]!
vld1.8 {q9}, [r1]!
@@ -839,46 +830,70 @@ ENTRY(aesbs_ctr_encrypt)
vld1.8 {q12}, [r1]!
vld1.8 {q13}, [r1]!
vld1.8 {q14}, [r1]!
- teq r4, #0 // skip last block if 'final'
-1: bne 2f
+1: subne r1, r1, r4
vld1.8 {q15}, [r1]!
-2: __adr ip, 3f
- cmp r5, #8
- sub ip, ip, lr, lsl #3
- bxlt ip // computed goto if blocks < 8
+ add ip, ip, #2f - 1b
veor q0, q0, q8
- vst1.8 {q0}, [r0]!
veor q1, q1, q9
- vst1.8 {q1}, [r0]!
veor q4, q4, q10
- vst1.8 {q4}, [r0]!
veor q6, q6, q11
- vst1.8 {q6}, [r0]!
veor q3, q3, q12
- vst1.8 {q3}, [r0]!
veor q7, q7, q13
- vst1.8 {q7}, [r0]!
veor q2, q2, q14
+ bne 3f
+ veor q5, q5, q15
+
+ movcc pc, ip // computed goto if bytes < 128
+
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q1}, [r0]!
+ vst1.8 {q4}, [r0]!
+ vst1.8 {q6}, [r0]!
+ vst1.8 {q3}, [r0]!
+ vst1.8 {q7}, [r0]!
vst1.8 {q2}, [r0]!
- teq r4, #0 // skip last block if 'final'
- W(bne) 5f
-3: veor q5, q5, q15
+2: subne r0, r0, r4
vst1.8 {q5}, [r0]!
-4: next_ctr q0
+ next_ctr q0
- subs r5, r5, #8
+ subs r5, r5, #128
bgt 99b
vst1.8 {q0}, [r6]
pop {r4-r10, pc}
-5: vst1.8 {q5}, [r4]
- b 4b
+3: adr lr, .Lpermute_table + 16
+ cmp r5, #16 // Z flag remains cleared
+ sub lr, lr, r4
+ vld1.8 {q8-q9}, [lr]
+ vtbl.8 d16, {q5}, d16
+ vtbl.8 d17, {q5}, d17
+ veor q5, q8, q15
+ bcc 4f // have to reload prev if R5 < 16
+ vtbx.8 d10, {q2}, d18
+ vtbx.8 d11, {q2}, d19
+ mov pc, ip // branch back to VST sequence
+
+4: sub r0, r0, r4
+ vshr.s8 q9, q9, #7 // create mask for VBIF
+ vld1.8 {q8}, [r0] // reload
+ vbif q5, q8, q9
+ vst1.8 {q5}, [r0]
+ pop {r4-r10, pc}
ENDPROC(aesbs_ctr_encrypt)
+ .align 6
+.Lpermute_table:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
.macro next_tweak, out, in, const, tmp
vshr.s64 \tmp, \in, #63
vand \tmp, \tmp, \const
@@ -893,6 +908,7 @@ ENDPROC(aesbs_ctr_encrypt)
* aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[], int reorder_last_tweak)
*/
+ .align 6
__xts_prepare8:
vld1.8 {q14}, [r7] // load iv
vmov.i32 d30, #0x87 // compose tweak mask vector
@@ -900,12 +916,12 @@ __xts_prepare8:
vshr.u64 d30, d31, #7
vmov q12, q14
- __adr ip, 0f
+ adr ip, 0f
and r4, r6, #7
cmp r6, #8
sub ip, ip, r4, lsl #5
mov r4, sp
- bxlt ip // computed goto if blocks < 8
+ movlt pc, ip // computed goto if blocks < 8
vld1.8 {q0}, [r1]!
next_tweak q12, q14, q15, q13
@@ -961,8 +977,7 @@ ENDPROC(__xts_prepare8)
push {r4-r8, lr}
mov r5, sp // preserve sp
ldrd r6, r7, [sp, #24] // get blocks and iv args
- ldr r8, [sp, #32] // reorder final tweak?
- rsb r8, r8, #1
+ rsb r8, ip, #1
sub ip, sp, #128 // make room for 8x tweak
bic ip, ip, #0xf // align sp to 16 bytes
mov sp, ip
@@ -973,12 +988,12 @@ ENDPROC(__xts_prepare8)
mov rounds, r3
bl \do8
- __adr ip, 0f
+ adr ip, 0f
and lr, r6, #7
cmp r6, #8
sub ip, ip, lr, lsl #2
mov r4, sp
- bxlt ip // computed goto if blocks < 8
+ movlt pc, ip // computed goto if blocks < 8
vld1.8 {q8}, [r4, :128]!
vld1.8 {q9}, [r4, :128]!
@@ -989,9 +1004,9 @@ ENDPROC(__xts_prepare8)
vld1.8 {q14}, [r4, :128]!
vld1.8 {q15}, [r4, :128]
-0: __adr ip, 1f
+0: adr ip, 1f
sub ip, ip, lr, lsl #3
- bxlt ip // computed goto if blocks < 8
+ movlt pc, ip // computed goto if blocks < 8
veor \o0, \o0, q8
vst1.8 {\o0}, [r0]!
@@ -1018,9 +1033,11 @@ ENDPROC(__xts_prepare8)
.endm
ENTRY(aesbs_xts_encrypt)
+ mov ip, #0 // never reorder final tweak
__xts_crypt aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_xts_encrypt)
ENTRY(aesbs_xts_decrypt)
+ ldr ip, [sp, #8] // reorder final tweak?
__xts_crypt aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_xts_decrypt)
diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c
index e85839a8aaeb..f00f042ef357 100644
--- a/arch/arm/crypto/aes-neonbs-glue.c
+++ b/arch/arm/crypto/aes-neonbs-glue.c
@@ -8,8 +8,8 @@
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/aes.h>
-#include <crypto/cbc.h>
#include <crypto/ctr.h>
+#include <crypto/internal/cipher.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
@@ -20,10 +20,12 @@ MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("ecb(aes)");
-MODULE_ALIAS_CRYPTO("cbc(aes)");
+MODULE_ALIAS_CRYPTO("cbc(aes)-all");
MODULE_ALIAS_CRYPTO("ctr(aes)");
MODULE_ALIAS_CRYPTO("xts(aes)");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
+
asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds);
asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
@@ -35,7 +37,7 @@ asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[]);
asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
- int rounds, int blocks, u8 ctr[], u8 final[]);
+ int rounds, int blocks, u8 ctr[]);
asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[], int);
@@ -49,7 +51,7 @@ struct aesbs_ctx {
struct aesbs_cbc_ctx {
struct aesbs_ctx key;
- struct crypto_cipher *enc_tfm;
+ struct crypto_skcipher *enc_tfm;
};
struct aesbs_xts_ctx {
@@ -138,20 +140,25 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
kernel_neon_begin();
aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
kernel_neon_end();
+ memzero_explicit(&rk, sizeof(rk));
- return crypto_cipher_setkey(ctx->enc_tfm, in_key, key_len);
+ return crypto_skcipher_setkey(ctx->enc_tfm, in_key, key_len);
}
-static void cbc_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
+static int cbc_encrypt(struct skcipher_request *req)
{
+ struct skcipher_request *subreq = skcipher_request_ctx(req);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
- crypto_cipher_encrypt_one(ctx->enc_tfm, dst, src);
-}
+ skcipher_request_set_tfm(subreq, ctx->enc_tfm);
+ skcipher_request_set_callback(subreq,
+ skcipher_request_flags(req),
+ NULL, NULL);
+ skcipher_request_set_crypt(subreq, req->src, req->dst,
+ req->cryptlen, req->iv);
-static int cbc_encrypt(struct skcipher_request *req)
-{
- return crypto_cbc_encrypt_walk(req, cbc_encrypt_one);
+ return crypto_skcipher_encrypt(subreq);
}
static int cbc_decrypt(struct skcipher_request *req)
@@ -182,20 +189,28 @@ static int cbc_decrypt(struct skcipher_request *req)
return err;
}
-static int cbc_init(struct crypto_tfm *tfm)
+static int cbc_init(struct crypto_skcipher *tfm)
{
- struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+ unsigned int reqsize;
+
+ ctx->enc_tfm = crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC |
+ CRYPTO_ALG_NEED_FALLBACK);
+ if (IS_ERR(ctx->enc_tfm))
+ return PTR_ERR(ctx->enc_tfm);
- ctx->enc_tfm = crypto_alloc_cipher("aes", 0, 0);
+ reqsize = sizeof(struct skcipher_request);
+ reqsize += crypto_skcipher_reqsize(ctx->enc_tfm);
+ crypto_skcipher_set_reqsize(tfm, reqsize);
- return PTR_ERR_OR_ZERO(ctx->enc_tfm);
+ return 0;
}
-static void cbc_exit(struct crypto_tfm *tfm)
+static void cbc_exit(struct crypto_skcipher *tfm)
{
- struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
- crypto_free_cipher(ctx->enc_tfm);
+ crypto_free_skcipher(ctx->enc_tfm);
}
static int aesbs_ctr_setkey_sync(struct crypto_skcipher *tfm, const u8 *in_key,
@@ -228,32 +243,25 @@ static int ctr_encrypt(struct skcipher_request *req)
err = skcipher_walk_virt(&walk, req, false);
while (walk.nbytes > 0) {
- unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
- u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL;
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ int bytes = walk.nbytes;
- if (walk.nbytes < walk.total) {
- blocks = round_down(blocks,
- walk.stride / AES_BLOCK_SIZE);
- final = NULL;
- }
+ if (unlikely(bytes < AES_BLOCK_SIZE))
+ src = dst = memcpy(buf + sizeof(buf) - bytes,
+ src, bytes);
+ else if (walk.nbytes < walk.total)
+ bytes &= ~(8 * AES_BLOCK_SIZE - 1);
kernel_neon_begin();
- aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- ctx->rk, ctx->rounds, blocks, walk.iv, final);
+ aesbs_ctr_encrypt(dst, src, ctx->rk, ctx->rounds, bytes, walk.iv);
kernel_neon_end();
- if (final) {
- u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
- u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
-
- crypto_xor_cpy(dst, src, final,
- walk.total % AES_BLOCK_SIZE);
+ if (unlikely(bytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr,
+ buf + sizeof(buf) - bytes, bytes);
- err = skcipher_walk_done(&walk, 0);
- break;
- }
- err = skcipher_walk_done(&walk,
- walk.nbytes - blocks * AES_BLOCK_SIZE);
+ err = skcipher_walk_done(&walk, walk.nbytes - bytes);
}
return err;
@@ -303,9 +311,9 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
return aesbs_setkey(tfm, in_key, key_len);
}
-static int xts_init(struct crypto_tfm *tfm)
+static int xts_init(struct crypto_skcipher *tfm)
{
- struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
ctx->cts_tfm = crypto_alloc_cipher("aes", 0, 0);
if (IS_ERR(ctx->cts_tfm))
@@ -318,9 +326,9 @@ static int xts_init(struct crypto_tfm *tfm)
return PTR_ERR_OR_ZERO(ctx->tweak_tfm);
}
-static void xts_exit(struct crypto_tfm *tfm)
+static void xts_exit(struct crypto_skcipher *tfm)
{
- struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
crypto_free_cipher(ctx->tweak_tfm);
crypto_free_cipher(ctx->cts_tfm);
@@ -430,9 +438,8 @@ static struct skcipher_alg aes_algs[] = { {
.base.cra_blocksize = AES_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct aesbs_cbc_ctx),
.base.cra_module = THIS_MODULE,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_init = cbc_init,
- .base.cra_exit = cbc_exit,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL |
+ CRYPTO_ALG_NEED_FALLBACK,
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
@@ -441,6 +448,8 @@ static struct skcipher_alg aes_algs[] = { {
.setkey = aesbs_cbc_setkey,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
+ .init = cbc_init,
+ .exit = cbc_exit,
}, {
.base.cra_name = "__ctr(aes)",
.base.cra_driver_name = "__ctr-aes-neonbs",
@@ -482,8 +491,6 @@ static struct skcipher_alg aes_algs[] = { {
.base.cra_ctxsize = sizeof(struct aesbs_xts_ctx),
.base.cra_module = THIS_MODULE,
.base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_init = xts_init,
- .base.cra_exit = xts_exit,
.min_keysize = 2 * AES_MIN_KEY_SIZE,
.max_keysize = 2 * AES_MAX_KEY_SIZE,
@@ -492,6 +499,8 @@ static struct skcipher_alg aes_algs[] = { {
.setkey = aesbs_xts_setkey,
.encrypt = xts_encrypt,
.decrypt = xts_decrypt,
+ .init = xts_init,
+ .exit = xts_exit,
} };
static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
diff --git a/arch/arm/crypto/blake2b-neon-core.S b/arch/arm/crypto/blake2b-neon-core.S
new file mode 100644
index 000000000000..0406a186377f
--- /dev/null
+++ b/arch/arm/crypto/blake2b-neon-core.S
@@ -0,0 +1,347 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2b digest algorithm, NEON accelerated
+ *
+ * Copyright 2020 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+ .text
+ .fpu neon
+
+ // The arguments to blake2b_compress_neon()
+ STATE .req r0
+ BLOCK .req r1
+ NBLOCKS .req r2
+ INC .req r3
+
+ // Pointers to the rotation tables
+ ROR24_TABLE .req r4
+ ROR16_TABLE .req r5
+
+ // The original stack pointer
+ ORIG_SP .req r6
+
+ // NEON registers which contain the message words of the current block.
+ // M_0-M_3 are occasionally used for other purposes too.
+ M_0 .req d16
+ M_1 .req d17
+ M_2 .req d18
+ M_3 .req d19
+ M_4 .req d20
+ M_5 .req d21
+ M_6 .req d22
+ M_7 .req d23
+ M_8 .req d24
+ M_9 .req d25
+ M_10 .req d26
+ M_11 .req d27
+ M_12 .req d28
+ M_13 .req d29
+ M_14 .req d30
+ M_15 .req d31
+
+ .align 4
+ // Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
+ // instruction. This is the most efficient way to implement these
+ // rotation amounts with NEON. (On Cortex-A53 it's the same speed as
+ // vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
+.Lror24_table:
+ .byte 3, 4, 5, 6, 7, 0, 1, 2
+.Lror16_table:
+ .byte 2, 3, 4, 5, 6, 7, 0, 1
+ // The BLAKE2b initialization vector
+.Lblake2b_IV:
+ .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
+ .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
+ .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f
+ .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
+
+// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
+// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack
+// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
+// (M_0-M_3), so that they can be reloaded if they are used as temporary
+// registers. The macro arguments s0-s15 give the order in which the message
+// words are used in this round. 'final' is 1 if this is the final round.
+.macro _blake2b_round s0, s1, s2, s3, s4, s5, s6, s7, \
+ s8, s9, s10, s11, s12, s13, s14, s15, final=0
+
+ // Mix the columns:
+ // (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
+ // (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
+
+ // a += b + m[blake2b_sigma[r][2*i + 0]];
+ vadd.u64 q0, q0, q2
+ vadd.u64 q1, q1, q3
+ vadd.u64 d0, d0, M_\s0
+ vadd.u64 d1, d1, M_\s2
+ vadd.u64 d2, d2, M_\s4
+ vadd.u64 d3, d3, M_\s6
+
+ // d = ror64(d ^ a, 32);
+ veor q6, q6, q0
+ veor q7, q7, q1
+ vrev64.32 q6, q6
+ vrev64.32 q7, q7
+
+ // c += d;
+ vadd.u64 q4, q4, q6
+ vadd.u64 q5, q5, q7
+
+ // b = ror64(b ^ c, 24);
+ vld1.8 {M_0}, [ROR24_TABLE, :64]
+ veor q2, q2, q4
+ veor q3, q3, q5
+ vtbl.8 d4, {d4}, M_0
+ vtbl.8 d5, {d5}, M_0
+ vtbl.8 d6, {d6}, M_0
+ vtbl.8 d7, {d7}, M_0
+
+ // a += b + m[blake2b_sigma[r][2*i + 1]];
+ //
+ // M_0 got clobbered above, so we have to reload it if any of the four
+ // message words this step needs happens to be M_0. Otherwise we don't
+ // need to reload it here, as it will just get clobbered again below.
+.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
+ vld1.8 {M_0}, [sp, :64]
+.endif
+ vadd.u64 q0, q0, q2
+ vadd.u64 q1, q1, q3
+ vadd.u64 d0, d0, M_\s1
+ vadd.u64 d1, d1, M_\s3
+ vadd.u64 d2, d2, M_\s5
+ vadd.u64 d3, d3, M_\s7
+
+ // d = ror64(d ^ a, 16);
+ vld1.8 {M_0}, [ROR16_TABLE, :64]
+ veor q6, q6, q0
+ veor q7, q7, q1
+ vtbl.8 d12, {d12}, M_0
+ vtbl.8 d13, {d13}, M_0
+ vtbl.8 d14, {d14}, M_0
+ vtbl.8 d15, {d15}, M_0
+
+ // c += d;
+ vadd.u64 q4, q4, q6
+ vadd.u64 q5, q5, q7
+
+ // b = ror64(b ^ c, 63);
+ //
+ // This rotation amount isn't a multiple of 8, so it has to be
+ // implemented using a pair of shifts, which requires temporary
+ // registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
+ veor q8, q2, q4
+ veor q9, q3, q5
+ vshr.u64 q2, q8, #63
+ vshr.u64 q3, q9, #63
+ vsli.u64 q2, q8, #1
+ vsli.u64 q3, q9, #1
+ vld1.8 {q8-q9}, [sp, :256]
+
+ // Mix the diagonals:
+ // (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
+ // (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
+ //
+ // There are two possible ways to do this: use 'vext' instructions to
+ // shift the rows of the matrix so that the diagonals become columns,
+ // and undo it afterwards; or just use 64-bit operations on 'd'
+ // registers instead of 128-bit operations on 'q' registers. We use the
+ // latter approach, as it performs much better on Cortex-A7.
+
+ // a += b + m[blake2b_sigma[r][2*i + 0]];
+ vadd.u64 d0, d0, d5
+ vadd.u64 d1, d1, d6
+ vadd.u64 d2, d2, d7
+ vadd.u64 d3, d3, d4
+ vadd.u64 d0, d0, M_\s8
+ vadd.u64 d1, d1, M_\s10
+ vadd.u64 d2, d2, M_\s12
+ vadd.u64 d3, d3, M_\s14
+
+ // d = ror64(d ^ a, 32);
+ veor d15, d15, d0
+ veor d12, d12, d1
+ veor d13, d13, d2
+ veor d14, d14, d3
+ vrev64.32 d15, d15
+ vrev64.32 d12, d12
+ vrev64.32 d13, d13
+ vrev64.32 d14, d14
+
+ // c += d;
+ vadd.u64 d10, d10, d15
+ vadd.u64 d11, d11, d12
+ vadd.u64 d8, d8, d13
+ vadd.u64 d9, d9, d14
+
+ // b = ror64(b ^ c, 24);
+ vld1.8 {M_0}, [ROR24_TABLE, :64]
+ veor d5, d5, d10
+ veor d6, d6, d11
+ veor d7, d7, d8
+ veor d4, d4, d9
+ vtbl.8 d5, {d5}, M_0
+ vtbl.8 d6, {d6}, M_0
+ vtbl.8 d7, {d7}, M_0
+ vtbl.8 d4, {d4}, M_0
+
+ // a += b + m[blake2b_sigma[r][2*i + 1]];
+.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
+ vld1.8 {M_0}, [sp, :64]
+.endif
+ vadd.u64 d0, d0, d5
+ vadd.u64 d1, d1, d6
+ vadd.u64 d2, d2, d7
+ vadd.u64 d3, d3, d4
+ vadd.u64 d0, d0, M_\s9
+ vadd.u64 d1, d1, M_\s11
+ vadd.u64 d2, d2, M_\s13
+ vadd.u64 d3, d3, M_\s15
+
+ // d = ror64(d ^ a, 16);
+ vld1.8 {M_0}, [ROR16_TABLE, :64]
+ veor d15, d15, d0
+ veor d12, d12, d1
+ veor d13, d13, d2
+ veor d14, d14, d3
+ vtbl.8 d12, {d12}, M_0
+ vtbl.8 d13, {d13}, M_0
+ vtbl.8 d14, {d14}, M_0
+ vtbl.8 d15, {d15}, M_0
+
+ // c += d;
+ vadd.u64 d10, d10, d15
+ vadd.u64 d11, d11, d12
+ vadd.u64 d8, d8, d13
+ vadd.u64 d9, d9, d14
+
+ // b = ror64(b ^ c, 63);
+ veor d16, d4, d9
+ veor d17, d5, d10
+ veor d18, d6, d11
+ veor d19, d7, d8
+ vshr.u64 q2, q8, #63
+ vshr.u64 q3, q9, #63
+ vsli.u64 q2, q8, #1
+ vsli.u64 q3, q9, #1
+ // Reloading q8-q9 can be skipped on the final round.
+.if ! \final
+ vld1.8 {q8-q9}, [sp, :256]
+.endif
+.endm
+
+//
+// void blake2b_compress_neon(struct blake2b_state *state,
+// const u8 *block, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2b_state are used:
+// u64 h[8]; (inout)
+// u64 t[2]; (inout)
+// u64 f[2]; (in)
+//
+ .align 5
+ENTRY(blake2b_compress_neon)
+ push {r4-r10}
+
+ // Allocate a 32-byte stack buffer that is 32-byte aligned.
+ mov ORIG_SP, sp
+ sub ip, sp, #32
+ bic ip, ip, #31
+ mov sp, ip
+
+ adr ROR24_TABLE, .Lror24_table
+ adr ROR16_TABLE, .Lror16_table
+
+ mov ip, STATE
+ vld1.64 {q0-q1}, [ip]! // Load h[0..3]
+ vld1.64 {q2-q3}, [ip]! // Load h[4..7]
+.Lnext_block:
+ adr r10, .Lblake2b_IV
+ vld1.64 {q14-q15}, [ip] // Load t[0..1] and f[0..1]
+ vld1.64 {q4-q5}, [r10]! // Load IV[0..3]
+ vmov r7, r8, d28 // Copy t[0] to (r7, r8)
+ vld1.64 {q6-q7}, [r10] // Load IV[4..7]
+ adds r7, r7, INC // Increment counter
+ bcs .Lslow_inc_ctr
+ vmov.i32 d28[0], r7
+ vst1.64 {d28}, [ip] // Update t[0]
+.Linc_ctr_done:
+
+ // Load the next message block and finish initializing the state matrix
+ // 'v'. Fortunately, there are exactly enough NEON registers to fit the
+ // entire state matrix in q0-q7 and the entire message block in q8-15.
+ //
+ // However, _blake2b_round also needs some extra registers for rotates,
+ // so we have to spill some registers. It's better to spill the message
+ // registers than the state registers, as the message doesn't change.
+ // Therefore we store a copy of the first 32 bytes of the message block
+ // (q8-q9) in an aligned buffer on the stack so that they can be
+ // reloaded when needed. (We could just reload directly from the
+ // message buffer, but it's faster to use aligned loads.)
+ vld1.8 {q8-q9}, [BLOCK]!
+ veor q6, q6, q14 // v[12..13] = IV[4..5] ^ t[0..1]
+ vld1.8 {q10-q11}, [BLOCK]!
+ veor q7, q7, q15 // v[14..15] = IV[6..7] ^ f[0..1]
+ vld1.8 {q12-q13}, [BLOCK]!
+ vst1.8 {q8-q9}, [sp, :256]
+ mov ip, STATE
+ vld1.8 {q14-q15}, [BLOCK]!
+
+ // Execute the rounds. Each round is provided the order in which it
+ // needs to use the message words.
+ _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
+ _blake2b_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
+ _blake2b_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
+ _blake2b_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
+ _blake2b_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
+ _blake2b_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
+ _blake2b_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
+ _blake2b_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
+ _blake2b_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
+ _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
+ final=1
+
+ // Fold the final state matrix into the hash chaining value:
+ //
+ // for (i = 0; i < 8; i++)
+ // h[i] ^= v[i] ^ v[i + 8];
+ //
+ vld1.64 {q8-q9}, [ip]! // Load old h[0..3]
+ veor q0, q0, q4 // v[0..1] ^= v[8..9]
+ veor q1, q1, q5 // v[2..3] ^= v[10..11]
+ vld1.64 {q10-q11}, [ip] // Load old h[4..7]
+ veor q2, q2, q6 // v[4..5] ^= v[12..13]
+ veor q3, q3, q7 // v[6..7] ^= v[14..15]
+ veor q0, q0, q8 // v[0..1] ^= h[0..1]
+ veor q1, q1, q9 // v[2..3] ^= h[2..3]
+ mov ip, STATE
+ subs NBLOCKS, NBLOCKS, #1 // nblocks--
+ vst1.64 {q0-q1}, [ip]! // Store new h[0..3]
+ veor q2, q2, q10 // v[4..5] ^= h[4..5]
+ veor q3, q3, q11 // v[6..7] ^= h[6..7]
+ vst1.64 {q2-q3}, [ip]! // Store new h[4..7]
+
+ // Advance to the next block, if there is one.
+ bne .Lnext_block // nblocks != 0?
+
+ mov sp, ORIG_SP
+ pop {r4-r10}
+ mov pc, lr
+
+.Lslow_inc_ctr:
+ // Handle the case where the counter overflowed its low 32 bits, by
+ // carrying the overflow bit into the full 128-bit counter.
+ vmov r9, r10, d29
+ adcs r8, r8, #0
+ adcs r9, r9, #0
+ adc r10, r10, #0
+ vmov d28, r7, r8
+ vmov d29, r9, r10
+ vst1.64 {q14}, [ip] // Update t[0] and t[1]
+ b .Linc_ctr_done
+ENDPROC(blake2b_compress_neon)
diff --git a/arch/arm/crypto/blake2b-neon-glue.c b/arch/arm/crypto/blake2b-neon-glue.c
new file mode 100644
index 000000000000..4b59d027ba4a
--- /dev/null
+++ b/arch/arm/crypto/blake2b-neon-glue.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * BLAKE2b digest algorithm, NEON accelerated
+ *
+ * Copyright 2020 Google LLC
+ */
+
+#include <crypto/internal/blake2b.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+
+#include <linux/module.h>
+#include <linux/sizes.h>
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void blake2b_compress_neon(struct blake2b_state *state,
+ const u8 *block, size_t nblocks, u32 inc);
+
+static void blake2b_compress_arch(struct blake2b_state *state,
+ const u8 *block, size_t nblocks, u32 inc)
+{
+ if (!crypto_simd_usable()) {
+ blake2b_compress_generic(state, block, nblocks, inc);
+ return;
+ }
+
+ do {
+ const size_t blocks = min_t(size_t, nblocks,
+ SZ_4K / BLAKE2B_BLOCK_SIZE);
+
+ kernel_neon_begin();
+ blake2b_compress_neon(state, block, blocks, inc);
+ kernel_neon_end();
+
+ nblocks -= blocks;
+ block += blocks * BLAKE2B_BLOCK_SIZE;
+ } while (nblocks);
+}
+
+static int crypto_blake2b_update_neon(struct shash_desc *desc,
+ const u8 *in, unsigned int inlen)
+{
+ return crypto_blake2b_update(desc, in, inlen, blake2b_compress_arch);
+}
+
+static int crypto_blake2b_final_neon(struct shash_desc *desc, u8 *out)
+{
+ return crypto_blake2b_final(desc, out, blake2b_compress_arch);
+}
+
+#define BLAKE2B_ALG(name, driver_name, digest_size) \
+ { \
+ .base.cra_name = name, \
+ .base.cra_driver_name = driver_name, \
+ .base.cra_priority = 200, \
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
+ .base.cra_blocksize = BLAKE2B_BLOCK_SIZE, \
+ .base.cra_ctxsize = sizeof(struct blake2b_tfm_ctx), \
+ .base.cra_module = THIS_MODULE, \
+ .digestsize = digest_size, \
+ .setkey = crypto_blake2b_setkey, \
+ .init = crypto_blake2b_init, \
+ .update = crypto_blake2b_update_neon, \
+ .final = crypto_blake2b_final_neon, \
+ .descsize = sizeof(struct blake2b_state), \
+ }
+
+static struct shash_alg blake2b_neon_algs[] = {
+ BLAKE2B_ALG("blake2b-160", "blake2b-160-neon", BLAKE2B_160_HASH_SIZE),
+ BLAKE2B_ALG("blake2b-256", "blake2b-256-neon", BLAKE2B_256_HASH_SIZE),
+ BLAKE2B_ALG("blake2b-384", "blake2b-384-neon", BLAKE2B_384_HASH_SIZE),
+ BLAKE2B_ALG("blake2b-512", "blake2b-512-neon", BLAKE2B_512_HASH_SIZE),
+};
+
+static int __init blake2b_neon_mod_init(void)
+{
+ if (!(elf_hwcap & HWCAP_NEON))
+ return -ENODEV;
+
+ return crypto_register_shashes(blake2b_neon_algs,
+ ARRAY_SIZE(blake2b_neon_algs));
+}
+
+static void __exit blake2b_neon_mod_exit(void)
+{
+ crypto_unregister_shashes(blake2b_neon_algs,
+ ARRAY_SIZE(blake2b_neon_algs));
+}
+
+module_init(blake2b_neon_mod_init);
+module_exit(blake2b_neon_mod_exit);
+
+MODULE_DESCRIPTION("BLAKE2b digest algorithm, NEON accelerated");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("blake2b-160");
+MODULE_ALIAS_CRYPTO("blake2b-160-neon");
+MODULE_ALIAS_CRYPTO("blake2b-256");
+MODULE_ALIAS_CRYPTO("blake2b-256-neon");
+MODULE_ALIAS_CRYPTO("blake2b-384");
+MODULE_ALIAS_CRYPTO("blake2b-384-neon");
+MODULE_ALIAS_CRYPTO("blake2b-512");
+MODULE_ALIAS_CRYPTO("blake2b-512-neon");
diff --git a/arch/arm/crypto/blake2s-core.S b/arch/arm/crypto/blake2s-core.S
new file mode 100644
index 000000000000..df40e46601f1
--- /dev/null
+++ b/arch/arm/crypto/blake2s-core.S
@@ -0,0 +1,306 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2s digest algorithm, ARM scalar implementation
+ *
+ * Copyright 2020 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ // Registers used to hold message words temporarily. There aren't
+ // enough ARM registers to hold the whole message block, so we have to
+ // load the words on-demand.
+ M_0 .req r12
+ M_1 .req r14
+
+// The BLAKE2s initialization vector
+.Lblake2s_IV:
+ .word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+ .word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+
+.macro __ldrd a, b, src, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ ldrd \a, \b, [\src, #\offset]
+#else
+ ldr \a, [\src, #\offset]
+ ldr \b, [\src, #\offset + 4]
+#endif
+.endm
+
+.macro __strd a, b, dst, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ strd \a, \b, [\dst, #\offset]
+#else
+ str \a, [\dst, #\offset]
+ str \b, [\dst, #\offset + 4]
+#endif
+.endm
+
+.macro _le32_bswap a, tmp
+#ifdef __ARMEB__
+ rev_l \a, \tmp
+#endif
+.endm
+
+.macro _le32_bswap_8x a, b, c, d, e, f, g, h, tmp
+ _le32_bswap \a, \tmp
+ _le32_bswap \b, \tmp
+ _le32_bswap \c, \tmp
+ _le32_bswap \d, \tmp
+ _le32_bswap \e, \tmp
+ _le32_bswap \f, \tmp
+ _le32_bswap \g, \tmp
+ _le32_bswap \h, \tmp
+.endm
+
+// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
+// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
+// columns/diagonals. s0-s1 are the word offsets to the message words the first
+// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
+// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
+//
+// Note that to save instructions, the rotations don't happen when the
+// pseudocode says they should, but rather they are delayed until the values are
+// used. See the comment above _blake2s_round().
+.macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3
+
+ ldr M_0, [sp, #32 + 4 * \s0]
+ ldr M_1, [sp, #32 + 4 * \s2]
+
+ // a += b + m[blake2s_sigma[r][2*i + 0]];
+ add \a0, \a0, \b0, ror #brot
+ add \a1, \a1, \b1, ror #brot
+ add \a0, \a0, M_0
+ add \a1, \a1, M_1
+
+ // d = ror32(d ^ a, 16);
+ eor \d0, \a0, \d0, ror #drot
+ eor \d1, \a1, \d1, ror #drot
+
+ // c += d;
+ add \c0, \c0, \d0, ror #16
+ add \c1, \c1, \d1, ror #16
+
+ // b = ror32(b ^ c, 12);
+ eor \b0, \c0, \b0, ror #brot
+ eor \b1, \c1, \b1, ror #brot
+
+ ldr M_0, [sp, #32 + 4 * \s1]
+ ldr M_1, [sp, #32 + 4 * \s3]
+
+ // a += b + m[blake2s_sigma[r][2*i + 1]];
+ add \a0, \a0, \b0, ror #12
+ add \a1, \a1, \b1, ror #12
+ add \a0, \a0, M_0
+ add \a1, \a1, M_1
+
+ // d = ror32(d ^ a, 8);
+ eor \d0, \a0, \d0, ror#16
+ eor \d1, \a1, \d1, ror#16
+
+ // c += d;
+ add \c0, \c0, \d0, ror#8
+ add \c1, \c1, \d1, ror#8
+
+ // b = ror32(b ^ c, 7);
+ eor \b0, \c0, \b0, ror#12
+ eor \b1, \c1, \b1, ror#12
+.endm
+
+// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9]
+// are in r0..r9. The stack pointer points to 8 bytes of scratch space for
+// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and
+// r14 are free to use. The macro arguments s0-s15 give the order in which the
+// message words are used in this round.
+//
+// All rotates are performed using the implicit rotate operand accepted by the
+// 'add' and 'eor' instructions. This is faster than using explicit rotate
+// instructions. To make this work, we allow the values in the second and last
+// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
+// wrong rotation amount. The rotation amount is then fixed up just in time
+// when the values are used. 'brot' is the number of bits the values in row 'b'
+// need to be rotated right to arrive at the correct values, and 'drot'
+// similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
+// that they end up as (7, 8) after every round.
+.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \
+ s8, s9, s10, s11, s12, s13, s14, s15
+
+ // Mix first two columns:
+ // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
+ __ldrd r10, r11, sp, 16 // load v[12] and v[13]
+ _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \
+ \s0, \s1, \s2, \s3
+ __strd r8, r9, sp, 0
+ __strd r10, r11, sp, 16
+
+ // Mix second two columns:
+ // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
+ __ldrd r8, r9, sp, 8 // load v[10] and v[11]
+ __ldrd r10, r11, sp, 24 // load v[14] and v[15]
+ _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \
+ \s4, \s5, \s6, \s7
+ str r10, [sp, #24] // store v[14]
+ // v[10], v[11], and v[15] are used below, so no need to store them yet.
+
+ .set brot, 7
+ .set drot, 8
+
+ // Mix first two diagonals:
+ // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
+ ldr r10, [sp, #16] // load v[12]
+ _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \
+ \s8, \s9, \s10, \s11
+ __strd r8, r9, sp, 8
+ str r11, [sp, #28]
+ str r10, [sp, #16]
+
+ // Mix second two diagonals:
+ // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
+ __ldrd r8, r9, sp, 0 // load v[8] and v[9]
+ __ldrd r10, r11, sp, 20 // load v[13] and v[14]
+ _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \
+ \s12, \s13, \s14, \s15
+ __strd r10, r11, sp, 20
+.endm
+
+//
+// void blake2s_compress(struct blake2s_state *state,
+// const u8 *block, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_state are used:
+// u32 h[8]; (inout)
+// u32 t[2]; (inout)
+// u32 f[2]; (in)
+//
+ .align 5
+ENTRY(blake2s_compress)
+ push {r0-r2,r4-r11,lr} // keep this an even number
+
+.Lnext_block:
+ // r0 is 'state'
+ // r1 is 'block'
+ // r3 is 'inc'
+
+ // Load and increment the counter t[0..1].
+ __ldrd r10, r11, r0, 32
+ adds r10, r10, r3
+ adc r11, r11, #0
+ __strd r10, r11, r0, 32
+
+ // _blake2s_round is very short on registers, so copy the message block
+ // to the stack to save a register during the rounds. This also has the
+ // advantage that misalignment only needs to be dealt with in one place.
+ sub sp, sp, #64
+ mov r12, sp
+ tst r1, #3
+ bne .Lcopy_block_misaligned
+ ldmia r1!, {r2-r9}
+ _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
+ stmia r12!, {r2-r9}
+ ldmia r1!, {r2-r9}
+ _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
+ stmia r12, {r2-r9}
+.Lcopy_block_done:
+ str r1, [sp, #68] // Update message pointer
+
+ // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space
+ // for spilling v[8..9]. Leave v[8..9] in r8-r9.
+ mov r14, r0 // r14 = state
+ adr r12, .Lblake2s_IV
+ ldmia r12!, {r8-r9} // load IV[0..1]
+ __ldrd r0, r1, r14, 40 // load f[0..1]
+ ldm r12, {r2-r7} // load IV[3..7]
+ eor r4, r4, r10 // v[12] = IV[4] ^ t[0]
+ eor r5, r5, r11 // v[13] = IV[5] ^ t[1]
+ eor r6, r6, r0 // v[14] = IV[6] ^ f[0]
+ eor r7, r7, r1 // v[15] = IV[7] ^ f[1]
+ push {r2-r7} // push v[9..15]
+ sub sp, sp, #8 // leave space for v[8..9]
+
+ // Load h[0..7] == v[0..7].
+ ldm r14, {r0-r7}
+
+ // Execute the rounds. Each round is provided the order in which it
+ // needs to use the message words.
+ .set brot, 0
+ .set drot, 0
+ _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
+ _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
+ _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
+ _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
+ _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
+ _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
+ _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
+ _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
+ _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
+
+ // Fold the final state matrix into the hash chaining value:
+ //
+ // for (i = 0; i < 8; i++)
+ // h[i] ^= v[i] ^ v[i + 8];
+ //
+ ldr r14, [sp, #96] // r14 = &h[0]
+ add sp, sp, #8 // v[8..9] are already loaded.
+ pop {r10-r11} // load v[10..11]
+ eor r0, r0, r8
+ eor r1, r1, r9
+ eor r2, r2, r10
+ eor r3, r3, r11
+ ldm r14, {r8-r11} // load h[0..3]
+ eor r0, r0, r8
+ eor r1, r1, r9
+ eor r2, r2, r10
+ eor r3, r3, r11
+ stmia r14!, {r0-r3} // store new h[0..3]
+ ldm r14, {r0-r3} // load old h[4..7]
+ pop {r8-r11} // load v[12..15]
+ eor r0, r0, r4, ror #brot
+ eor r1, r1, r5, ror #brot
+ eor r2, r2, r6, ror #brot
+ eor r3, r3, r7, ror #brot
+ eor r0, r0, r8, ror #drot
+ eor r1, r1, r9, ror #drot
+ eor r2, r2, r10, ror #drot
+ eor r3, r3, r11, ror #drot
+ add sp, sp, #64 // skip copy of message block
+ stm r14, {r0-r3} // store new h[4..7]
+
+ // Advance to the next block, if there is one. Note that if there are
+ // multiple blocks, then 'inc' (the counter increment amount) must be
+ // 64. So we can simply set it to 64 without re-loading it.
+ ldm sp, {r0, r1, r2} // load (state, block, nblocks)
+ mov r3, #64 // set 'inc'
+ subs r2, r2, #1 // nblocks--
+ str r2, [sp, #8]
+ bne .Lnext_block // nblocks != 0?
+
+ pop {r0-r2,r4-r11,pc}
+
+ // The next message block (pointed to by r1) isn't 4-byte aligned, so it
+ // can't be loaded using ldmia. Copy it to the stack buffer (pointed to
+ // by r12) using an alternative method. r2-r9 are free to use.
+.Lcopy_block_misaligned:
+ mov r2, #64
+1:
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ ldr r3, [r1], #4
+ _le32_bswap r3, r4
+#else
+ ldrb r3, [r1, #0]
+ ldrb r4, [r1, #1]
+ ldrb r5, [r1, #2]
+ ldrb r6, [r1, #3]
+ add r1, r1, #4
+ orr r3, r3, r4, lsl #8
+ orr r3, r3, r5, lsl #16
+ orr r3, r3, r6, lsl #24
+#endif
+ subs r2, r2, #4
+ str r3, [r12], #4
+ bne 1b
+ b .Lcopy_block_done
+ENDPROC(blake2s_compress)
diff --git a/arch/arm/crypto/blake2s-glue.c b/arch/arm/crypto/blake2s-glue.c
new file mode 100644
index 000000000000..0238a70d9581
--- /dev/null
+++ b/arch/arm/crypto/blake2s-glue.c
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <crypto/internal/blake2s.h>
+#include <linux/module.h>
+
+/* defined in blake2s-core.S */
+EXPORT_SYMBOL(blake2s_compress);
diff --git a/arch/arm/crypto/chacha-glue.c b/arch/arm/crypto/chacha-glue.c
index 6fdb0ac62b3d..cdde8fd01f8f 100644
--- a/arch/arm/crypto/chacha-glue.c
+++ b/arch/arm/crypto/chacha-glue.c
@@ -23,7 +23,7 @@
asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
int nrounds);
asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
- int nrounds);
+ int nrounds, unsigned int nbytes);
asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
@@ -42,24 +42,25 @@ static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
{
u8 buf[CHACHA_BLOCK_SIZE];
- while (bytes >= CHACHA_BLOCK_SIZE * 4) {
- chacha_4block_xor_neon(state, dst, src, nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 4;
- src += CHACHA_BLOCK_SIZE * 4;
- dst += CHACHA_BLOCK_SIZE * 4;
- state[12] += 4;
- }
- while (bytes >= CHACHA_BLOCK_SIZE) {
- chacha_block_xor_neon(state, dst, src, nrounds);
- bytes -= CHACHA_BLOCK_SIZE;
- src += CHACHA_BLOCK_SIZE;
- dst += CHACHA_BLOCK_SIZE;
- state[12]++;
+ while (bytes > CHACHA_BLOCK_SIZE) {
+ unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
+
+ chacha_4block_xor_neon(state, dst, src, nrounds, l);
+ bytes -= l;
+ src += l;
+ dst += l;
+ state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
}
if (bytes) {
- memcpy(buf, src, bytes);
- chacha_block_xor_neon(state, buf, buf, nrounds);
- memcpy(dst, buf, bytes);
+ const u8 *s = src;
+ u8 *d = dst;
+
+ if (bytes != CHACHA_BLOCK_SIZE)
+ s = d = memcpy(buf, src, bytes);
+ chacha_block_xor_neon(state, d, s, nrounds);
+ if (d != dst)
+ memcpy(dst, buf, bytes);
+ state[12]++;
}
}
@@ -91,9 +92,17 @@ void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
return;
}
- kernel_neon_begin();
- chacha_doneon(state, dst, src, bytes, nrounds);
- kernel_neon_end();
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ kernel_neon_begin();
+ chacha_doneon(state, dst, src, todo, nrounds);
+ kernel_neon_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);
diff --git a/arch/arm/crypto/chacha-neon-core.S b/arch/arm/crypto/chacha-neon-core.S
index eb22926d4912..13d12f672656 100644
--- a/arch/arm/crypto/chacha-neon-core.S
+++ b/arch/arm/crypto/chacha-neon-core.S
@@ -47,6 +47,7 @@
*/
#include <linux/linkage.h>
+#include <asm/cache.h>
.text
.fpu neon
@@ -205,7 +206,7 @@ ENDPROC(hchacha_block_neon)
.align 5
ENTRY(chacha_4block_xor_neon)
- push {r4-r5}
+ push {r4, lr}
mov r4, sp // preserve the stack pointer
sub ip, sp, #0x20 // allocate a 32 byte buffer
bic ip, ip, #0x1f // aligned to 32 bytes
@@ -229,10 +230,10 @@ ENTRY(chacha_4block_xor_neon)
vld1.32 {q0-q1}, [r0]
vld1.32 {q2-q3}, [ip]
- adr r5, .Lctrinc
+ adr lr, .Lctrinc
vdup.32 q15, d7[1]
vdup.32 q14, d7[0]
- vld1.32 {q4}, [r5, :128]
+ vld1.32 {q4}, [lr, :128]
vdup.32 q13, d6[1]
vdup.32 q12, d6[0]
vdup.32 q11, d5[1]
@@ -455,7 +456,7 @@ ENTRY(chacha_4block_xor_neon)
// Re-interleave the words in the first two rows of each block (x0..7).
// Also add the counter values 0-3 to x12[0-3].
- vld1.32 {q8}, [r5, :128] // load counter values 0-3
+ vld1.32 {q8}, [lr, :128] // load counter values 0-3
vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1)
vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3)
vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5)
@@ -493,6 +494,8 @@ ENTRY(chacha_4block_xor_neon)
// Re-interleave the words in the last two rows of each block (x8..15).
vld1.32 {q8-q9}, [sp, :256]
+ mov sp, r4 // restore original stack pointer
+ ldr r4, [r4, #8] // load number of bytes
vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13)
vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15)
vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9)
@@ -520,41 +523,121 @@ ENTRY(chacha_4block_xor_neon)
// XOR the rest of the data with the keystream
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #96
veor q0, q0, q8
veor q1, q1, q12
+ ble .Lle96
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
veor q0, q0, q2
veor q1, q1, q6
+ ble .Lle128
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
veor q0, q0, q10
veor q1, q1, q14
+ ble .Lle160
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
veor q0, q0, q4
veor q1, q1, q5
+ ble .Lle192
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
veor q0, q0, q9
veor q1, q1, q13
+ ble .Lle224
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
veor q0, q0, q3
veor q1, q1, q7
+ blt .Llt256
+.Lout:
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]
- mov sp, r4 // restore original stack pointer
veor q0, q0, q11
veor q1, q1, q15
vst1.8 {q0-q1}, [r1]
- pop {r4-r5}
- bx lr
+ pop {r4, pc}
+
+.Lle192:
+ vmov q4, q9
+ vmov q5, q13
+
+.Lle160:
+ // nothing to do
+
+.Lfinalblock:
+ // Process the final block if processing less than 4 full blocks.
+ // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
+ // previous 32 byte output block that still needs to be written at
+ // [r1] in q0-q1.
+ beq .Lfullblock
+
+.Lpartialblock:
+ adr lr, .Lpermute + 32
+ add r2, r2, r4
+ add lr, lr, r4
+ add r4, r4, r1
+
+ vld1.8 {q2-q3}, [lr]
+ vld1.8 {q6-q7}, [r2]
+
+ add r4, r4, #32
+
+ vtbl.8 d4, {q4-q5}, d4
+ vtbl.8 d5, {q4-q5}, d5
+ vtbl.8 d6, {q4-q5}, d6
+ vtbl.8 d7, {q4-q5}, d7
+
+ veor q6, q6, q2
+ veor q7, q7, q3
+
+ vst1.8 {q6-q7}, [r4] // overlapping stores
+ vst1.8 {q0-q1}, [r1]
+ pop {r4, pc}
+
+.Lfullblock:
+ vmov q11, q4
+ vmov q15, q5
+ b .Lout
+.Lle96:
+ vmov q4, q2
+ vmov q5, q6
+ b .Lfinalblock
+.Lle128:
+ vmov q4, q10
+ vmov q5, q14
+ b .Lfinalblock
+.Lle224:
+ vmov q4, q3
+ vmov q5, q7
+ b .Lfinalblock
+.Llt256:
+ vmov q4, q11
+ vmov q5, q15
+ b .Lpartialblock
ENDPROC(chacha_4block_xor_neon)
+
+ .align L1_CACHE_SHIFT
+.Lpermute:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
diff --git a/arch/arm/crypto/chacha-scalar-core.S b/arch/arm/crypto/chacha-scalar-core.S
index 2985b80a45b5..083fe1ab96d0 100644
--- a/arch/arm/crypto/chacha-scalar-core.S
+++ b/arch/arm/crypto/chacha-scalar-core.S
@@ -41,32 +41,15 @@
X14 .req r12
X15 .req r14
-.macro __rev out, in, t0, t1, t2
-.if __LINUX_ARM_ARCH__ >= 6
- rev \out, \in
-.else
- lsl \t0, \in, #24
- and \t1, \in, #0xff00
- and \t2, \in, #0xff0000
- orr \out, \t0, \in, lsr #24
- orr \out, \out, \t1, lsl #8
- orr \out, \out, \t2, lsr #8
-.endif
-.endm
-
-.macro _le32_bswap x, t0, t1, t2
+.macro _le32_bswap_4x a, b, c, d, tmp
#ifdef __ARMEB__
- __rev \x, \x, \t0, \t1, \t2
+ rev_l \a, \tmp
+ rev_l \b, \tmp
+ rev_l \c, \tmp
+ rev_l \d, \tmp
#endif
.endm
-.macro _le32_bswap_4x a, b, c, d, t0, t1, t2
- _le32_bswap \a, \t0, \t1, \t2
- _le32_bswap \b, \t0, \t1, \t2
- _le32_bswap \c, \t0, \t1, \t2
- _le32_bswap \d, \t0, \t1, \t2
-.endm
-
.macro __ldrd a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
ldrd \a, \b, [\src, #\offset]
@@ -200,7 +183,7 @@
add X1, X1, r9
add X2, X2, r10
add X3, X3, r11
- _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
+ _le32_bswap_4x X0, X1, X2, X3, r8
ldmia r12!, {r8-r11}
eor X0, X0, r8
eor X1, X1, r9
@@ -216,7 +199,7 @@
ldmia r12!, {X0-X3}
add X6, r10, X6, ror #brot
add X7, r11, X7, ror #brot
- _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
+ _le32_bswap_4x X4, X5, X6, X7, r8
eor X4, X4, X0
eor X5, X5, X1
eor X6, X6, X2
@@ -231,7 +214,7 @@
add r1, r1, r9 // x9
add r6, r6, r10 // x10
add r7, r7, r11 // x11
- _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
+ _le32_bswap_4x r0, r1, r6, r7, r8
ldmia r12!, {r8-r11}
eor r0, r0, r8 // x8
eor r1, r1, r9 // x9
@@ -245,7 +228,7 @@
add r3, r9, r3, ror #drot // x13
add r4, r10, r4, ror #drot // x14
add r5, r11, r5, ror #drot // x15
- _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
+ _le32_bswap_4x r2, r3, r4, r5, r9
ldr r9, [sp, #72] // load LEN
eor r2, r2, r0 // x12
eor r3, r3, r1 // x13
@@ -301,7 +284,7 @@
add X1, X1, r9
add X2, X2, r10
add X3, X3, r11
- _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
+ _le32_bswap_4x X0, X1, X2, X3, r8
stmia r14!, {X0-X3}
// Save keystream for x4-x7
@@ -311,7 +294,7 @@
add X5, r9, X5, ror #brot
add X6, r10, X6, ror #brot
add X7, r11, X7, ror #brot
- _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
+ _le32_bswap_4x X4, X5, X6, X7, r8
add r8, sp, #64
stmia r14!, {X4-X7}
@@ -323,7 +306,7 @@
add r1, r1, r9 // x9
add r6, r6, r10 // x10
add r7, r7, r11 // x11
- _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
+ _le32_bswap_4x r0, r1, r6, r7, r8
stmia r14!, {r0,r1,r6,r7}
__ldrd r8, r9, sp, 144
__ldrd r10, r11, sp, 152
@@ -331,7 +314,7 @@
add r3, r9, r3, ror #drot // x13
add r4, r10, r4, ror #drot // x14
add r5, r11, r5, ror #drot // x15
- _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
+ _le32_bswap_4x r2, r3, r4, r5, r9
stmia r14, {r2-r5}
// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
diff --git a/arch/arm/crypto/crc32-ce-core.S b/arch/arm/crypto/crc32-ce-core.S
index 5cbd4a6fedad..3f13a76b9066 100644
--- a/arch/arm/crypto/crc32-ce-core.S
+++ b/arch/arm/crypto/crc32-ce-core.S
@@ -39,7 +39,7 @@
* CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
* PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
* at:
- * http://www.intel.com/products/processor/manuals/
+ * https://www.intel.com/products/processor/manuals/
* Intel(R) 64 and IA-32 Architectures Software Developer's Manual
* Volume 2B: Instruction Set Reference, N-Z
*
diff --git a/arch/arm/crypto/curve25519-core.S b/arch/arm/crypto/curve25519-core.S
index be18af52e7dc..b697fa5d059a 100644
--- a/arch/arm/crypto/curve25519-core.S
+++ b/arch/arm/crypto/curve25519-core.S
@@ -10,8 +10,8 @@
#include <linux/linkage.h>
.text
-.fpu neon
.arch armv7-a
+.fpu neon
.align 4
ENTRY(curve25519_neon)
diff --git a/arch/arm/crypto/curve25519-glue.c b/arch/arm/crypto/curve25519-glue.c
index 776ae07e0469..9bdafd57888c 100644
--- a/arch/arm/crypto/curve25519-glue.c
+++ b/arch/arm/crypto/curve25519-glue.c
@@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/jump_label.h>
+#include <linux/scatterlist.h>
#include <crypto/curve25519.h>
asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
@@ -111,7 +112,7 @@ static struct kpp_alg curve25519_alg = {
.max_size = curve25519_max_size,
};
-static int __init mod_init(void)
+static int __init arm_curve25519_init(void)
{
if (elf_hwcap & HWCAP_NEON) {
static_branch_enable(&have_neon);
@@ -121,14 +122,14 @@ static int __init mod_init(void)
return 0;
}
-static void __exit mod_exit(void)
+static void __exit arm_curve25519_exit(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && elf_hwcap & HWCAP_NEON)
crypto_unregister_kpp(&curve25519_alg);
}
-module_init(mod_init);
-module_exit(mod_exit);
+module_init(arm_curve25519_init);
+module_exit(arm_curve25519_exit);
MODULE_ALIAS_CRYPTO("curve25519");
MODULE_ALIAS_CRYPTO("curve25519-neon");
diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S
index 534c9647726d..9f51e3fa4526 100644
--- a/arch/arm/crypto/ghash-ce-core.S
+++ b/arch/arm/crypto/ghash-ce-core.S
@@ -8,6 +8,9 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
+ .arch armv8-a
+ .fpu crypto-neon-fp-armv8
+
SHASH .req q0
T1 .req q1
XL .req q2
@@ -88,8 +91,6 @@
T3_H .req d17
.text
- .arch armv8-a
- .fpu crypto-neon-fp-armv8
.macro __pmull_p64, rd, rn, rm, b1, b2, b3, b4
vmull.p64 \rd, \rn, \rm
diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c
index a00fd329255f..f13401f3e669 100644
--- a/arch/arm/crypto/ghash-ce-glue.c
+++ b/arch/arm/crypto/ghash-ce-glue.c
@@ -16,6 +16,7 @@
#include <crypto/gf128mul.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
+#include <linux/jump_label.h>
#include <linux/module.h>
MODULE_DESCRIPTION("GHASH hash function using ARMv8 Crypto Extensions");
@@ -27,12 +28,8 @@ MODULE_ALIAS_CRYPTO("ghash");
#define GHASH_DIGEST_SIZE 16
struct ghash_key {
- u64 h[2];
- u64 h2[2];
- u64 h3[2];
- u64 h4[2];
-
be128 k;
+ u64 h[][2];
};
struct ghash_desc_ctx {
@@ -46,16 +43,12 @@ struct ghash_async_ctx {
};
asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k,
- const char *head);
+ u64 const h[][2], const char *head);
asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k,
- const char *head);
+ u64 const h[][2], const char *head);
-static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k,
- const char *head);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_p64);
static int ghash_init(struct shash_desc *desc)
{
@@ -70,7 +63,10 @@ static void ghash_do_update(int blocks, u64 dg[], const char *src,
{
if (likely(crypto_simd_usable())) {
kernel_neon_begin();
- pmull_ghash_update(blocks, dg, src, key, head);
+ if (static_branch_likely(&use_p64))
+ pmull_ghash_update_p64(blocks, dg, src, key->h, head);
+ else
+ pmull_ghash_update_p8(blocks, dg, src, key->h, head);
kernel_neon_end();
} else {
be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };
@@ -161,25 +157,26 @@ static int ghash_setkey(struct crypto_shash *tfm,
const u8 *inkey, unsigned int keylen)
{
struct ghash_key *key = crypto_shash_ctx(tfm);
- be128 h;
if (keylen != GHASH_BLOCK_SIZE)
return -EINVAL;
/* needed for the fallback */
memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);
- ghash_reflect(key->h, &key->k);
+ ghash_reflect(key->h[0], &key->k);
- h = key->k;
- gf128mul_lle(&h, &key->k);
- ghash_reflect(key->h2, &h);
+ if (static_branch_likely(&use_p64)) {
+ be128 h = key->k;
- gf128mul_lle(&h, &key->k);
- ghash_reflect(key->h3, &h);
+ gf128mul_lle(&h, &key->k);
+ ghash_reflect(key->h[1], &h);
- gf128mul_lle(&h, &key->k);
- ghash_reflect(key->h4, &h);
+ gf128mul_lle(&h, &key->k);
+ ghash_reflect(key->h[2], &h);
+ gf128mul_lle(&h, &key->k);
+ ghash_reflect(key->h[3], &h);
+ }
return 0;
}
@@ -195,7 +192,7 @@ static struct shash_alg ghash_alg = {
.base.cra_driver_name = "ghash-ce-sync",
.base.cra_priority = 300 - 1,
.base.cra_blocksize = GHASH_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct ghash_key),
+ .base.cra_ctxsize = sizeof(struct ghash_key) + sizeof(u64[2]),
.base.cra_module = THIS_MODULE,
};
@@ -354,10 +351,10 @@ static int __init ghash_ce_mod_init(void)
if (!(elf_hwcap & HWCAP_NEON))
return -ENODEV;
- if (elf_hwcap2 & HWCAP2_PMULL)
- pmull_ghash_update = pmull_ghash_update_p64;
- else
- pmull_ghash_update = pmull_ghash_update_p8;
+ if (elf_hwcap2 & HWCAP2_PMULL) {
+ ghash_alg.base.cra_ctxsize += 3 * sizeof(u64[2]);
+ static_branch_enable(&use_p64);
+ }
err = crypto_register_shash(&ghash_alg);
if (err)
diff --git a/arch/arm/crypto/nhpoly1305-neon-glue.c b/arch/arm/crypto/nhpoly1305-neon-glue.c
index ae5aefc44a4d..ffa8d73fe722 100644
--- a/arch/arm/crypto/nhpoly1305-neon-glue.c
+++ b/arch/arm/crypto/nhpoly1305-neon-glue.c
@@ -30,7 +30,7 @@ static int nhpoly1305_neon_update(struct shash_desc *desc,
return crypto_nhpoly1305_update(desc, src, srclen);
do {
- unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+ unsigned int n = min_t(unsigned int, srclen, SZ_4K);
kernel_neon_begin();
crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon);
diff --git a/arch/arm/crypto/poly1305-core.S_shipped b/arch/arm/crypto/poly1305-core.S_shipped
deleted file mode 100644
index 37b71d990293..000000000000
--- a/arch/arm/crypto/poly1305-core.S_shipped
+++ /dev/null
@@ -1,1158 +0,0 @@
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
-# define poly1305_init poly1305_init_arm
-# define poly1305_blocks poly1305_blocks_arm
-# define poly1305_emit poly1305_emit_arm
-.globl poly1305_blocks_neon
-#endif
-
-#if defined(__thumb2__)
-.syntax unified
-.thumb
-#else
-.code 32
-#endif
-
-.text
-
-.globl poly1305_emit
-.globl poly1305_blocks
-.globl poly1305_init
-.type poly1305_init,%function
-.align 5
-poly1305_init:
-.Lpoly1305_init:
- stmdb sp!,{r4-r11}
-
- eor r3,r3,r3
- cmp r1,#0
- str r3,[r0,#0] @ zero hash value
- str r3,[r0,#4]
- str r3,[r0,#8]
- str r3,[r0,#12]
- str r3,[r0,#16]
- str r3,[r0,#36] @ clear is_base2_26
- add r0,r0,#20
-
-#ifdef __thumb2__
- it eq
-#endif
- moveq r0,#0
- beq .Lno_key
-
-#if __ARM_MAX_ARCH__>=7
- mov r3,#-1
- str r3,[r0,#28] @ impossible key power value
-# ifndef __KERNEL__
- adr r11,.Lpoly1305_init
- ldr r12,.LOPENSSL_armcap
-# endif
-#endif
- ldrb r4,[r1,#0]
- mov r10,#0x0fffffff
- ldrb r5,[r1,#1]
- and r3,r10,#-4 @ 0x0ffffffc
- ldrb r6,[r1,#2]
- ldrb r7,[r1,#3]
- orr r4,r4,r5,lsl#8
- ldrb r5,[r1,#4]
- orr r4,r4,r6,lsl#16
- ldrb r6,[r1,#5]
- orr r4,r4,r7,lsl#24
- ldrb r7,[r1,#6]
- and r4,r4,r10
-
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-# if !defined(_WIN32)
- ldr r12,[r11,r12] @ OPENSSL_armcap_P
-# endif
-# if defined(__APPLE__) || defined(_WIN32)
- ldr r12,[r12]
-# endif
-#endif
- ldrb r8,[r1,#7]
- orr r5,r5,r6,lsl#8
- ldrb r6,[r1,#8]
- orr r5,r5,r7,lsl#16
- ldrb r7,[r1,#9]
- orr r5,r5,r8,lsl#24
- ldrb r8,[r1,#10]
- and r5,r5,r3
-
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- tst r12,#ARMV7_NEON @ check for NEON
-# ifdef __thumb2__
- adr r9,.Lpoly1305_blocks_neon
- adr r11,.Lpoly1305_blocks
- it ne
- movne r11,r9
- adr r12,.Lpoly1305_emit
- orr r11,r11,#1 @ thumb-ify addresses
- orr r12,r12,#1
-# else
- add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
- ite eq
- addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
- addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
-# endif
-#endif
- ldrb r9,[r1,#11]
- orr r6,r6,r7,lsl#8
- ldrb r7,[r1,#12]
- orr r6,r6,r8,lsl#16
- ldrb r8,[r1,#13]
- orr r6,r6,r9,lsl#24
- ldrb r9,[r1,#14]
- and r6,r6,r3
-
- ldrb r10,[r1,#15]
- orr r7,r7,r8,lsl#8
- str r4,[r0,#0]
- orr r7,r7,r9,lsl#16
- str r5,[r0,#4]
- orr r7,r7,r10,lsl#24
- str r6,[r0,#8]
- and r7,r7,r3
- str r7,[r0,#12]
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- stmia r2,{r11,r12} @ fill functions table
- mov r0,#1
-#else
- mov r0,#0
-#endif
-.Lno_key:
- ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
- bx lr @ bx lr
-#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_init,.-poly1305_init
-.type poly1305_blocks,%function
-.align 5
-poly1305_blocks:
-.Lpoly1305_blocks:
- stmdb sp!,{r3-r11,lr}
-
- ands r2,r2,#-16
- beq .Lno_data
-
- add r2,r2,r1 @ end pointer
- sub sp,sp,#32
-
-#if __ARM_ARCH__<7
- ldmia r0,{r4-r12} @ load context
- add r0,r0,#20
- str r2,[sp,#16] @ offload stuff
- str r0,[sp,#12]
-#else
- ldr lr,[r0,#36] @ is_base2_26
- ldmia r0!,{r4-r8} @ load hash value
- str r2,[sp,#16] @ offload stuff
- str r0,[sp,#12]
-
- adds r9,r4,r5,lsl#26 @ base 2^26 -> base 2^32
- mov r10,r5,lsr#6
- adcs r10,r10,r6,lsl#20
- mov r11,r6,lsr#12
- adcs r11,r11,r7,lsl#14
- mov r12,r7,lsr#18
- adcs r12,r12,r8,lsl#8
- mov r2,#0
- teq lr,#0
- str r2,[r0,#16] @ clear is_base2_26
- adc r2,r2,r8,lsr#24
-
- itttt ne
- movne r4,r9 @ choose between radixes
- movne r5,r10
- movne r6,r11
- movne r7,r12
- ldmia r0,{r9-r12} @ load key
- it ne
- movne r8,r2
-#endif
-
- mov lr,r1
- cmp r3,#0
- str r10,[sp,#20]
- str r11,[sp,#24]
- str r12,[sp,#28]
- b .Loop
-
-.align 4
-.Loop:
-#if __ARM_ARCH__<7
- ldrb r0,[lr],#16 @ load input
-# ifdef __thumb2__
- it hi
-# endif
- addhi r8,r8,#1 @ 1<<128
- ldrb r1,[lr,#-15]
- ldrb r2,[lr,#-14]
- ldrb r3,[lr,#-13]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-12]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-11]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-10]
- adds r4,r4,r3 @ accumulate input
-
- ldrb r3,[lr,#-9]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-8]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-7]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-6]
- adcs r5,r5,r3
-
- ldrb r3,[lr,#-5]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-4]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-3]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-2]
- adcs r6,r6,r3
-
- ldrb r3,[lr,#-1]
- orr r1,r0,r1,lsl#8
- str lr,[sp,#8] @ offload input pointer
- orr r2,r1,r2,lsl#16
- add r10,r10,r10,lsr#2
- orr r3,r2,r3,lsl#24
-#else
- ldr r0,[lr],#16 @ load input
- it hi
- addhi r8,r8,#1 @ padbit
- ldr r1,[lr,#-12]
- ldr r2,[lr,#-8]
- ldr r3,[lr,#-4]
-# ifdef __ARMEB__
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
-# endif
- adds r4,r4,r0 @ accumulate input
- str lr,[sp,#8] @ offload input pointer
- adcs r5,r5,r1
- add r10,r10,r10,lsr#2
- adcs r6,r6,r2
-#endif
- add r11,r11,r11,lsr#2
- adcs r7,r7,r3
- add r12,r12,r12,lsr#2
-
- umull r2,r3,r5,r9
- adc r8,r8,#0
- umull r0,r1,r4,r9
- umlal r2,r3,r8,r10
- umlal r0,r1,r7,r10
- ldr r10,[sp,#20] @ reload r10
- umlal r2,r3,r6,r12
- umlal r0,r1,r5,r12
- umlal r2,r3,r7,r11
- umlal r0,r1,r6,r11
- umlal r2,r3,r4,r10
- str r0,[sp,#0] @ future r4
- mul r0,r11,r8
- ldr r11,[sp,#24] @ reload r11
- adds r2,r2,r1 @ d1+=d0>>32
- eor r1,r1,r1
- adc lr,r3,#0 @ future r6
- str r2,[sp,#4] @ future r5
-
- mul r2,r12,r8
- eor r3,r3,r3
- umlal r0,r1,r7,r12
- ldr r12,[sp,#28] @ reload r12
- umlal r2,r3,r7,r9
- umlal r0,r1,r6,r9
- umlal r2,r3,r6,r10
- umlal r0,r1,r5,r10
- umlal r2,r3,r5,r11
- umlal r0,r1,r4,r11
- umlal r2,r3,r4,r12
- ldr r4,[sp,#0]
- mul r8,r9,r8
- ldr r5,[sp,#4]
-
- adds r6,lr,r0 @ d2+=d1>>32
- ldr lr,[sp,#8] @ reload input pointer
- adc r1,r1,#0
- adds r7,r2,r1 @ d3+=d2>>32
- ldr r0,[sp,#16] @ reload end pointer
- adc r3,r3,#0
- add r8,r8,r3 @ h4+=d3>>32
-
- and r1,r8,#-4
- and r8,r8,#3
- add r1,r1,r1,lsr#2 @ *=5
- adds r4,r4,r1
- adcs r5,r5,#0
- adcs r6,r6,#0
- adcs r7,r7,#0
- adc r8,r8,#0
-
- cmp r0,lr @ done yet?
- bhi .Loop
-
- ldr r0,[sp,#12]
- add sp,sp,#32
- stmdb r0,{r4-r8} @ store the result
-
-.Lno_data:
-#if __ARM_ARCH__>=5
- ldmia sp!,{r3-r11,pc}
-#else
- ldmia sp!,{r3-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_blocks,.-poly1305_blocks
-.type poly1305_emit,%function
-.align 5
-poly1305_emit:
-.Lpoly1305_emit:
- stmdb sp!,{r4-r11}
-
- ldmia r0,{r3-r7}
-
-#if __ARM_ARCH__>=7
- ldr ip,[r0,#36] @ is_base2_26
-
- adds r8,r3,r4,lsl#26 @ base 2^26 -> base 2^32
- mov r9,r4,lsr#6
- adcs r9,r9,r5,lsl#20
- mov r10,r5,lsr#12
- adcs r10,r10,r6,lsl#14
- mov r11,r6,lsr#18
- adcs r11,r11,r7,lsl#8
- mov r0,#0
- adc r0,r0,r7,lsr#24
-
- tst ip,ip
- itttt ne
- movne r3,r8
- movne r4,r9
- movne r5,r10
- movne r6,r11
- it ne
- movne r7,r0
-#endif
-
- adds r8,r3,#5 @ compare to modulus
- adcs r9,r4,#0
- adcs r10,r5,#0
- adcs r11,r6,#0
- adc r0,r7,#0
- tst r0,#4 @ did it carry/borrow?
-
-#ifdef __thumb2__
- it ne
-#endif
- movne r3,r8
- ldr r8,[r2,#0]
-#ifdef __thumb2__
- it ne
-#endif
- movne r4,r9
- ldr r9,[r2,#4]
-#ifdef __thumb2__
- it ne
-#endif
- movne r5,r10
- ldr r10,[r2,#8]
-#ifdef __thumb2__
- it ne
-#endif
- movne r6,r11
- ldr r11,[r2,#12]
-
- adds r3,r3,r8
- adcs r4,r4,r9
- adcs r5,r5,r10
- adc r6,r6,r11
-
-#if __ARM_ARCH__>=7
-# ifdef __ARMEB__
- rev r3,r3
- rev r4,r4
- rev r5,r5
- rev r6,r6
-# endif
- str r3,[r1,#0]
- str r4,[r1,#4]
- str r5,[r1,#8]
- str r6,[r1,#12]
-#else
- strb r3,[r1,#0]
- mov r3,r3,lsr#8
- strb r4,[r1,#4]
- mov r4,r4,lsr#8
- strb r5,[r1,#8]
- mov r5,r5,lsr#8
- strb r6,[r1,#12]
- mov r6,r6,lsr#8
-
- strb r3,[r1,#1]
- mov r3,r3,lsr#8
- strb r4,[r1,#5]
- mov r4,r4,lsr#8
- strb r5,[r1,#9]
- mov r5,r5,lsr#8
- strb r6,[r1,#13]
- mov r6,r6,lsr#8
-
- strb r3,[r1,#2]
- mov r3,r3,lsr#8
- strb r4,[r1,#6]
- mov r4,r4,lsr#8
- strb r5,[r1,#10]
- mov r5,r5,lsr#8
- strb r6,[r1,#14]
- mov r6,r6,lsr#8
-
- strb r3,[r1,#3]
- strb r4,[r1,#7]
- strb r5,[r1,#11]
- strb r6,[r1,#15]
-#endif
- ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
- bx lr @ bx lr
-#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_emit,.-poly1305_emit
-#if __ARM_MAX_ARCH__>=7
-.fpu neon
-
-.type poly1305_init_neon,%function
-.align 5
-poly1305_init_neon:
-.Lpoly1305_init_neon:
- ldr r3,[r0,#48] @ first table element
- cmp r3,#-1 @ is value impossible?
- bne .Lno_init_neon
-
- ldr r4,[r0,#20] @ load key base 2^32
- ldr r5,[r0,#24]
- ldr r6,[r0,#28]
- ldr r7,[r0,#32]
-
- and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
- mov r3,r4,lsr#26
- mov r4,r5,lsr#20
- orr r3,r3,r5,lsl#6
- mov r5,r6,lsr#14
- orr r4,r4,r6,lsl#12
- mov r6,r7,lsr#8
- orr r5,r5,r7,lsl#18
- and r3,r3,#0x03ffffff
- and r4,r4,#0x03ffffff
- and r5,r5,#0x03ffffff
-
- vdup.32 d0,r2 @ r^1 in both lanes
- add r2,r3,r3,lsl#2 @ *5
- vdup.32 d1,r3
- add r3,r4,r4,lsl#2
- vdup.32 d2,r2
- vdup.32 d3,r4
- add r4,r5,r5,lsl#2
- vdup.32 d4,r3
- vdup.32 d5,r5
- add r5,r6,r6,lsl#2
- vdup.32 d6,r4
- vdup.32 d7,r6
- vdup.32 d8,r5
-
- mov r5,#2 @ counter
-
-.Lsquare_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
-
- vmull.u32 q5,d0,d0[1]
- vmull.u32 q6,d1,d0[1]
- vmull.u32 q7,d3,d0[1]
- vmull.u32 q8,d5,d0[1]
- vmull.u32 q9,d7,d0[1]
-
- vmlal.u32 q5,d7,d2[1]
- vmlal.u32 q6,d0,d1[1]
- vmlal.u32 q7,d1,d1[1]
- vmlal.u32 q8,d3,d1[1]
- vmlal.u32 q9,d5,d1[1]
-
- vmlal.u32 q5,d5,d4[1]
- vmlal.u32 q6,d7,d4[1]
- vmlal.u32 q8,d1,d3[1]
- vmlal.u32 q7,d0,d3[1]
- vmlal.u32 q9,d3,d3[1]
-
- vmlal.u32 q5,d3,d6[1]
- vmlal.u32 q8,d0,d5[1]
- vmlal.u32 q6,d5,d6[1]
- vmlal.u32 q7,d7,d6[1]
- vmlal.u32 q9,d1,d5[1]
-
- vmlal.u32 q8,d7,d8[1]
- vmlal.u32 q5,d1,d8[1]
- vmlal.u32 q6,d3,d8[1]
- vmlal.u32 q7,d5,d8[1]
- vmlal.u32 q9,d0,d7[1]
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- @ and P. Schwabe
- @
- @ H0>>+H1>>+H2>>+H3>>+H4
- @ H3>>+H4>>*5+H0>>+H1
- @
- @ Trivia.
- @
- @ Result of multiplication of n-bit number by m-bit number is
- @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
- @ m-bit number multiplied by 2^n is still n+m bits wide.
- @
- @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
- @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
- @ one is n+1 bits wide.
- @
- @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
- @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
- @ can be 27. However! In cases when their width exceeds 26 bits
- @ they are limited by 2^26+2^6. This in turn means that *sum*
- @ of the products with these values can still be viewed as sum
- @ of 52-bit numbers as long as the amount of addends is not a
- @ power of 2. For example,
- @
- @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
- @
- @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
- @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
- @ 8 * (2^52) or 2^55. However, the value is then multiplied by
- @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
- @ which is less than 32 * (2^52) or 2^57. And when processing
- @ data we are looking at triple as many addends...
- @
- @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
- @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
- @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
- @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
- @ instruction accepts 2x32-bit input and writes 2x64-bit result.
- @ This means that result of reduction have to be compressed upon
- @ loop wrap-around. This can be done in the process of reduction
- @ to minimize amount of instructions [as well as amount of
- @ 128-bit instructions, which benefits low-end processors], but
- @ one has to watch for H2 (which is narrower than H0) and 5*H4
- @ not being wider than 58 bits, so that result of right shift
- @ by 26 bits fits in 32 bits. This is also useful on x86,
- @ because it allows to use paddd in place for paddq, which
- @ benefits Atom, where paddq is ridiculously slow.
-
- vshr.u64 q15,q8,#26
- vmovn.i64 d16,q8
- vshr.u64 q4,q5,#26
- vmovn.i64 d10,q5
- vadd.i64 q9,q9,q15 @ h3 -> h4
- vbic.i32 d16,#0xfc000000 @ &=0x03ffffff
- vadd.i64 q6,q6,q4 @ h0 -> h1
- vbic.i32 d10,#0xfc000000
-
- vshrn.u64 d30,q9,#26
- vmovn.i64 d18,q9
- vshr.u64 q4,q6,#26
- vmovn.i64 d12,q6
- vadd.i64 q7,q7,q4 @ h1 -> h2
- vbic.i32 d18,#0xfc000000
- vbic.i32 d12,#0xfc000000
-
- vadd.i32 d10,d10,d30
- vshl.u32 d30,d30,#2
- vshrn.u64 d8,q7,#26
- vmovn.i64 d14,q7
- vadd.i32 d10,d10,d30 @ h4 -> h0
- vadd.i32 d16,d16,d8 @ h2 -> h3
- vbic.i32 d14,#0xfc000000
-
- vshr.u32 d30,d10,#26
- vbic.i32 d10,#0xfc000000
- vshr.u32 d8,d16,#26
- vbic.i32 d16,#0xfc000000
- vadd.i32 d12,d12,d30 @ h0 -> h1
- vadd.i32 d18,d18,d8 @ h3 -> h4
-
- subs r5,r5,#1
- beq .Lsquare_break_neon
-
- add r6,r0,#(48+0*9*4)
- add r7,r0,#(48+1*9*4)
-
- vtrn.32 d0,d10 @ r^2:r^1
- vtrn.32 d3,d14
- vtrn.32 d5,d16
- vtrn.32 d1,d12
- vtrn.32 d7,d18
-
- vshl.u32 d4,d3,#2 @ *5
- vshl.u32 d6,d5,#2
- vshl.u32 d2,d1,#2
- vshl.u32 d8,d7,#2
- vadd.i32 d4,d4,d3
- vadd.i32 d2,d2,d1
- vadd.i32 d6,d6,d5
- vadd.i32 d8,d8,d7
-
- vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
- vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
- vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
- vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
- vst1.32 {d8[0]},[r6,:32]
- vst1.32 {d8[1]},[r7,:32]
-
- b .Lsquare_neon
-
-.align 4
-.Lsquare_break_neon:
- add r6,r0,#(48+2*4*9)
- add r7,r0,#(48+3*4*9)
-
- vmov d0,d10 @ r^4:r^3
- vshl.u32 d2,d12,#2 @ *5
- vmov d1,d12
- vshl.u32 d4,d14,#2
- vmov d3,d14
- vshl.u32 d6,d16,#2
- vmov d5,d16
- vshl.u32 d8,d18,#2
- vmov d7,d18
- vadd.i32 d2,d2,d12
- vadd.i32 d4,d4,d14
- vadd.i32 d6,d6,d16
- vadd.i32 d8,d8,d18
-
- vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
- vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
- vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
- vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
- vst1.32 {d8[0]},[r6]
- vst1.32 {d8[1]},[r7]
-
-.Lno_init_neon:
- bx lr @ bx lr
-.size poly1305_init_neon,.-poly1305_init_neon
-
-.type poly1305_blocks_neon,%function
-.align 5
-poly1305_blocks_neon:
-.Lpoly1305_blocks_neon:
- ldr ip,[r0,#36] @ is_base2_26
-
- cmp r2,#64
- blo .Lpoly1305_blocks
-
- stmdb sp!,{r4-r7}
- vstmdb sp!,{d8-d15} @ ABI specification says so
-
- tst ip,ip @ is_base2_26?
- bne .Lbase2_26_neon
-
- stmdb sp!,{r1-r3,lr}
- bl .Lpoly1305_init_neon
-
- ldr r4,[r0,#0] @ load hash value base 2^32
- ldr r5,[r0,#4]
- ldr r6,[r0,#8]
- ldr r7,[r0,#12]
- ldr ip,[r0,#16]
-
- and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
- mov r3,r4,lsr#26
- veor d10,d10,d10
- mov r4,r5,lsr#20
- orr r3,r3,r5,lsl#6
- veor d12,d12,d12
- mov r5,r6,lsr#14
- orr r4,r4,r6,lsl#12
- veor d14,d14,d14
- mov r6,r7,lsr#8
- orr r5,r5,r7,lsl#18
- veor d16,d16,d16
- and r3,r3,#0x03ffffff
- orr r6,r6,ip,lsl#24
- veor d18,d18,d18
- and r4,r4,#0x03ffffff
- mov r1,#1
- and r5,r5,#0x03ffffff
- str r1,[r0,#36] @ set is_base2_26
-
- vmov.32 d10[0],r2
- vmov.32 d12[0],r3
- vmov.32 d14[0],r4
- vmov.32 d16[0],r5
- vmov.32 d18[0],r6
- adr r5,.Lzeros
-
- ldmia sp!,{r1-r3,lr}
- b .Lhash_loaded
-
-.align 4
-.Lbase2_26_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ load hash value
-
- veor d10,d10,d10
- veor d12,d12,d12
- veor d14,d14,d14
- veor d16,d16,d16
- veor d18,d18,d18
- vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
- adr r5,.Lzeros
- vld1.32 {d18[0]},[r0]
- sub r0,r0,#16 @ rewind
-
-.Lhash_loaded:
- add r4,r1,#32
- mov r3,r3,lsl#24
- tst r2,#31
- beq .Leven
-
- vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
- vmov.32 d28[0],r3
- sub r2,r2,#16
- add r4,r1,#32
-
-# ifdef __ARMEB__
- vrev32.8 q10,q10
- vrev32.8 q13,q13
- vrev32.8 q11,q11
- vrev32.8 q12,q12
-# endif
- vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26
- vshl.u32 d26,d26,#18
-
- vsri.u32 d26,d24,#14
- vshl.u32 d24,d24,#12
- vadd.i32 d29,d28,d18 @ add hash value and move to #hi
-
- vbic.i32 d26,#0xfc000000
- vsri.u32 d24,d22,#20
- vshl.u32 d22,d22,#6
-
- vbic.i32 d24,#0xfc000000
- vsri.u32 d22,d20,#26
- vadd.i32 d27,d26,d16
-
- vbic.i32 d20,#0xfc000000
- vbic.i32 d22,#0xfc000000
- vadd.i32 d25,d24,d14
-
- vadd.i32 d21,d20,d10
- vadd.i32 d23,d22,d12
-
- mov r7,r5
- add r6,r0,#48
-
- cmp r2,r2
- b .Long_tail
-
-.align 4
-.Leven:
- subs r2,r2,#64
- it lo
- movlo r4,r5
-
- vmov.i32 q14,#1<<24 @ padbit, yes, always
- vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
- add r1,r1,#64
- vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
- add r4,r4,#64
- itt hi
- addhi r7,r0,#(48+1*9*4)
- addhi r6,r0,#(48+3*9*4)
-
-# ifdef __ARMEB__
- vrev32.8 q10,q10
- vrev32.8 q13,q13
- vrev32.8 q11,q11
- vrev32.8 q12,q12
-# endif
- vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
- vshl.u32 q13,q13,#18
-
- vsri.u32 q13,q12,#14
- vshl.u32 q12,q12,#12
-
- vbic.i32 q13,#0xfc000000
- vsri.u32 q12,q11,#20
- vshl.u32 q11,q11,#6
-
- vbic.i32 q12,#0xfc000000
- vsri.u32 q11,q10,#26
-
- vbic.i32 q10,#0xfc000000
- vbic.i32 q11,#0xfc000000
-
- bls .Lskip_loop
-
- vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
- vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
- vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
- vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
- b .Loop_neon
-
-.align 5
-.Loop_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- @ ___________________/
- @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- @ ___________________/ ____________________/
- @
- @ Note that we start with inp[2:3]*r^2. This is because it
- @ doesn't depend on reduction in previous iteration.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ inp[2:3]*r^2
-
- vadd.i32 d24,d24,d14 @ accumulate inp[0:1]
- vmull.u32 q7,d25,d0[1]
- vadd.i32 d20,d20,d10
- vmull.u32 q5,d21,d0[1]
- vadd.i32 d26,d26,d16
- vmull.u32 q8,d27,d0[1]
- vmlal.u32 q7,d23,d1[1]
- vadd.i32 d22,d22,d12
- vmull.u32 q6,d23,d0[1]
-
- vadd.i32 d28,d28,d18
- vmull.u32 q9,d29,d0[1]
- subs r2,r2,#64
- vmlal.u32 q5,d29,d2[1]
- it lo
- movlo r4,r5
- vmlal.u32 q8,d25,d1[1]
- vld1.32 d8[1],[r7,:32]
- vmlal.u32 q6,d21,d1[1]
- vmlal.u32 q9,d27,d1[1]
-
- vmlal.u32 q5,d27,d4[1]
- vmlal.u32 q8,d23,d3[1]
- vmlal.u32 q9,d25,d3[1]
- vmlal.u32 q6,d29,d4[1]
- vmlal.u32 q7,d21,d3[1]
-
- vmlal.u32 q8,d21,d5[1]
- vmlal.u32 q5,d25,d6[1]
- vmlal.u32 q9,d23,d5[1]
- vmlal.u32 q6,d27,d6[1]
- vmlal.u32 q7,d29,d6[1]
-
- vmlal.u32 q8,d29,d8[1]
- vmlal.u32 q5,d23,d8[1]
- vmlal.u32 q9,d21,d7[1]
- vmlal.u32 q6,d25,d8[1]
- vmlal.u32 q7,d27,d8[1]
-
- vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
- add r4,r4,#64
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ (hash+inp[0:1])*r^4 and accumulate
-
- vmlal.u32 q8,d26,d0[0]
- vmlal.u32 q5,d20,d0[0]
- vmlal.u32 q9,d28,d0[0]
- vmlal.u32 q6,d22,d0[0]
- vmlal.u32 q7,d24,d0[0]
- vld1.32 d8[0],[r6,:32]
-
- vmlal.u32 q8,d24,d1[0]
- vmlal.u32 q5,d28,d2[0]
- vmlal.u32 q9,d26,d1[0]
- vmlal.u32 q6,d20,d1[0]
- vmlal.u32 q7,d22,d1[0]
-
- vmlal.u32 q8,d22,d3[0]
- vmlal.u32 q5,d26,d4[0]
- vmlal.u32 q9,d24,d3[0]
- vmlal.u32 q6,d28,d4[0]
- vmlal.u32 q7,d20,d3[0]
-
- vmlal.u32 q8,d20,d5[0]
- vmlal.u32 q5,d24,d6[0]
- vmlal.u32 q9,d22,d5[0]
- vmlal.u32 q6,d26,d6[0]
- vmlal.u32 q8,d28,d8[0]
-
- vmlal.u32 q7,d28,d6[0]
- vmlal.u32 q5,d22,d8[0]
- vmlal.u32 q9,d20,d7[0]
- vmov.i32 q14,#1<<24 @ padbit, yes, always
- vmlal.u32 q6,d24,d8[0]
- vmlal.u32 q7,d26,d8[0]
-
- vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
- add r1,r1,#64
-# ifdef __ARMEB__
- vrev32.8 q10,q10
- vrev32.8 q11,q11
- vrev32.8 q12,q12
- vrev32.8 q13,q13
-# endif
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction interleaved with base 2^32 -> base 2^26 of
- @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
-
- vshr.u64 q15,q8,#26
- vmovn.i64 d16,q8
- vshr.u64 q4,q5,#26
- vmovn.i64 d10,q5
- vadd.i64 q9,q9,q15 @ h3 -> h4
- vbic.i32 d16,#0xfc000000
- vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
- vadd.i64 q6,q6,q4 @ h0 -> h1
- vshl.u32 q13,q13,#18
- vbic.i32 d10,#0xfc000000
-
- vshrn.u64 d30,q9,#26
- vmovn.i64 d18,q9
- vshr.u64 q4,q6,#26
- vmovn.i64 d12,q6
- vadd.i64 q7,q7,q4 @ h1 -> h2
- vsri.u32 q13,q12,#14
- vbic.i32 d18,#0xfc000000
- vshl.u32 q12,q12,#12
- vbic.i32 d12,#0xfc000000
-
- vadd.i32 d10,d10,d30
- vshl.u32 d30,d30,#2
- vbic.i32 q13,#0xfc000000
- vshrn.u64 d8,q7,#26
- vmovn.i64 d14,q7
- vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec]
- vsri.u32 q12,q11,#20
- vadd.i32 d16,d16,d8 @ h2 -> h3
- vshl.u32 q11,q11,#6
- vbic.i32 d14,#0xfc000000
- vbic.i32 q12,#0xfc000000
-
- vshrn.u64 d30,q5,#26 @ re-narrow
- vmovn.i64 d10,q5
- vsri.u32 q11,q10,#26
- vbic.i32 q10,#0xfc000000
- vshr.u32 d8,d16,#26
- vbic.i32 d16,#0xfc000000
- vbic.i32 d10,#0xfc000000
- vadd.i32 d12,d12,d30 @ h0 -> h1
- vadd.i32 d18,d18,d8 @ h3 -> h4
- vbic.i32 q11,#0xfc000000
-
- bhi .Loop_neon
-
-.Lskip_loop:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- add r7,r0,#(48+0*9*4)
- add r6,r0,#(48+1*9*4)
- adds r2,r2,#32
- it ne
- movne r2,#0
- bne .Long_tail
-
- vadd.i32 d25,d24,d14 @ add hash value and move to #hi
- vadd.i32 d21,d20,d10
- vadd.i32 d27,d26,d16
- vadd.i32 d23,d22,d12
- vadd.i32 d29,d28,d18
-
-.Long_tail:
- vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
- vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2
-
- vadd.i32 d24,d24,d14 @ can be redundant
- vmull.u32 q7,d25,d0
- vadd.i32 d20,d20,d10
- vmull.u32 q5,d21,d0
- vadd.i32 d26,d26,d16
- vmull.u32 q8,d27,d0
- vadd.i32 d22,d22,d12
- vmull.u32 q6,d23,d0
- vadd.i32 d28,d28,d18
- vmull.u32 q9,d29,d0
-
- vmlal.u32 q5,d29,d2
- vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
- vmlal.u32 q8,d25,d1
- vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
- vmlal.u32 q6,d21,d1
- vmlal.u32 q9,d27,d1
- vmlal.u32 q7,d23,d1
-
- vmlal.u32 q8,d23,d3
- vld1.32 d8[1],[r7,:32]
- vmlal.u32 q5,d27,d4
- vld1.32 d8[0],[r6,:32]
- vmlal.u32 q9,d25,d3
- vmlal.u32 q6,d29,d4
- vmlal.u32 q7,d21,d3
-
- vmlal.u32 q8,d21,d5
- it ne
- addne r7,r0,#(48+2*9*4)
- vmlal.u32 q5,d25,d6
- it ne
- addne r6,r0,#(48+3*9*4)
- vmlal.u32 q9,d23,d5
- vmlal.u32 q6,d27,d6
- vmlal.u32 q7,d29,d6
-
- vmlal.u32 q8,d29,d8
- vorn q0,q0,q0 @ all-ones, can be redundant
- vmlal.u32 q5,d23,d8
- vshr.u64 q0,q0,#38
- vmlal.u32 q9,d21,d7
- vmlal.u32 q6,d25,d8
- vmlal.u32 q7,d27,d8
-
- beq .Lshort_tail
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ (hash+inp[0:1])*r^4:r^3 and accumulate
-
- vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
- vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
-
- vmlal.u32 q7,d24,d0
- vmlal.u32 q5,d20,d0
- vmlal.u32 q8,d26,d0
- vmlal.u32 q6,d22,d0
- vmlal.u32 q9,d28,d0
-
- vmlal.u32 q5,d28,d2
- vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
- vmlal.u32 q8,d24,d1
- vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
- vmlal.u32 q6,d20,d1
- vmlal.u32 q9,d26,d1
- vmlal.u32 q7,d22,d1
-
- vmlal.u32 q8,d22,d3
- vld1.32 d8[1],[r7,:32]
- vmlal.u32 q5,d26,d4
- vld1.32 d8[0],[r6,:32]
- vmlal.u32 q9,d24,d3
- vmlal.u32 q6,d28,d4
- vmlal.u32 q7,d20,d3
-
- vmlal.u32 q8,d20,d5
- vmlal.u32 q5,d24,d6
- vmlal.u32 q9,d22,d5
- vmlal.u32 q6,d26,d6
- vmlal.u32 q7,d28,d6
-
- vmlal.u32 q8,d28,d8
- vorn q0,q0,q0 @ all-ones
- vmlal.u32 q5,d22,d8
- vshr.u64 q0,q0,#38
- vmlal.u32 q9,d20,d7
- vmlal.u32 q6,d24,d8
- vmlal.u32 q7,d26,d8
-
-.Lshort_tail:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ horizontal addition
-
- vadd.i64 d16,d16,d17
- vadd.i64 d10,d10,d11
- vadd.i64 d18,d18,d19
- vadd.i64 d12,d12,d13
- vadd.i64 d14,d14,d15
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction, but without narrowing
-
- vshr.u64 q15,q8,#26
- vand.i64 q8,q8,q0
- vshr.u64 q4,q5,#26
- vand.i64 q5,q5,q0
- vadd.i64 q9,q9,q15 @ h3 -> h4
- vadd.i64 q6,q6,q4 @ h0 -> h1
-
- vshr.u64 q15,q9,#26
- vand.i64 q9,q9,q0
- vshr.u64 q4,q6,#26
- vand.i64 q6,q6,q0
- vadd.i64 q7,q7,q4 @ h1 -> h2
-
- vadd.i64 q5,q5,q15
- vshl.u64 q15,q15,#2
- vshr.u64 q4,q7,#26
- vand.i64 q7,q7,q0
- vadd.i64 q5,q5,q15 @ h4 -> h0
- vadd.i64 q8,q8,q4 @ h2 -> h3
-
- vshr.u64 q15,q5,#26
- vand.i64 q5,q5,q0
- vshr.u64 q4,q8,#26
- vand.i64 q8,q8,q0
- vadd.i64 q6,q6,q15 @ h0 -> h1
- vadd.i64 q9,q9,q4 @ h3 -> h4
-
- cmp r2,#0
- bne .Leven
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ store hash value
-
- vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
- vst1.32 {d18[0]},[r0]
-
- vldmia sp!,{d8-d15} @ epilogue
- ldmia sp!,{r4-r7}
- bx lr @ bx lr
-.size poly1305_blocks_neon,.-poly1305_blocks_neon
-
-.align 5
-.Lzeros:
-.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-#ifndef __KERNEL__
-.LOPENSSL_armcap:
-# ifdef _WIN32
-.word OPENSSL_armcap_P
-# else
-.word OPENSSL_armcap_P-.Lpoly1305_init
-# endif
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
-#endif
-.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by @dot-asm"
-.align 2
diff --git a/arch/arm/crypto/poly1305-glue.c b/arch/arm/crypto/poly1305-glue.c
index ceec04ec2f40..c31bd8f7c092 100644
--- a/arch/arm/crypto/poly1305-glue.c
+++ b/arch/arm/crypto/poly1305-glue.c
@@ -20,6 +20,7 @@
void poly1305_init_arm(void *state, const u8 *key);
void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
+void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
@@ -28,7 +29,7 @@ void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
{
poly1305_init_arm(&dctx->h, key);
dctx->s[0] = get_unaligned_le32(key + 16);
@@ -160,13 +161,20 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
if (static_branch_likely(&have_neon) && do_neon) {
- kernel_neon_begin();
- poly1305_blocks_neon(&dctx->h, src, len, 1);
- kernel_neon_end();
+ do {
+ unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+ kernel_neon_begin();
+ poly1305_blocks_neon(&dctx->h, src, todo, 1);
+ kernel_neon_end();
+
+ len -= todo;
+ src += todo;
+ } while (len);
} else {
poly1305_blocks_arm(&dctx->h, src, len, 1);
+ src += len;
}
- src += len;
nbytes %= POLY1305_BLOCK_SIZE;
}
diff --git a/arch/arm/crypto/sha1-armv4-large.S b/arch/arm/crypto/sha1-armv4-large.S
index f82cd8cf5a09..1c8b685149f2 100644
--- a/arch/arm/crypto/sha1-armv4-large.S
+++ b/arch/arm/crypto/sha1-armv4-large.S
@@ -13,7 +13,7 @@
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
+@ details see https://www.openssl.org/~appro/cryptogams/.
@ ====================================================================
@ sha1_block procedure for ARMv4.
diff --git a/arch/arm/crypto/sha1-ce-glue.c b/arch/arm/crypto/sha1-ce-glue.c
index e79b1fb4b4dc..de9100c67b37 100644
--- a/arch/arm/crypto/sha1-ce-glue.c
+++ b/arch/arm/crypto/sha1-ce-glue.c
@@ -7,7 +7,7 @@
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
-#include <crypto/sha.h>
+#include <crypto/sha1.h>
#include <crypto/sha1_base.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
diff --git a/arch/arm/crypto/sha1.h b/arch/arm/crypto/sha1.h
index 758db3e9ff0a..b1b7e21da2c3 100644
--- a/arch/arm/crypto/sha1.h
+++ b/arch/arm/crypto/sha1.h
@@ -3,7 +3,7 @@
#define ASM_ARM_CRYPTO_SHA1_H
#include <linux/crypto.h>
-#include <crypto/sha.h>
+#include <crypto/sha1.h>
extern int sha1_update_arm(struct shash_desc *desc, const u8 *data,
unsigned int len);
diff --git a/arch/arm/crypto/sha1_glue.c b/arch/arm/crypto/sha1_glue.c
index c80b0ebfd02f..6c2b849e459d 100644
--- a/arch/arm/crypto/sha1_glue.c
+++ b/arch/arm/crypto/sha1_glue.c
@@ -14,9 +14,8 @@
#include <crypto/internal/hash.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/cryptohash.h>
#include <linux/types.h>
-#include <crypto/sha.h>
+#include <crypto/sha1.h>
#include <crypto/sha1_base.h>
#include <asm/byteorder.h>
diff --git a/arch/arm/crypto/sha1_neon_glue.c b/arch/arm/crypto/sha1_neon_glue.c
index 2c3627334335..cfe36ae0f3f5 100644
--- a/arch/arm/crypto/sha1_neon_glue.c
+++ b/arch/arm/crypto/sha1_neon_glue.c
@@ -18,9 +18,8 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
-#include <linux/cryptohash.h>
#include <linux/types.h>
-#include <crypto/sha.h>
+#include <crypto/sha1.h>
#include <crypto/sha1_base.h>
#include <asm/neon.h>
#include <asm/simd.h>
diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
index 87f0b62386c6..c62ce89dd3e0 100644
--- a/arch/arm/crypto/sha2-ce-glue.c
+++ b/arch/arm/crypto/sha2-ce-glue.c
@@ -7,7 +7,7 @@
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <crypto/sha256_base.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
index a03cf4dfb781..f3a2b54efd4e 100644
--- a/arch/arm/crypto/sha256-armv4.pl
+++ b/arch/arm/crypto/sha256-armv4.pl
@@ -13,7 +13,7 @@
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
+# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.
@@ -175,7 +175,6 @@ $code=<<___;
#else
.syntax unified
# ifdef __thumb2__
-# define adrl adr
.thumb
# else
.code 32
@@ -471,7 +470,8 @@ sha256_block_data_order_neon:
stmdb sp!,{r4-r12,lr}
sub $H,sp,#16*4+16
- adrl $Ktbl,K256
+ adr $Ktbl,.Lsha256_block_data_order
+ sub $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
mov sp,$H @ alloca
diff --git a/arch/arm/crypto/sha256-core.S_shipped b/arch/arm/crypto/sha256-core.S_shipped
deleted file mode 100644
index 054aae0edfce..000000000000
--- a/arch/arm/crypto/sha256-core.S_shipped
+++ /dev/null
@@ -1,2816 +0,0 @@
-@ SPDX-License-Identifier: GPL-2.0
-
-@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
-@ has relicensed it under the GPLv2. Therefore this program is free software;
-@ you can redistribute it and/or modify it under the terms of the GNU General
-@ Public License version 2 as published by the Free Software Foundation.
-@
-@ The original headers, including the original license headers, are
-@ included below for completeness.
-
-@ ====================================================================
-@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
-@ ====================================================================
-
-@ SHA256 block procedure for ARMv4. May 2007.
-
-@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
-@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
-@ byte [on single-issue Xscale PXA250 core].
-
-@ July 2010.
-@
-@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
-@ Cortex A8 core and ~20 cycles per processed byte.
-
-@ February 2011.
-@
-@ Profiler-assisted and platform-specific optimization resulted in 16%
-@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
-
-@ September 2013.
-@
-@ Add NEON implementation. On Cortex A8 it was measured to process one
-@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
-@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
-@ code (meaning that latter performs sub-optimally, nothing was done
-@ about it).
-
-@ May 2014.
-@
-@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
-
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-#endif
-
-.text
-#if __ARM_ARCH__<7
-.code 32
-#else
-.syntax unified
-# ifdef __thumb2__
-# define adrl adr
-.thumb
-# else
-.code 32
-# endif
-#endif
-
-.type K256,%object
-.align 5
-K256:
-.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.size K256,.-K256
-.word 0 @ terminator
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha256_block_data_order
-#endif
-.align 5
-
-.global sha256_block_data_order
-.type sha256_block_data_order,%function
-sha256_block_data_order:
-.Lsha256_block_data_order:
-#if __ARM_ARCH__<7
- sub r3,pc,#8 @ sha256_block_data_order
-#else
- adr r3,.Lsha256_block_data_order
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
- tst r12,#ARMV8_SHA256
- bne .LARMv8
- tst r12,#ARMV7_NEON
- bne .LNEON
-#endif
- add r2,r1,r2,lsl#6 @ len to point at the end of inp
- stmdb sp!,{r0,r1,r2,r4-r11,lr}
- ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
- sub r14,r3,#256+32 @ K256
- sub sp,sp,#16*4 @ alloca(X[16])
-.Loop:
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r5,r6 @ magic
- eor r12,r12,r12
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 0
-# if 0==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r8,r8,ror#5
- add r4,r4,r12 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r8,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 0
- add r4,r4,r12 @ h+=Maj(a,b,c) from the past
- ldrb r12,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r12,lsl#8
- ldrb r12,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 0==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r8,r8,ror#5
- orr r2,r2,r12,lsl#24
- eor r0,r0,r8,ror#19 @ Sigma1(e)
-#endif
- ldr r12,[r14],#4 @ *K256++
- add r11,r11,r2 @ h+=X[i]
- str r2,[sp,#0*4]
- eor r2,r9,r10
- add r11,r11,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r8
- add r11,r11,r12 @ h+=K256[i]
- eor r2,r2,r10 @ Ch(e,f,g)
- eor r0,r4,r4,ror#11
- add r11,r11,r2 @ h+=Ch(e,f,g)
-#if 0==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 0<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r4,r5 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#2*4] @ from future BODY_16_xx
- eor r12,r4,r5 @ a^b, b^c in next round
- ldr r1,[sp,#15*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r4,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r7,r7,r11 @ d+=h
- eor r3,r3,r5 @ Maj(a,b,c)
- add r11,r11,r0,ror#2 @ h+=Sigma0(a)
- @ add r11,r11,r3 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 1
-# if 1==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r7,r7,ror#5
- add r11,r11,r3 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r7,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 1
- add r11,r11,r3 @ h+=Maj(a,b,c) from the past
- ldrb r3,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r3,lsl#8
- ldrb r3,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 1==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r7,r7,ror#5
- orr r2,r2,r3,lsl#24
- eor r0,r0,r7,ror#19 @ Sigma1(e)
-#endif
- ldr r3,[r14],#4 @ *K256++
- add r10,r10,r2 @ h+=X[i]
- str r2,[sp,#1*4]
- eor r2,r8,r9
- add r10,r10,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r7
- add r10,r10,r3 @ h+=K256[i]
- eor r2,r2,r9 @ Ch(e,f,g)
- eor r0,r11,r11,ror#11
- add r10,r10,r2 @ h+=Ch(e,f,g)
-#if 1==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 1<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r11,r4 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#3*4] @ from future BODY_16_xx
- eor r3,r11,r4 @ a^b, b^c in next round
- ldr r1,[sp,#0*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r11,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r6,r6,r10 @ d+=h
- eor r12,r12,r4 @ Maj(a,b,c)
- add r10,r10,r0,ror#2 @ h+=Sigma0(a)
- @ add r10,r10,r12 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 2
-# if 2==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r6,r6,ror#5
- add r10,r10,r12 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r6,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 2
- add r10,r10,r12 @ h+=Maj(a,b,c) from the past
- ldrb r12,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r12,lsl#8
- ldrb r12,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 2==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r6,r6,ror#5
- orr r2,r2,r12,lsl#24
- eor r0,r0,r6,ror#19 @ Sigma1(e)
-#endif
- ldr r12,[r14],#4 @ *K256++
- add r9,r9,r2 @ h+=X[i]
- str r2,[sp,#2*4]
- eor r2,r7,r8
- add r9,r9,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r6
- add r9,r9,r12 @ h+=K256[i]
- eor r2,r2,r8 @ Ch(e,f,g)
- eor r0,r10,r10,ror#11
- add r9,r9,r2 @ h+=Ch(e,f,g)
-#if 2==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 2<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r10,r11 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#4*4] @ from future BODY_16_xx
- eor r12,r10,r11 @ a^b, b^c in next round
- ldr r1,[sp,#1*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r10,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r5,r5,r9 @ d+=h
- eor r3,r3,r11 @ Maj(a,b,c)
- add r9,r9,r0,ror#2 @ h+=Sigma0(a)
- @ add r9,r9,r3 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 3
-# if 3==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r5,r5,ror#5
- add r9,r9,r3 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r5,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 3
- add r9,r9,r3 @ h+=Maj(a,b,c) from the past
- ldrb r3,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r3,lsl#8
- ldrb r3,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 3==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r5,r5,ror#5
- orr r2,r2,r3,lsl#24
- eor r0,r0,r5,ror#19 @ Sigma1(e)
-#endif
- ldr r3,[r14],#4 @ *K256++
- add r8,r8,r2 @ h+=X[i]
- str r2,[sp,#3*4]
- eor r2,r6,r7
- add r8,r8,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r5
- add r8,r8,r3 @ h+=K256[i]
- eor r2,r2,r7 @ Ch(e,f,g)
- eor r0,r9,r9,ror#11
- add r8,r8,r2 @ h+=Ch(e,f,g)
-#if 3==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 3<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r9,r10 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#5*4] @ from future BODY_16_xx
- eor r3,r9,r10 @ a^b, b^c in next round
- ldr r1,[sp,#2*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r9,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r4,r4,r8 @ d+=h
- eor r12,r12,r10 @ Maj(a,b,c)
- add r8,r8,r0,ror#2 @ h+=Sigma0(a)
- @ add r8,r8,r12 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 4
-# if 4==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r4,r4,ror#5
- add r8,r8,r12 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r4,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 4
- add r8,r8,r12 @ h+=Maj(a,b,c) from the past
- ldrb r12,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r12,lsl#8
- ldrb r12,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 4==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r4,r4,ror#5
- orr r2,r2,r12,lsl#24
- eor r0,r0,r4,ror#19 @ Sigma1(e)
-#endif
- ldr r12,[r14],#4 @ *K256++
- add r7,r7,r2 @ h+=X[i]
- str r2,[sp,#4*4]
- eor r2,r5,r6
- add r7,r7,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r4
- add r7,r7,r12 @ h+=K256[i]
- eor r2,r2,r6 @ Ch(e,f,g)
- eor r0,r8,r8,ror#11
- add r7,r7,r2 @ h+=Ch(e,f,g)
-#if 4==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 4<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r8,r9 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#6*4] @ from future BODY_16_xx
- eor r12,r8,r9 @ a^b, b^c in next round
- ldr r1,[sp,#3*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r8,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r11,r11,r7 @ d+=h
- eor r3,r3,r9 @ Maj(a,b,c)
- add r7,r7,r0,ror#2 @ h+=Sigma0(a)
- @ add r7,r7,r3 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 5
-# if 5==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r11,r11,ror#5
- add r7,r7,r3 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r11,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 5
- add r7,r7,r3 @ h+=Maj(a,b,c) from the past
- ldrb r3,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r3,lsl#8
- ldrb r3,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 5==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r11,r11,ror#5
- orr r2,r2,r3,lsl#24
- eor r0,r0,r11,ror#19 @ Sigma1(e)
-#endif
- ldr r3,[r14],#4 @ *K256++
- add r6,r6,r2 @ h+=X[i]
- str r2,[sp,#5*4]
- eor r2,r4,r5
- add r6,r6,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r11
- add r6,r6,r3 @ h+=K256[i]
- eor r2,r2,r5 @ Ch(e,f,g)
- eor r0,r7,r7,ror#11
- add r6,r6,r2 @ h+=Ch(e,f,g)
-#if 5==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 5<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r7,r8 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#7*4] @ from future BODY_16_xx
- eor r3,r7,r8 @ a^b, b^c in next round
- ldr r1,[sp,#4*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r7,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r10,r10,r6 @ d+=h
- eor r12,r12,r8 @ Maj(a,b,c)
- add r6,r6,r0,ror#2 @ h+=Sigma0(a)
- @ add r6,r6,r12 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 6
-# if 6==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r10,r10,ror#5
- add r6,r6,r12 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r10,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 6
- add r6,r6,r12 @ h+=Maj(a,b,c) from the past
- ldrb r12,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r12,lsl#8
- ldrb r12,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 6==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r10,r10,ror#5
- orr r2,r2,r12,lsl#24
- eor r0,r0,r10,ror#19 @ Sigma1(e)
-#endif
- ldr r12,[r14],#4 @ *K256++
- add r5,r5,r2 @ h+=X[i]
- str r2,[sp,#6*4]
- eor r2,r11,r4
- add r5,r5,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r10
- add r5,r5,r12 @ h+=K256[i]
- eor r2,r2,r4 @ Ch(e,f,g)
- eor r0,r6,r6,ror#11
- add r5,r5,r2 @ h+=Ch(e,f,g)
-#if 6==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 6<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r6,r7 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#8*4] @ from future BODY_16_xx
- eor r12,r6,r7 @ a^b, b^c in next round
- ldr r1,[sp,#5*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r6,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r9,r9,r5 @ d+=h
- eor r3,r3,r7 @ Maj(a,b,c)
- add r5,r5,r0,ror#2 @ h+=Sigma0(a)
- @ add r5,r5,r3 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 7
-# if 7==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r9,r9,ror#5
- add r5,r5,r3 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r9,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 7
- add r5,r5,r3 @ h+=Maj(a,b,c) from the past
- ldrb r3,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r3,lsl#8
- ldrb r3,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 7==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r9,r9,ror#5
- orr r2,r2,r3,lsl#24
- eor r0,r0,r9,ror#19 @ Sigma1(e)
-#endif
- ldr r3,[r14],#4 @ *K256++
- add r4,r4,r2 @ h+=X[i]
- str r2,[sp,#7*4]
- eor r2,r10,r11
- add r4,r4,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r9
- add r4,r4,r3 @ h+=K256[i]
- eor r2,r2,r11 @ Ch(e,f,g)
- eor r0,r5,r5,ror#11
- add r4,r4,r2 @ h+=Ch(e,f,g)
-#if 7==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 7<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r5,r6 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#9*4] @ from future BODY_16_xx
- eor r3,r5,r6 @ a^b, b^c in next round
- ldr r1,[sp,#6*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r5,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r8,r8,r4 @ d+=h
- eor r12,r12,r6 @ Maj(a,b,c)
- add r4,r4,r0,ror#2 @ h+=Sigma0(a)
- @ add r4,r4,r12 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 8
-# if 8==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r8,r8,ror#5
- add r4,r4,r12 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r8,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 8
- add r4,r4,r12 @ h+=Maj(a,b,c) from the past
- ldrb r12,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r12,lsl#8
- ldrb r12,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 8==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r8,r8,ror#5
- orr r2,r2,r12,lsl#24
- eor r0,r0,r8,ror#19 @ Sigma1(e)
-#endif
- ldr r12,[r14],#4 @ *K256++
- add r11,r11,r2 @ h+=X[i]
- str r2,[sp,#8*4]
- eor r2,r9,r10
- add r11,r11,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r8
- add r11,r11,r12 @ h+=K256[i]
- eor r2,r2,r10 @ Ch(e,f,g)
- eor r0,r4,r4,ror#11
- add r11,r11,r2 @ h+=Ch(e,f,g)
-#if 8==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 8<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r4,r5 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#10*4] @ from future BODY_16_xx
- eor r12,r4,r5 @ a^b, b^c in next round
- ldr r1,[sp,#7*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r4,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r7,r7,r11 @ d+=h
- eor r3,r3,r5 @ Maj(a,b,c)
- add r11,r11,r0,ror#2 @ h+=Sigma0(a)
- @ add r11,r11,r3 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 9
-# if 9==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r7,r7,ror#5
- add r11,r11,r3 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r7,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 9
- add r11,r11,r3 @ h+=Maj(a,b,c) from the past
- ldrb r3,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r3,lsl#8
- ldrb r3,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 9==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r7,r7,ror#5
- orr r2,r2,r3,lsl#24
- eor r0,r0,r7,ror#19 @ Sigma1(e)
-#endif
- ldr r3,[r14],#4 @ *K256++
- add r10,r10,r2 @ h+=X[i]
- str r2,[sp,#9*4]
- eor r2,r8,r9
- add r10,r10,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r7
- add r10,r10,r3 @ h+=K256[i]
- eor r2,r2,r9 @ Ch(e,f,g)
- eor r0,r11,r11,ror#11
- add r10,r10,r2 @ h+=Ch(e,f,g)
-#if 9==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 9<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r11,r4 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#11*4] @ from future BODY_16_xx
- eor r3,r11,r4 @ a^b, b^c in next round
- ldr r1,[sp,#8*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r11,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r6,r6,r10 @ d+=h
- eor r12,r12,r4 @ Maj(a,b,c)
- add r10,r10,r0,ror#2 @ h+=Sigma0(a)
- @ add r10,r10,r12 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 10
-# if 10==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r6,r6,ror#5
- add r10,r10,r12 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r6,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 10
- add r10,r10,r12 @ h+=Maj(a,b,c) from the past
- ldrb r12,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r12,lsl#8
- ldrb r12,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 10==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r6,r6,ror#5
- orr r2,r2,r12,lsl#24
- eor r0,r0,r6,ror#19 @ Sigma1(e)
-#endif
- ldr r12,[r14],#4 @ *K256++
- add r9,r9,r2 @ h+=X[i]
- str r2,[sp,#10*4]
- eor r2,r7,r8
- add r9,r9,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r6
- add r9,r9,r12 @ h+=K256[i]
- eor r2,r2,r8 @ Ch(e,f,g)
- eor r0,r10,r10,ror#11
- add r9,r9,r2 @ h+=Ch(e,f,g)
-#if 10==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 10<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r10,r11 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#12*4] @ from future BODY_16_xx
- eor r12,r10,r11 @ a^b, b^c in next round
- ldr r1,[sp,#9*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r10,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r5,r5,r9 @ d+=h
- eor r3,r3,r11 @ Maj(a,b,c)
- add r9,r9,r0,ror#2 @ h+=Sigma0(a)
- @ add r9,r9,r3 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 11
-# if 11==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r5,r5,ror#5
- add r9,r9,r3 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r5,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 11
- add r9,r9,r3 @ h+=Maj(a,b,c) from the past
- ldrb r3,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r3,lsl#8
- ldrb r3,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 11==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r5,r5,ror#5
- orr r2,r2,r3,lsl#24
- eor r0,r0,r5,ror#19 @ Sigma1(e)
-#endif
- ldr r3,[r14],#4 @ *K256++
- add r8,r8,r2 @ h+=X[i]
- str r2,[sp,#11*4]
- eor r2,r6,r7
- add r8,r8,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r5
- add r8,r8,r3 @ h+=K256[i]
- eor r2,r2,r7 @ Ch(e,f,g)
- eor r0,r9,r9,ror#11
- add r8,r8,r2 @ h+=Ch(e,f,g)
-#if 11==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 11<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r9,r10 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#13*4] @ from future BODY_16_xx
- eor r3,r9,r10 @ a^b, b^c in next round
- ldr r1,[sp,#10*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r9,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r4,r4,r8 @ d+=h
- eor r12,r12,r10 @ Maj(a,b,c)
- add r8,r8,r0,ror#2 @ h+=Sigma0(a)
- @ add r8,r8,r12 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 12
-# if 12==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r4,r4,ror#5
- add r8,r8,r12 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r4,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 12
- add r8,r8,r12 @ h+=Maj(a,b,c) from the past
- ldrb r12,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r12,lsl#8
- ldrb r12,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 12==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r4,r4,ror#5
- orr r2,r2,r12,lsl#24
- eor r0,r0,r4,ror#19 @ Sigma1(e)
-#endif
- ldr r12,[r14],#4 @ *K256++
- add r7,r7,r2 @ h+=X[i]
- str r2,[sp,#12*4]
- eor r2,r5,r6
- add r7,r7,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r4
- add r7,r7,r12 @ h+=K256[i]
- eor r2,r2,r6 @ Ch(e,f,g)
- eor r0,r8,r8,ror#11
- add r7,r7,r2 @ h+=Ch(e,f,g)
-#if 12==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 12<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r8,r9 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#14*4] @ from future BODY_16_xx
- eor r12,r8,r9 @ a^b, b^c in next round
- ldr r1,[sp,#11*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r8,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r11,r11,r7 @ d+=h
- eor r3,r3,r9 @ Maj(a,b,c)
- add r7,r7,r0,ror#2 @ h+=Sigma0(a)
- @ add r7,r7,r3 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 13
-# if 13==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r11,r11,ror#5
- add r7,r7,r3 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r11,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 13
- add r7,r7,r3 @ h+=Maj(a,b,c) from the past
- ldrb r3,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r3,lsl#8
- ldrb r3,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 13==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r11,r11,ror#5
- orr r2,r2,r3,lsl#24
- eor r0,r0,r11,ror#19 @ Sigma1(e)
-#endif
- ldr r3,[r14],#4 @ *K256++
- add r6,r6,r2 @ h+=X[i]
- str r2,[sp,#13*4]
- eor r2,r4,r5
- add r6,r6,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r11
- add r6,r6,r3 @ h+=K256[i]
- eor r2,r2,r5 @ Ch(e,f,g)
- eor r0,r7,r7,ror#11
- add r6,r6,r2 @ h+=Ch(e,f,g)
-#if 13==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 13<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r7,r8 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#15*4] @ from future BODY_16_xx
- eor r3,r7,r8 @ a^b, b^c in next round
- ldr r1,[sp,#12*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r7,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r10,r10,r6 @ d+=h
- eor r12,r12,r8 @ Maj(a,b,c)
- add r6,r6,r0,ror#2 @ h+=Sigma0(a)
- @ add r6,r6,r12 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 14
-# if 14==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r10,r10,ror#5
- add r6,r6,r12 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r10,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 14
- add r6,r6,r12 @ h+=Maj(a,b,c) from the past
- ldrb r12,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r12,lsl#8
- ldrb r12,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 14==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r10,r10,ror#5
- orr r2,r2,r12,lsl#24
- eor r0,r0,r10,ror#19 @ Sigma1(e)
-#endif
- ldr r12,[r14],#4 @ *K256++
- add r5,r5,r2 @ h+=X[i]
- str r2,[sp,#14*4]
- eor r2,r11,r4
- add r5,r5,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r10
- add r5,r5,r12 @ h+=K256[i]
- eor r2,r2,r4 @ Ch(e,f,g)
- eor r0,r6,r6,ror#11
- add r5,r5,r2 @ h+=Ch(e,f,g)
-#if 14==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 14<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r6,r7 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#0*4] @ from future BODY_16_xx
- eor r12,r6,r7 @ a^b, b^c in next round
- ldr r1,[sp,#13*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r6,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r9,r9,r5 @ d+=h
- eor r3,r3,r7 @ Maj(a,b,c)
- add r5,r5,r0,ror#2 @ h+=Sigma0(a)
- @ add r5,r5,r3 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 15
-# if 15==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r9,r9,ror#5
- add r5,r5,r3 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r9,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 15
- add r5,r5,r3 @ h+=Maj(a,b,c) from the past
- ldrb r3,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r3,lsl#8
- ldrb r3,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 15==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r9,r9,ror#5
- orr r2,r2,r3,lsl#24
- eor r0,r0,r9,ror#19 @ Sigma1(e)
-#endif
- ldr r3,[r14],#4 @ *K256++
- add r4,r4,r2 @ h+=X[i]
- str r2,[sp,#15*4]
- eor r2,r10,r11
- add r4,r4,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r9
- add r4,r4,r3 @ h+=K256[i]
- eor r2,r2,r11 @ Ch(e,f,g)
- eor r0,r5,r5,ror#11
- add r4,r4,r2 @ h+=Ch(e,f,g)
-#if 15==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 15<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r5,r6 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#1*4] @ from future BODY_16_xx
- eor r3,r5,r6 @ a^b, b^c in next round
- ldr r1,[sp,#14*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r5,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r8,r8,r4 @ d+=h
- eor r12,r12,r6 @ Maj(a,b,c)
- add r4,r4,r0,ror#2 @ h+=Sigma0(a)
- @ add r4,r4,r12 @ h+=Maj(a,b,c)
-.Lrounds_16_xx:
- @ ldr r2,[sp,#1*4] @ 16
- @ ldr r1,[sp,#14*4]
- mov r0,r2,ror#7
- add r4,r4,r12 @ h+=Maj(a,b,c) from the past
- mov r12,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r12,r12,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#0*4]
- eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#9*4]
-
- add r12,r12,r0
- eor r0,r8,r8,ror#5 @ from BODY_00_15
- add r2,r2,r12
- eor r0,r0,r8,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r12,[r14],#4 @ *K256++
- add r11,r11,r2 @ h+=X[i]
- str r2,[sp,#0*4]
- eor r2,r9,r10
- add r11,r11,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r8
- add r11,r11,r12 @ h+=K256[i]
- eor r2,r2,r10 @ Ch(e,f,g)
- eor r0,r4,r4,ror#11
- add r11,r11,r2 @ h+=Ch(e,f,g)
-#if 16==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 16<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r4,r5 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#2*4] @ from future BODY_16_xx
- eor r12,r4,r5 @ a^b, b^c in next round
- ldr r1,[sp,#15*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r4,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r7,r7,r11 @ d+=h
- eor r3,r3,r5 @ Maj(a,b,c)
- add r11,r11,r0,ror#2 @ h+=Sigma0(a)
- @ add r11,r11,r3 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#2*4] @ 17
- @ ldr r1,[sp,#15*4]
- mov r0,r2,ror#7
- add r11,r11,r3 @ h+=Maj(a,b,c) from the past
- mov r3,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r3,r3,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#1*4]
- eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#10*4]
-
- add r3,r3,r0
- eor r0,r7,r7,ror#5 @ from BODY_00_15
- add r2,r2,r3
- eor r0,r0,r7,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r3,[r14],#4 @ *K256++
- add r10,r10,r2 @ h+=X[i]
- str r2,[sp,#1*4]
- eor r2,r8,r9
- add r10,r10,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r7
- add r10,r10,r3 @ h+=K256[i]
- eor r2,r2,r9 @ Ch(e,f,g)
- eor r0,r11,r11,ror#11
- add r10,r10,r2 @ h+=Ch(e,f,g)
-#if 17==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 17<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r11,r4 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#3*4] @ from future BODY_16_xx
- eor r3,r11,r4 @ a^b, b^c in next round
- ldr r1,[sp,#0*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r11,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r6,r6,r10 @ d+=h
- eor r12,r12,r4 @ Maj(a,b,c)
- add r10,r10,r0,ror#2 @ h+=Sigma0(a)
- @ add r10,r10,r12 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#3*4] @ 18
- @ ldr r1,[sp,#0*4]
- mov r0,r2,ror#7
- add r10,r10,r12 @ h+=Maj(a,b,c) from the past
- mov r12,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r12,r12,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#2*4]
- eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#11*4]
-
- add r12,r12,r0
- eor r0,r6,r6,ror#5 @ from BODY_00_15
- add r2,r2,r12
- eor r0,r0,r6,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r12,[r14],#4 @ *K256++
- add r9,r9,r2 @ h+=X[i]
- str r2,[sp,#2*4]
- eor r2,r7,r8
- add r9,r9,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r6
- add r9,r9,r12 @ h+=K256[i]
- eor r2,r2,r8 @ Ch(e,f,g)
- eor r0,r10,r10,ror#11
- add r9,r9,r2 @ h+=Ch(e,f,g)
-#if 18==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 18<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r10,r11 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#4*4] @ from future BODY_16_xx
- eor r12,r10,r11 @ a^b, b^c in next round
- ldr r1,[sp,#1*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r10,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r5,r5,r9 @ d+=h
- eor r3,r3,r11 @ Maj(a,b,c)
- add r9,r9,r0,ror#2 @ h+=Sigma0(a)
- @ add r9,r9,r3 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#4*4] @ 19
- @ ldr r1,[sp,#1*4]
- mov r0,r2,ror#7
- add r9,r9,r3 @ h+=Maj(a,b,c) from the past
- mov r3,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r3,r3,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#3*4]
- eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#12*4]
-
- add r3,r3,r0
- eor r0,r5,r5,ror#5 @ from BODY_00_15
- add r2,r2,r3
- eor r0,r0,r5,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r3,[r14],#4 @ *K256++
- add r8,r8,r2 @ h+=X[i]
- str r2,[sp,#3*4]
- eor r2,r6,r7
- add r8,r8,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r5
- add r8,r8,r3 @ h+=K256[i]
- eor r2,r2,r7 @ Ch(e,f,g)
- eor r0,r9,r9,ror#11
- add r8,r8,r2 @ h+=Ch(e,f,g)
-#if 19==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 19<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r9,r10 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#5*4] @ from future BODY_16_xx
- eor r3,r9,r10 @ a^b, b^c in next round
- ldr r1,[sp,#2*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r9,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r4,r4,r8 @ d+=h
- eor r12,r12,r10 @ Maj(a,b,c)
- add r8,r8,r0,ror#2 @ h+=Sigma0(a)
- @ add r8,r8,r12 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#5*4] @ 20
- @ ldr r1,[sp,#2*4]
- mov r0,r2,ror#7
- add r8,r8,r12 @ h+=Maj(a,b,c) from the past
- mov r12,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r12,r12,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#4*4]
- eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#13*4]
-
- add r12,r12,r0
- eor r0,r4,r4,ror#5 @ from BODY_00_15
- add r2,r2,r12
- eor r0,r0,r4,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r12,[r14],#4 @ *K256++
- add r7,r7,r2 @ h+=X[i]
- str r2,[sp,#4*4]
- eor r2,r5,r6
- add r7,r7,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r4
- add r7,r7,r12 @ h+=K256[i]
- eor r2,r2,r6 @ Ch(e,f,g)
- eor r0,r8,r8,ror#11
- add r7,r7,r2 @ h+=Ch(e,f,g)
-#if 20==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 20<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r8,r9 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#6*4] @ from future BODY_16_xx
- eor r12,r8,r9 @ a^b, b^c in next round
- ldr r1,[sp,#3*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r8,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r11,r11,r7 @ d+=h
- eor r3,r3,r9 @ Maj(a,b,c)
- add r7,r7,r0,ror#2 @ h+=Sigma0(a)
- @ add r7,r7,r3 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#6*4] @ 21
- @ ldr r1,[sp,#3*4]
- mov r0,r2,ror#7
- add r7,r7,r3 @ h+=Maj(a,b,c) from the past
- mov r3,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r3,r3,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#5*4]
- eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#14*4]
-
- add r3,r3,r0
- eor r0,r11,r11,ror#5 @ from BODY_00_15
- add r2,r2,r3
- eor r0,r0,r11,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r3,[r14],#4 @ *K256++
- add r6,r6,r2 @ h+=X[i]
- str r2,[sp,#5*4]
- eor r2,r4,r5
- add r6,r6,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r11
- add r6,r6,r3 @ h+=K256[i]
- eor r2,r2,r5 @ Ch(e,f,g)
- eor r0,r7,r7,ror#11
- add r6,r6,r2 @ h+=Ch(e,f,g)
-#if 21==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 21<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r7,r8 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#7*4] @ from future BODY_16_xx
- eor r3,r7,r8 @ a^b, b^c in next round
- ldr r1,[sp,#4*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r7,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r10,r10,r6 @ d+=h
- eor r12,r12,r8 @ Maj(a,b,c)
- add r6,r6,r0,ror#2 @ h+=Sigma0(a)
- @ add r6,r6,r12 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#7*4] @ 22
- @ ldr r1,[sp,#4*4]
- mov r0,r2,ror#7
- add r6,r6,r12 @ h+=Maj(a,b,c) from the past
- mov r12,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r12,r12,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#6*4]
- eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#15*4]
-
- add r12,r12,r0
- eor r0,r10,r10,ror#5 @ from BODY_00_15
- add r2,r2,r12
- eor r0,r0,r10,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r12,[r14],#4 @ *K256++
- add r5,r5,r2 @ h+=X[i]
- str r2,[sp,#6*4]
- eor r2,r11,r4
- add r5,r5,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r10
- add r5,r5,r12 @ h+=K256[i]
- eor r2,r2,r4 @ Ch(e,f,g)
- eor r0,r6,r6,ror#11
- add r5,r5,r2 @ h+=Ch(e,f,g)
-#if 22==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 22<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r6,r7 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#8*4] @ from future BODY_16_xx
- eor r12,r6,r7 @ a^b, b^c in next round
- ldr r1,[sp,#5*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r6,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r9,r9,r5 @ d+=h
- eor r3,r3,r7 @ Maj(a,b,c)
- add r5,r5,r0,ror#2 @ h+=Sigma0(a)
- @ add r5,r5,r3 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#8*4] @ 23
- @ ldr r1,[sp,#5*4]
- mov r0,r2,ror#7
- add r5,r5,r3 @ h+=Maj(a,b,c) from the past
- mov r3,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r3,r3,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#7*4]
- eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#0*4]
-
- add r3,r3,r0
- eor r0,r9,r9,ror#5 @ from BODY_00_15
- add r2,r2,r3
- eor r0,r0,r9,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r3,[r14],#4 @ *K256++
- add r4,r4,r2 @ h+=X[i]
- str r2,[sp,#7*4]
- eor r2,r10,r11
- add r4,r4,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r9
- add r4,r4,r3 @ h+=K256[i]
- eor r2,r2,r11 @ Ch(e,f,g)
- eor r0,r5,r5,ror#11
- add r4,r4,r2 @ h+=Ch(e,f,g)
-#if 23==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 23<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r5,r6 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#9*4] @ from future BODY_16_xx
- eor r3,r5,r6 @ a^b, b^c in next round
- ldr r1,[sp,#6*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r5,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r8,r8,r4 @ d+=h
- eor r12,r12,r6 @ Maj(a,b,c)
- add r4,r4,r0,ror#2 @ h+=Sigma0(a)
- @ add r4,r4,r12 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#9*4] @ 24
- @ ldr r1,[sp,#6*4]
- mov r0,r2,ror#7
- add r4,r4,r12 @ h+=Maj(a,b,c) from the past
- mov r12,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r12,r12,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#8*4]
- eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#1*4]
-
- add r12,r12,r0
- eor r0,r8,r8,ror#5 @ from BODY_00_15
- add r2,r2,r12
- eor r0,r0,r8,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r12,[r14],#4 @ *K256++
- add r11,r11,r2 @ h+=X[i]
- str r2,[sp,#8*4]
- eor r2,r9,r10
- add r11,r11,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r8
- add r11,r11,r12 @ h+=K256[i]
- eor r2,r2,r10 @ Ch(e,f,g)
- eor r0,r4,r4,ror#11
- add r11,r11,r2 @ h+=Ch(e,f,g)
-#if 24==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 24<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r4,r5 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#10*4] @ from future BODY_16_xx
- eor r12,r4,r5 @ a^b, b^c in next round
- ldr r1,[sp,#7*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r4,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r7,r7,r11 @ d+=h
- eor r3,r3,r5 @ Maj(a,b,c)
- add r11,r11,r0,ror#2 @ h+=Sigma0(a)
- @ add r11,r11,r3 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#10*4] @ 25
- @ ldr r1,[sp,#7*4]
- mov r0,r2,ror#7
- add r11,r11,r3 @ h+=Maj(a,b,c) from the past
- mov r3,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r3,r3,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#9*4]
- eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#2*4]
-
- add r3,r3,r0
- eor r0,r7,r7,ror#5 @ from BODY_00_15
- add r2,r2,r3
- eor r0,r0,r7,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r3,[r14],#4 @ *K256++
- add r10,r10,r2 @ h+=X[i]
- str r2,[sp,#9*4]
- eor r2,r8,r9
- add r10,r10,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r7
- add r10,r10,r3 @ h+=K256[i]
- eor r2,r2,r9 @ Ch(e,f,g)
- eor r0,r11,r11,ror#11
- add r10,r10,r2 @ h+=Ch(e,f,g)
-#if 25==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 25<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r11,r4 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#11*4] @ from future BODY_16_xx
- eor r3,r11,r4 @ a^b, b^c in next round
- ldr r1,[sp,#8*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r11,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r6,r6,r10 @ d+=h
- eor r12,r12,r4 @ Maj(a,b,c)
- add r10,r10,r0,ror#2 @ h+=Sigma0(a)
- @ add r10,r10,r12 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#11*4] @ 26
- @ ldr r1,[sp,#8*4]
- mov r0,r2,ror#7
- add r10,r10,r12 @ h+=Maj(a,b,c) from the past
- mov r12,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r12,r12,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#10*4]
- eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#3*4]
-
- add r12,r12,r0
- eor r0,r6,r6,ror#5 @ from BODY_00_15
- add r2,r2,r12
- eor r0,r0,r6,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r12,[r14],#4 @ *K256++
- add r9,r9,r2 @ h+=X[i]
- str r2,[sp,#10*4]
- eor r2,r7,r8
- add r9,r9,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r6
- add r9,r9,r12 @ h+=K256[i]
- eor r2,r2,r8 @ Ch(e,f,g)
- eor r0,r10,r10,ror#11
- add r9,r9,r2 @ h+=Ch(e,f,g)
-#if 26==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 26<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r10,r11 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#12*4] @ from future BODY_16_xx
- eor r12,r10,r11 @ a^b, b^c in next round
- ldr r1,[sp,#9*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r10,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r5,r5,r9 @ d+=h
- eor r3,r3,r11 @ Maj(a,b,c)
- add r9,r9,r0,ror#2 @ h+=Sigma0(a)
- @ add r9,r9,r3 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#12*4] @ 27
- @ ldr r1,[sp,#9*4]
- mov r0,r2,ror#7
- add r9,r9,r3 @ h+=Maj(a,b,c) from the past
- mov r3,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r3,r3,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#11*4]
- eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#4*4]
-
- add r3,r3,r0
- eor r0,r5,r5,ror#5 @ from BODY_00_15
- add r2,r2,r3
- eor r0,r0,r5,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r3,[r14],#4 @ *K256++
- add r8,r8,r2 @ h+=X[i]
- str r2,[sp,#11*4]
- eor r2,r6,r7
- add r8,r8,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r5
- add r8,r8,r3 @ h+=K256[i]
- eor r2,r2,r7 @ Ch(e,f,g)
- eor r0,r9,r9,ror#11
- add r8,r8,r2 @ h+=Ch(e,f,g)
-#if 27==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 27<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r9,r10 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#13*4] @ from future BODY_16_xx
- eor r3,r9,r10 @ a^b, b^c in next round
- ldr r1,[sp,#10*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r9,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r4,r4,r8 @ d+=h
- eor r12,r12,r10 @ Maj(a,b,c)
- add r8,r8,r0,ror#2 @ h+=Sigma0(a)
- @ add r8,r8,r12 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#13*4] @ 28
- @ ldr r1,[sp,#10*4]
- mov r0,r2,ror#7
- add r8,r8,r12 @ h+=Maj(a,b,c) from the past
- mov r12,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r12,r12,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#12*4]
- eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#5*4]
-
- add r12,r12,r0
- eor r0,r4,r4,ror#5 @ from BODY_00_15
- add r2,r2,r12
- eor r0,r0,r4,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r12,[r14],#4 @ *K256++
- add r7,r7,r2 @ h+=X[i]
- str r2,[sp,#12*4]
- eor r2,r5,r6
- add r7,r7,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r4
- add r7,r7,r12 @ h+=K256[i]
- eor r2,r2,r6 @ Ch(e,f,g)
- eor r0,r8,r8,ror#11
- add r7,r7,r2 @ h+=Ch(e,f,g)
-#if 28==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 28<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r8,r9 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#14*4] @ from future BODY_16_xx
- eor r12,r8,r9 @ a^b, b^c in next round
- ldr r1,[sp,#11*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r8,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r11,r11,r7 @ d+=h
- eor r3,r3,r9 @ Maj(a,b,c)
- add r7,r7,r0,ror#2 @ h+=Sigma0(a)
- @ add r7,r7,r3 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#14*4] @ 29
- @ ldr r1,[sp,#11*4]
- mov r0,r2,ror#7
- add r7,r7,r3 @ h+=Maj(a,b,c) from the past
- mov r3,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r3,r3,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#13*4]
- eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#6*4]
-
- add r3,r3,r0
- eor r0,r11,r11,ror#5 @ from BODY_00_15
- add r2,r2,r3
- eor r0,r0,r11,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r3,[r14],#4 @ *K256++
- add r6,r6,r2 @ h+=X[i]
- str r2,[sp,#13*4]
- eor r2,r4,r5
- add r6,r6,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r11
- add r6,r6,r3 @ h+=K256[i]
- eor r2,r2,r5 @ Ch(e,f,g)
- eor r0,r7,r7,ror#11
- add r6,r6,r2 @ h+=Ch(e,f,g)
-#if 29==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 29<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r7,r8 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#15*4] @ from future BODY_16_xx
- eor r3,r7,r8 @ a^b, b^c in next round
- ldr r1,[sp,#12*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r7,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r10,r10,r6 @ d+=h
- eor r12,r12,r8 @ Maj(a,b,c)
- add r6,r6,r0,ror#2 @ h+=Sigma0(a)
- @ add r6,r6,r12 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#15*4] @ 30
- @ ldr r1,[sp,#12*4]
- mov r0,r2,ror#7
- add r6,r6,r12 @ h+=Maj(a,b,c) from the past
- mov r12,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r12,r12,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#14*4]
- eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#7*4]
-
- add r12,r12,r0
- eor r0,r10,r10,ror#5 @ from BODY_00_15
- add r2,r2,r12
- eor r0,r0,r10,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r12,[r14],#4 @ *K256++
- add r5,r5,r2 @ h+=X[i]
- str r2,[sp,#14*4]
- eor r2,r11,r4
- add r5,r5,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r10
- add r5,r5,r12 @ h+=K256[i]
- eor r2,r2,r4 @ Ch(e,f,g)
- eor r0,r6,r6,ror#11
- add r5,r5,r2 @ h+=Ch(e,f,g)
-#if 30==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 30<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r6,r7 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#0*4] @ from future BODY_16_xx
- eor r12,r6,r7 @ a^b, b^c in next round
- ldr r1,[sp,#13*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r6,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r9,r9,r5 @ d+=h
- eor r3,r3,r7 @ Maj(a,b,c)
- add r5,r5,r0,ror#2 @ h+=Sigma0(a)
- @ add r5,r5,r3 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#0*4] @ 31
- @ ldr r1,[sp,#13*4]
- mov r0,r2,ror#7
- add r5,r5,r3 @ h+=Maj(a,b,c) from the past
- mov r3,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r3,r3,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#15*4]
- eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#8*4]
-
- add r3,r3,r0
- eor r0,r9,r9,ror#5 @ from BODY_00_15
- add r2,r2,r3
- eor r0,r0,r9,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r3,[r14],#4 @ *K256++
- add r4,r4,r2 @ h+=X[i]
- str r2,[sp,#15*4]
- eor r2,r10,r11
- add r4,r4,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r9
- add r4,r4,r3 @ h+=K256[i]
- eor r2,r2,r11 @ Ch(e,f,g)
- eor r0,r5,r5,ror#11
- add r4,r4,r2 @ h+=Ch(e,f,g)
-#if 31==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 31<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r5,r6 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#1*4] @ from future BODY_16_xx
- eor r3,r5,r6 @ a^b, b^c in next round
- ldr r1,[sp,#14*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r5,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r8,r8,r4 @ d+=h
- eor r12,r12,r6 @ Maj(a,b,c)
- add r4,r4,r0,ror#2 @ h+=Sigma0(a)
- @ add r4,r4,r12 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- ite eq @ Thumb2 thing, sanity check in ARM
-#endif
- ldreq r3,[sp,#16*4] @ pull ctx
- bne .Lrounds_16_xx
-
- add r4,r4,r12 @ h+=Maj(a,b,c) from the past
- ldr r0,[r3,#0]
- ldr r2,[r3,#4]
- ldr r12,[r3,#8]
- add r4,r4,r0
- ldr r0,[r3,#12]
- add r5,r5,r2
- ldr r2,[r3,#16]
- add r6,r6,r12
- ldr r12,[r3,#20]
- add r7,r7,r0
- ldr r0,[r3,#24]
- add r8,r8,r2
- ldr r2,[r3,#28]
- add r9,r9,r12
- ldr r1,[sp,#17*4] @ pull inp
- ldr r12,[sp,#18*4] @ pull inp+len
- add r10,r10,r0
- add r11,r11,r2
- stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11}
- cmp r1,r12
- sub r14,r14,#256 @ rewind Ktbl
- bne .Loop
-
- add sp,sp,#19*4 @ destroy frame
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4-r11,pc}
-#else
- ldmia sp!,{r4-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-.size sha256_block_data_order,.-sha256_block_data_order
-#if __ARM_MAX_ARCH__>=7
-.arch armv7-a
-.fpu neon
-
-.global sha256_block_data_order_neon
-.type sha256_block_data_order_neon,%function
-.align 4
-sha256_block_data_order_neon:
-.LNEON:
- stmdb sp!,{r4-r12,lr}
-
- sub r11,sp,#16*4+16
- adrl r14,K256
- bic r11,r11,#15 @ align for 128-bit stores
- mov r12,sp
- mov sp,r11 @ alloca
- add r2,r1,r2,lsl#6 @ len to point at the end of inp
-
- vld1.8 {q0},[r1]!
- vld1.8 {q1},[r1]!
- vld1.8 {q2},[r1]!
- vld1.8 {q3},[r1]!
- vld1.32 {q8},[r14,:128]!
- vld1.32 {q9},[r14,:128]!
- vld1.32 {q10},[r14,:128]!
- vld1.32 {q11},[r14,:128]!
- vrev32.8 q0,q0 @ yes, even on
- str r0,[sp,#64]
- vrev32.8 q1,q1 @ big-endian
- str r1,[sp,#68]
- mov r1,sp
- vrev32.8 q2,q2
- str r2,[sp,#72]
- vrev32.8 q3,q3
- str r12,[sp,#76] @ save original sp
- vadd.i32 q8,q8,q0
- vadd.i32 q9,q9,q1
- vst1.32 {q8},[r1,:128]!
- vadd.i32 q10,q10,q2
- vst1.32 {q9},[r1,:128]!
- vadd.i32 q11,q11,q3
- vst1.32 {q10},[r1,:128]!
- vst1.32 {q11},[r1,:128]!
-
- ldmia r0,{r4-r11}
- sub r1,r1,#64
- ldr r2,[sp,#0]
- eor r12,r12,r12
- eor r3,r5,r6
- b .L_00_48
-
-.align 4
-.L_00_48:
- vext.8 q8,q0,q1,#4
- add r11,r11,r2
- eor r2,r9,r10
- eor r0,r8,r8,ror#5
- vext.8 q9,q2,q3,#4
- add r4,r4,r12
- and r2,r2,r8
- eor r12,r0,r8,ror#19
- vshr.u32 q10,q8,#7
- eor r0,r4,r4,ror#11
- eor r2,r2,r10
- vadd.i32 q0,q0,q9
- add r11,r11,r12,ror#6
- eor r12,r4,r5
- vshr.u32 q9,q8,#3
- eor r0,r0,r4,ror#20
- add r11,r11,r2
- vsli.32 q10,q8,#25
- ldr r2,[sp,#4]
- and r3,r3,r12
- vshr.u32 q11,q8,#18
- add r7,r7,r11
- add r11,r11,r0,ror#2
- eor r3,r3,r5
- veor q9,q9,q10
- add r10,r10,r2
- vsli.32 q11,q8,#14
- eor r2,r8,r9
- eor r0,r7,r7,ror#5
- vshr.u32 d24,d7,#17
- add r11,r11,r3
- and r2,r2,r7
- veor q9,q9,q11
- eor r3,r0,r7,ror#19
- eor r0,r11,r11,ror#11
- vsli.32 d24,d7,#15
- eor r2,r2,r9
- add r10,r10,r3,ror#6
- vshr.u32 d25,d7,#10
- eor r3,r11,r4
- eor r0,r0,r11,ror#20
- vadd.i32 q0,q0,q9
- add r10,r10,r2
- ldr r2,[sp,#8]
- veor d25,d25,d24
- and r12,r12,r3
- add r6,r6,r10
- vshr.u32 d24,d7,#19
- add r10,r10,r0,ror#2
- eor r12,r12,r4
- vsli.32 d24,d7,#13
- add r9,r9,r2
- eor r2,r7,r8
- veor d25,d25,d24
- eor r0,r6,r6,ror#5
- add r10,r10,r12
- vadd.i32 d0,d0,d25
- and r2,r2,r6
- eor r12,r0,r6,ror#19
- vshr.u32 d24,d0,#17
- eor r0,r10,r10,ror#11
- eor r2,r2,r8
- vsli.32 d24,d0,#15
- add r9,r9,r12,ror#6
- eor r12,r10,r11
- vshr.u32 d25,d0,#10
- eor r0,r0,r10,ror#20
- add r9,r9,r2
- veor d25,d25,d24
- ldr r2,[sp,#12]
- and r3,r3,r12
- vshr.u32 d24,d0,#19
- add r5,r5,r9
- add r9,r9,r0,ror#2
- eor r3,r3,r11
- vld1.32 {q8},[r14,:128]!
- add r8,r8,r2
- vsli.32 d24,d0,#13
- eor r2,r6,r7
- eor r0,r5,r5,ror#5
- veor d25,d25,d24
- add r9,r9,r3
- and r2,r2,r5
- vadd.i32 d1,d1,d25
- eor r3,r0,r5,ror#19
- eor r0,r9,r9,ror#11
- vadd.i32 q8,q8,q0
- eor r2,r2,r7
- add r8,r8,r3,ror#6
- eor r3,r9,r10
- eor r0,r0,r9,ror#20
- add r8,r8,r2
- ldr r2,[sp,#16]
- and r12,r12,r3
- add r4,r4,r8
- vst1.32 {q8},[r1,:128]!
- add r8,r8,r0,ror#2
- eor r12,r12,r10
- vext.8 q8,q1,q2,#4
- add r7,r7,r2
- eor r2,r5,r6
- eor r0,r4,r4,ror#5
- vext.8 q9,q3,q0,#4
- add r8,r8,r12
- and r2,r2,r4
- eor r12,r0,r4,ror#19
- vshr.u32 q10,q8,#7
- eor r0,r8,r8,ror#11
- eor r2,r2,r6
- vadd.i32 q1,q1,q9
- add r7,r7,r12,ror#6
- eor r12,r8,r9
- vshr.u32 q9,q8,#3
- eor r0,r0,r8,ror#20
- add r7,r7,r2
- vsli.32 q10,q8,#25
- ldr r2,[sp,#20]
- and r3,r3,r12
- vshr.u32 q11,q8,#18
- add r11,r11,r7
- add r7,r7,r0,ror#2
- eor r3,r3,r9
- veor q9,q9,q10
- add r6,r6,r2
- vsli.32 q11,q8,#14
- eor r2,r4,r5
- eor r0,r11,r11,ror#5
- vshr.u32 d24,d1,#17
- add r7,r7,r3
- and r2,r2,r11
- veor q9,q9,q11
- eor r3,r0,r11,ror#19
- eor r0,r7,r7,ror#11
- vsli.32 d24,d1,#15
- eor r2,r2,r5
- add r6,r6,r3,ror#6
- vshr.u32 d25,d1,#10
- eor r3,r7,r8
- eor r0,r0,r7,ror#20
- vadd.i32 q1,q1,q9
- add r6,r6,r2
- ldr r2,[sp,#24]
- veor d25,d25,d24
- and r12,r12,r3
- add r10,r10,r6
- vshr.u32 d24,d1,#19
- add r6,r6,r0,ror#2
- eor r12,r12,r8
- vsli.32 d24,d1,#13
- add r5,r5,r2
- eor r2,r11,r4
- veor d25,d25,d24
- eor r0,r10,r10,ror#5
- add r6,r6,r12
- vadd.i32 d2,d2,d25
- and r2,r2,r10
- eor r12,r0,r10,ror#19
- vshr.u32 d24,d2,#17
- eor r0,r6,r6,ror#11
- eor r2,r2,r4
- vsli.32 d24,d2,#15
- add r5,r5,r12,ror#6
- eor r12,r6,r7
- vshr.u32 d25,d2,#10
- eor r0,r0,r6,ror#20
- add r5,r5,r2
- veor d25,d25,d24
- ldr r2,[sp,#28]
- and r3,r3,r12
- vshr.u32 d24,d2,#19
- add r9,r9,r5
- add r5,r5,r0,ror#2
- eor r3,r3,r7
- vld1.32 {q8},[r14,:128]!
- add r4,r4,r2
- vsli.32 d24,d2,#13
- eor r2,r10,r11
- eor r0,r9,r9,ror#5
- veor d25,d25,d24
- add r5,r5,r3
- and r2,r2,r9
- vadd.i32 d3,d3,d25
- eor r3,r0,r9,ror#19
- eor r0,r5,r5,ror#11
- vadd.i32 q8,q8,q1
- eor r2,r2,r11
- add r4,r4,r3,ror#6
- eor r3,r5,r6
- eor r0,r0,r5,ror#20
- add r4,r4,r2
- ldr r2,[sp,#32]
- and r12,r12,r3
- add r8,r8,r4
- vst1.32 {q8},[r1,:128]!
- add r4,r4,r0,ror#2
- eor r12,r12,r6
- vext.8 q8,q2,q3,#4
- add r11,r11,r2
- eor r2,r9,r10
- eor r0,r8,r8,ror#5
- vext.8 q9,q0,q1,#4
- add r4,r4,r12
- and r2,r2,r8
- eor r12,r0,r8,ror#19
- vshr.u32 q10,q8,#7
- eor r0,r4,r4,ror#11
- eor r2,r2,r10
- vadd.i32 q2,q2,q9
- add r11,r11,r12,ror#6
- eor r12,r4,r5
- vshr.u32 q9,q8,#3
- eor r0,r0,r4,ror#20
- add r11,r11,r2
- vsli.32 q10,q8,#25
- ldr r2,[sp,#36]
- and r3,r3,r12
- vshr.u32 q11,q8,#18
- add r7,r7,r11
- add r11,r11,r0,ror#2
- eor r3,r3,r5
- veor q9,q9,q10
- add r10,r10,r2
- vsli.32 q11,q8,#14
- eor r2,r8,r9
- eor r0,r7,r7,ror#5
- vshr.u32 d24,d3,#17
- add r11,r11,r3
- and r2,r2,r7
- veor q9,q9,q11
- eor r3,r0,r7,ror#19
- eor r0,r11,r11,ror#11
- vsli.32 d24,d3,#15
- eor r2,r2,r9
- add r10,r10,r3,ror#6
- vshr.u32 d25,d3,#10
- eor r3,r11,r4
- eor r0,r0,r11,ror#20
- vadd.i32 q2,q2,q9
- add r10,r10,r2
- ldr r2,[sp,#40]
- veor d25,d25,d24
- and r12,r12,r3
- add r6,r6,r10
- vshr.u32 d24,d3,#19
- add r10,r10,r0,ror#2
- eor r12,r12,r4
- vsli.32 d24,d3,#13
- add r9,r9,r2
- eor r2,r7,r8
- veor d25,d25,d24
- eor r0,r6,r6,ror#5
- add r10,r10,r12
- vadd.i32 d4,d4,d25
- and r2,r2,r6
- eor r12,r0,r6,ror#19
- vshr.u32 d24,d4,#17
- eor r0,r10,r10,ror#11
- eor r2,r2,r8
- vsli.32 d24,d4,#15
- add r9,r9,r12,ror#6
- eor r12,r10,r11
- vshr.u32 d25,d4,#10
- eor r0,r0,r10,ror#20
- add r9,r9,r2
- veor d25,d25,d24
- ldr r2,[sp,#44]
- and r3,r3,r12
- vshr.u32 d24,d4,#19
- add r5,r5,r9
- add r9,r9,r0,ror#2
- eor r3,r3,r11
- vld1.32 {q8},[r14,:128]!
- add r8,r8,r2
- vsli.32 d24,d4,#13
- eor r2,r6,r7
- eor r0,r5,r5,ror#5
- veor d25,d25,d24
- add r9,r9,r3
- and r2,r2,r5
- vadd.i32 d5,d5,d25
- eor r3,r0,r5,ror#19
- eor r0,r9,r9,ror#11
- vadd.i32 q8,q8,q2
- eor r2,r2,r7
- add r8,r8,r3,ror#6
- eor r3,r9,r10
- eor r0,r0,r9,ror#20
- add r8,r8,r2
- ldr r2,[sp,#48]
- and r12,r12,r3
- add r4,r4,r8
- vst1.32 {q8},[r1,:128]!
- add r8,r8,r0,ror#2
- eor r12,r12,r10
- vext.8 q8,q3,q0,#4
- add r7,r7,r2
- eor r2,r5,r6
- eor r0,r4,r4,ror#5
- vext.8 q9,q1,q2,#4
- add r8,r8,r12
- and r2,r2,r4
- eor r12,r0,r4,ror#19
- vshr.u32 q10,q8,#7
- eor r0,r8,r8,ror#11
- eor r2,r2,r6
- vadd.i32 q3,q3,q9
- add r7,r7,r12,ror#6
- eor r12,r8,r9
- vshr.u32 q9,q8,#3
- eor r0,r0,r8,ror#20
- add r7,r7,r2
- vsli.32 q10,q8,#25
- ldr r2,[sp,#52]
- and r3,r3,r12
- vshr.u32 q11,q8,#18
- add r11,r11,r7
- add r7,r7,r0,ror#2
- eor r3,r3,r9
- veor q9,q9,q10
- add r6,r6,r2
- vsli.32 q11,q8,#14
- eor r2,r4,r5
- eor r0,r11,r11,ror#5
- vshr.u32 d24,d5,#17
- add r7,r7,r3
- and r2,r2,r11
- veor q9,q9,q11
- eor r3,r0,r11,ror#19
- eor r0,r7,r7,ror#11
- vsli.32 d24,d5,#15
- eor r2,r2,r5
- add r6,r6,r3,ror#6
- vshr.u32 d25,d5,#10
- eor r3,r7,r8
- eor r0,r0,r7,ror#20
- vadd.i32 q3,q3,q9
- add r6,r6,r2
- ldr r2,[sp,#56]
- veor d25,d25,d24
- and r12,r12,r3
- add r10,r10,r6
- vshr.u32 d24,d5,#19
- add r6,r6,r0,ror#2
- eor r12,r12,r8
- vsli.32 d24,d5,#13
- add r5,r5,r2
- eor r2,r11,r4
- veor d25,d25,d24
- eor r0,r10,r10,ror#5
- add r6,r6,r12
- vadd.i32 d6,d6,d25
- and r2,r2,r10
- eor r12,r0,r10,ror#19
- vshr.u32 d24,d6,#17
- eor r0,r6,r6,ror#11
- eor r2,r2,r4
- vsli.32 d24,d6,#15
- add r5,r5,r12,ror#6
- eor r12,r6,r7
- vshr.u32 d25,d6,#10
- eor r0,r0,r6,ror#20
- add r5,r5,r2
- veor d25,d25,d24
- ldr r2,[sp,#60]
- and r3,r3,r12
- vshr.u32 d24,d6,#19
- add r9,r9,r5
- add r5,r5,r0,ror#2
- eor r3,r3,r7
- vld1.32 {q8},[r14,:128]!
- add r4,r4,r2
- vsli.32 d24,d6,#13
- eor r2,r10,r11
- eor r0,r9,r9,ror#5
- veor d25,d25,d24
- add r5,r5,r3
- and r2,r2,r9
- vadd.i32 d7,d7,d25
- eor r3,r0,r9,ror#19
- eor r0,r5,r5,ror#11
- vadd.i32 q8,q8,q3
- eor r2,r2,r11
- add r4,r4,r3,ror#6
- eor r3,r5,r6
- eor r0,r0,r5,ror#20
- add r4,r4,r2
- ldr r2,[r14]
- and r12,r12,r3
- add r8,r8,r4
- vst1.32 {q8},[r1,:128]!
- add r4,r4,r0,ror#2
- eor r12,r12,r6
- teq r2,#0 @ check for K256 terminator
- ldr r2,[sp,#0]
- sub r1,r1,#64
- bne .L_00_48
-
- ldr r1,[sp,#68]
- ldr r0,[sp,#72]
- sub r14,r14,#256 @ rewind r14
- teq r1,r0
- it eq
- subeq r1,r1,#64 @ avoid SEGV
- vld1.8 {q0},[r1]! @ load next input block
- vld1.8 {q1},[r1]!
- vld1.8 {q2},[r1]!
- vld1.8 {q3},[r1]!
- it ne
- strne r1,[sp,#68]
- mov r1,sp
- add r11,r11,r2
- eor r2,r9,r10
- eor r0,r8,r8,ror#5
- add r4,r4,r12
- vld1.32 {q8},[r14,:128]!
- and r2,r2,r8
- eor r12,r0,r8,ror#19
- eor r0,r4,r4,ror#11
- eor r2,r2,r10
- vrev32.8 q0,q0
- add r11,r11,r12,ror#6
- eor r12,r4,r5
- eor r0,r0,r4,ror#20
- add r11,r11,r2
- vadd.i32 q8,q8,q0
- ldr r2,[sp,#4]
- and r3,r3,r12
- add r7,r7,r11
- add r11,r11,r0,ror#2
- eor r3,r3,r5
- add r10,r10,r2
- eor r2,r8,r9
- eor r0,r7,r7,ror#5
- add r11,r11,r3
- and r2,r2,r7
- eor r3,r0,r7,ror#19
- eor r0,r11,r11,ror#11
- eor r2,r2,r9
- add r10,r10,r3,ror#6
- eor r3,r11,r4
- eor r0,r0,r11,ror#20
- add r10,r10,r2
- ldr r2,[sp,#8]
- and r12,r12,r3
- add r6,r6,r10
- add r10,r10,r0,ror#2
- eor r12,r12,r4
- add r9,r9,r2
- eor r2,r7,r8
- eor r0,r6,r6,ror#5
- add r10,r10,r12
- and r2,r2,r6
- eor r12,r0,r6,ror#19
- eor r0,r10,r10,ror#11
- eor r2,r2,r8
- add r9,r9,r12,ror#6
- eor r12,r10,r11
- eor r0,r0,r10,ror#20
- add r9,r9,r2
- ldr r2,[sp,#12]
- and r3,r3,r12
- add r5,r5,r9
- add r9,r9,r0,ror#2
- eor r3,r3,r11
- add r8,r8,r2
- eor r2,r6,r7
- eor r0,r5,r5,ror#5
- add r9,r9,r3
- and r2,r2,r5
- eor r3,r0,r5,ror#19
- eor r0,r9,r9,ror#11
- eor r2,r2,r7
- add r8,r8,r3,ror#6
- eor r3,r9,r10
- eor r0,r0,r9,ror#20
- add r8,r8,r2
- ldr r2,[sp,#16]
- and r12,r12,r3
- add r4,r4,r8
- add r8,r8,r0,ror#2
- eor r12,r12,r10
- vst1.32 {q8},[r1,:128]!
- add r7,r7,r2
- eor r2,r5,r6
- eor r0,r4,r4,ror#5
- add r8,r8,r12
- vld1.32 {q8},[r14,:128]!
- and r2,r2,r4
- eor r12,r0,r4,ror#19
- eor r0,r8,r8,ror#11
- eor r2,r2,r6
- vrev32.8 q1,q1
- add r7,r7,r12,ror#6
- eor r12,r8,r9
- eor r0,r0,r8,ror#20
- add r7,r7,r2
- vadd.i32 q8,q8,q1
- ldr r2,[sp,#20]
- and r3,r3,r12
- add r11,r11,r7
- add r7,r7,r0,ror#2
- eor r3,r3,r9
- add r6,r6,r2
- eor r2,r4,r5
- eor r0,r11,r11,ror#5
- add r7,r7,r3
- and r2,r2,r11
- eor r3,r0,r11,ror#19
- eor r0,r7,r7,ror#11
- eor r2,r2,r5
- add r6,r6,r3,ror#6
- eor r3,r7,r8
- eor r0,r0,r7,ror#20
- add r6,r6,r2
- ldr r2,[sp,#24]
- and r12,r12,r3
- add r10,r10,r6
- add r6,r6,r0,ror#2
- eor r12,r12,r8
- add r5,r5,r2
- eor r2,r11,r4
- eor r0,r10,r10,ror#5
- add r6,r6,r12
- and r2,r2,r10
- eor r12,r0,r10,ror#19
- eor r0,r6,r6,ror#11
- eor r2,r2,r4
- add r5,r5,r12,ror#6
- eor r12,r6,r7
- eor r0,r0,r6,ror#20
- add r5,r5,r2
- ldr r2,[sp,#28]
- and r3,r3,r12
- add r9,r9,r5
- add r5,r5,r0,ror#2
- eor r3,r3,r7
- add r4,r4,r2
- eor r2,r10,r11
- eor r0,r9,r9,ror#5
- add r5,r5,r3
- and r2,r2,r9
- eor r3,r0,r9,ror#19
- eor r0,r5,r5,ror#11
- eor r2,r2,r11
- add r4,r4,r3,ror#6
- eor r3,r5,r6
- eor r0,r0,r5,ror#20
- add r4,r4,r2
- ldr r2,[sp,#32]
- and r12,r12,r3
- add r8,r8,r4
- add r4,r4,r0,ror#2
- eor r12,r12,r6
- vst1.32 {q8},[r1,:128]!
- add r11,r11,r2
- eor r2,r9,r10
- eor r0,r8,r8,ror#5
- add r4,r4,r12
- vld1.32 {q8},[r14,:128]!
- and r2,r2,r8
- eor r12,r0,r8,ror#19
- eor r0,r4,r4,ror#11
- eor r2,r2,r10
- vrev32.8 q2,q2
- add r11,r11,r12,ror#6
- eor r12,r4,r5
- eor r0,r0,r4,ror#20
- add r11,r11,r2
- vadd.i32 q8,q8,q2
- ldr r2,[sp,#36]
- and r3,r3,r12
- add r7,r7,r11
- add r11,r11,r0,ror#2
- eor r3,r3,r5
- add r10,r10,r2
- eor r2,r8,r9
- eor r0,r7,r7,ror#5
- add r11,r11,r3
- and r2,r2,r7
- eor r3,r0,r7,ror#19
- eor r0,r11,r11,ror#11
- eor r2,r2,r9
- add r10,r10,r3,ror#6
- eor r3,r11,r4
- eor r0,r0,r11,ror#20
- add r10,r10,r2
- ldr r2,[sp,#40]
- and r12,r12,r3
- add r6,r6,r10
- add r10,r10,r0,ror#2
- eor r12,r12,r4
- add r9,r9,r2
- eor r2,r7,r8
- eor r0,r6,r6,ror#5
- add r10,r10,r12
- and r2,r2,r6
- eor r12,r0,r6,ror#19
- eor r0,r10,r10,ror#11
- eor r2,r2,r8
- add r9,r9,r12,ror#6
- eor r12,r10,r11
- eor r0,r0,r10,ror#20
- add r9,r9,r2
- ldr r2,[sp,#44]
- and r3,r3,r12
- add r5,r5,r9
- add r9,r9,r0,ror#2
- eor r3,r3,r11
- add r8,r8,r2
- eor r2,r6,r7
- eor r0,r5,r5,ror#5
- add r9,r9,r3
- and r2,r2,r5
- eor r3,r0,r5,ror#19
- eor r0,r9,r9,ror#11
- eor r2,r2,r7
- add r8,r8,r3,ror#6
- eor r3,r9,r10
- eor r0,r0,r9,ror#20
- add r8,r8,r2
- ldr r2,[sp,#48]
- and r12,r12,r3
- add r4,r4,r8
- add r8,r8,r0,ror#2
- eor r12,r12,r10
- vst1.32 {q8},[r1,:128]!
- add r7,r7,r2
- eor r2,r5,r6
- eor r0,r4,r4,ror#5
- add r8,r8,r12
- vld1.32 {q8},[r14,:128]!
- and r2,r2,r4
- eor r12,r0,r4,ror#19
- eor r0,r8,r8,ror#11
- eor r2,r2,r6
- vrev32.8 q3,q3
- add r7,r7,r12,ror#6
- eor r12,r8,r9
- eor r0,r0,r8,ror#20
- add r7,r7,r2
- vadd.i32 q8,q8,q3
- ldr r2,[sp,#52]
- and r3,r3,r12
- add r11,r11,r7
- add r7,r7,r0,ror#2
- eor r3,r3,r9
- add r6,r6,r2
- eor r2,r4,r5
- eor r0,r11,r11,ror#5
- add r7,r7,r3
- and r2,r2,r11
- eor r3,r0,r11,ror#19
- eor r0,r7,r7,ror#11
- eor r2,r2,r5
- add r6,r6,r3,ror#6
- eor r3,r7,r8
- eor r0,r0,r7,ror#20
- add r6,r6,r2
- ldr r2,[sp,#56]
- and r12,r12,r3
- add r10,r10,r6
- add r6,r6,r0,ror#2
- eor r12,r12,r8
- add r5,r5,r2
- eor r2,r11,r4
- eor r0,r10,r10,ror#5
- add r6,r6,r12
- and r2,r2,r10
- eor r12,r0,r10,ror#19
- eor r0,r6,r6,ror#11
- eor r2,r2,r4
- add r5,r5,r12,ror#6
- eor r12,r6,r7
- eor r0,r0,r6,ror#20
- add r5,r5,r2
- ldr r2,[sp,#60]
- and r3,r3,r12
- add r9,r9,r5
- add r5,r5,r0,ror#2
- eor r3,r3,r7
- add r4,r4,r2
- eor r2,r10,r11
- eor r0,r9,r9,ror#5
- add r5,r5,r3
- and r2,r2,r9
- eor r3,r0,r9,ror#19
- eor r0,r5,r5,ror#11
- eor r2,r2,r11
- add r4,r4,r3,ror#6
- eor r3,r5,r6
- eor r0,r0,r5,ror#20
- add r4,r4,r2
- ldr r2,[sp,#64]
- and r12,r12,r3
- add r8,r8,r4
- add r4,r4,r0,ror#2
- eor r12,r12,r6
- vst1.32 {q8},[r1,:128]!
- ldr r0,[r2,#0]
- add r4,r4,r12 @ h+=Maj(a,b,c) from the past
- ldr r12,[r2,#4]
- ldr r3,[r2,#8]
- ldr r1,[r2,#12]
- add r4,r4,r0 @ accumulate
- ldr r0,[r2,#16]
- add r5,r5,r12
- ldr r12,[r2,#20]
- add r6,r6,r3
- ldr r3,[r2,#24]
- add r7,r7,r1
- ldr r1,[r2,#28]
- add r8,r8,r0
- str r4,[r2],#4
- add r9,r9,r12
- str r5,[r2],#4
- add r10,r10,r3
- str r6,[r2],#4
- add r11,r11,r1
- str r7,[r2],#4
- stmia r2,{r8-r11}
-
- ittte ne
- movne r1,sp
- ldrne r2,[sp,#0]
- eorne r12,r12,r12
- ldreq sp,[sp,#76] @ restore original sp
- itt ne
- eorne r3,r5,r6
- bne .L_00_48
-
- ldmia sp!,{r4-r12,pc}
-.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-
-# ifdef __thumb2__
-# define INST(a,b,c,d) .byte c,d|0xc,a,b
-# else
-# define INST(a,b,c,d) .byte a,b,c,d
-# endif
-
-.type sha256_block_data_order_armv8,%function
-.align 5
-sha256_block_data_order_armv8:
-.LARMv8:
- vld1.32 {q0,q1},[r0]
-# ifdef __thumb2__
- adr r3,.LARMv8
- sub r3,r3,#.LARMv8-K256
-# else
- adrl r3,K256
-# endif
- add r2,r1,r2,lsl#6 @ len to point at the end of inp
-
-.Loop_v8:
- vld1.8 {q8-q9},[r1]!
- vld1.8 {q10-q11},[r1]!
- vld1.32 {q12},[r3]!
- vrev32.8 q8,q8
- vrev32.8 q9,q9
- vrev32.8 q10,q10
- vrev32.8 q11,q11
- vmov q14,q0 @ offload
- vmov q15,q1
- teq r1,r2
- vld1.32 {q13},[r3]!
- vadd.i32 q12,q12,q8
- INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
- vmov q2,q0
- INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
- INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
- INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
- vld1.32 {q12},[r3]!
- vadd.i32 q13,q13,q9
- INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
- vmov q2,q0
- INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
- INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
- INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
- vld1.32 {q13},[r3]!
- vadd.i32 q12,q12,q10
- INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
- vmov q2,q0
- INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
- INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
- INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
- vld1.32 {q12},[r3]!
- vadd.i32 q13,q13,q11
- INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
- vmov q2,q0
- INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
- INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
- INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
- vld1.32 {q13},[r3]!
- vadd.i32 q12,q12,q8
- INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
- vmov q2,q0
- INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
- INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
- INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
- vld1.32 {q12},[r3]!
- vadd.i32 q13,q13,q9
- INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
- vmov q2,q0
- INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
- INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
- INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
- vld1.32 {q13},[r3]!
- vadd.i32 q12,q12,q10
- INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
- vmov q2,q0
- INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
- INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
- INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
- vld1.32 {q12},[r3]!
- vadd.i32 q13,q13,q11
- INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
- vmov q2,q0
- INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
- INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
- INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
- vld1.32 {q13},[r3]!
- vadd.i32 q12,q12,q8
- INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
- vmov q2,q0
- INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
- INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
- INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
- vld1.32 {q12},[r3]!
- vadd.i32 q13,q13,q9
- INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
- vmov q2,q0
- INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
- INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
- INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
- vld1.32 {q13},[r3]!
- vadd.i32 q12,q12,q10
- INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
- vmov q2,q0
- INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
- INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
- INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
- vld1.32 {q12},[r3]!
- vadd.i32 q13,q13,q11
- INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
- vmov q2,q0
- INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
- INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
- INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
- vld1.32 {q13},[r3]!
- vadd.i32 q12,q12,q8
- vmov q2,q0
- INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
- INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
-
- vld1.32 {q12},[r3]!
- vadd.i32 q13,q13,q9
- vmov q2,q0
- INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
- INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
-
- vld1.32 {q13},[r3]
- vadd.i32 q12,q12,q10
- sub r3,r3,#256-16 @ rewind
- vmov q2,q0
- INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
- INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
-
- vadd.i32 q13,q13,q11
- vmov q2,q0
- INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
- INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
-
- vadd.i32 q0,q0,q14
- vadd.i32 q1,q1,q15
- it ne
- bne .Loop_v8
-
- vst1.32 {q0,q1},[r0]
-
- bx lr @ bx lr
-.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
-#endif
-.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>"
-.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm OPENSSL_armcap_P,4,4
-#endif
diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c
index 215497f011f2..433ee4ddce6c 100644
--- a/arch/arm/crypto/sha256_glue.c
+++ b/arch/arm/crypto/sha256_glue.c
@@ -15,10 +15,9 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
-#include <linux/cryptohash.h>
#include <linux/types.h>
#include <linux/string.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <crypto/sha256_base.h>
#include <asm/simd.h>
#include <asm/neon.h>
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
index 38645e415196..701706262ef3 100644
--- a/arch/arm/crypto/sha256_neon_glue.c
+++ b/arch/arm/crypto/sha256_neon_glue.c
@@ -11,10 +11,9 @@
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
-#include <linux/cryptohash.h>
#include <linux/types.h>
#include <linux/string.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <crypto/sha256_base.h>
#include <asm/byteorder.h>
#include <asm/simd.h>
diff --git a/arch/arm/crypto/sha512-armv4.pl b/arch/arm/crypto/sha512-armv4.pl
index 788c17b56ecc..2fc3516912fa 100644
--- a/arch/arm/crypto/sha512-armv4.pl
+++ b/arch/arm/crypto/sha512-armv4.pl
@@ -13,7 +13,7 @@
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
+# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA512 block procedure for ARMv4. September 2007.
@@ -43,7 +43,7 @@
# terms it's 22.6 cycles per byte, which is disappointing result.
# Technical writers asserted that 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
-# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
+# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On side note Cortex-A15 processes one byte in
# 16 cycles.
@@ -212,7 +212,6 @@ $code=<<___;
#else
.syntax unified
# ifdef __thumb2__
-# define adrl adr
.thumb
# else
.code 32
@@ -602,7 +601,8 @@ sha512_block_data_order_neon:
dmb @ errata #451034 on early Cortex A8
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
VFP_ABI_PUSH
- adrl $Ktbl,K512
+ adr $Ktbl,.Lsha512_block_data_order
+ sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
diff --git a/arch/arm/crypto/sha512-core.S_shipped b/arch/arm/crypto/sha512-core.S_shipped
deleted file mode 100644
index 710ea309769e..000000000000
--- a/arch/arm/crypto/sha512-core.S_shipped
+++ /dev/null
@@ -1,1869 +0,0 @@
-@ SPDX-License-Identifier: GPL-2.0
-
-@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
-@ has relicensed it under the GPLv2. Therefore this program is free software;
-@ you can redistribute it and/or modify it under the terms of the GNU General
-@ Public License version 2 as published by the Free Software Foundation.
-@
-@ The original headers, including the original license headers, are
-@ included below for completeness.
-
-@ ====================================================================
-@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
-@ ====================================================================
-
-@ SHA512 block procedure for ARMv4. September 2007.
-
-@ This code is ~4.5 (four and a half) times faster than code generated
-@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
-@ Xscale PXA250 core].
-@
-@ July 2010.
-@
-@ Rescheduling for dual-issue pipeline resulted in 6% improvement on
-@ Cortex A8 core and ~40 cycles per processed byte.
-
-@ February 2011.
-@
-@ Profiler-assisted and platform-specific optimization resulted in 7%
-@ improvement on Coxtex A8 core and ~38 cycles per byte.
-
-@ March 2011.
-@
-@ Add NEON implementation. On Cortex A8 it was measured to process
-@ one byte in 23.3 cycles or ~60% faster than integer-only code.
-
-@ August 2012.
-@
-@ Improve NEON performance by 12% on Snapdragon S4. In absolute
-@ terms it's 22.6 cycles per byte, which is disappointing result.
-@ Technical writers asserted that 3-way S4 pipeline can sustain
-@ multiple NEON instructions per cycle, but dual NEON issue could
-@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
-@ for further details. On side note Cortex-A15 processes one byte in
-@ 16 cycles.
-
-@ Byte order [in]dependence. =========================================
-@
-@ Originally caller was expected to maintain specific *dword* order in
-@ h[0-7], namely with most significant dword at *lower* address, which
-@ was reflected in below two parameters as 0 and 4. Now caller is
-@ expected to maintain native byte order for whole 64-bit values.
-#ifndef __KERNEL__
-# include "arm_arch.h"
-# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
-# define VFP_ABI_POP vldmia sp!,{d8-d15}
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-# define VFP_ABI_PUSH
-# define VFP_ABI_POP
-#endif
-
-#ifdef __ARMEL__
-# define LO 0
-# define HI 4
-# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
-#else
-# define HI 0
-# define LO 4
-# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
-#endif
-
-.text
-#if __ARM_ARCH__<7
-.code 32
-#else
-.syntax unified
-# ifdef __thumb2__
-# define adrl adr
-.thumb
-# else
-.code 32
-# endif
-#endif
-
-.type K512,%object
-.align 5
-K512:
-WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
-WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
-WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
-WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
-WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
-WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
-WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
-WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
-WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
-WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
-WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
-WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
-WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
-WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
-WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
-WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
-WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
-WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
-WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
-WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
-WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
-WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
-WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
-WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
-WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
-WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
-WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
-WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
-WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
-WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
-WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
-WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
-WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
-WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
-WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
-WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
-WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
-WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
-WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
-WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
-.size K512,.-K512
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha512_block_data_order
-.skip 32-4
-#else
-.skip 32
-#endif
-
-.global sha512_block_data_order
-.type sha512_block_data_order,%function
-sha512_block_data_order:
-.Lsha512_block_data_order:
-#if __ARM_ARCH__<7
- sub r3,pc,#8 @ sha512_block_data_order
-#else
- adr r3,.Lsha512_block_data_order
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
- tst r12,#1
- bne .LNEON
-#endif
- add r2,r1,r2,lsl#7 @ len to point at the end of inp
- stmdb sp!,{r4-r12,lr}
- sub r14,r3,#672 @ K512
- sub sp,sp,#9*8
-
- ldr r7,[r0,#32+LO]
- ldr r8,[r0,#32+HI]
- ldr r9, [r0,#48+LO]
- ldr r10, [r0,#48+HI]
- ldr r11, [r0,#56+LO]
- ldr r12, [r0,#56+HI]
-.Loop:
- str r9, [sp,#48+0]
- str r10, [sp,#48+4]
- str r11, [sp,#56+0]
- str r12, [sp,#56+4]
- ldr r5,[r0,#0+LO]
- ldr r6,[r0,#0+HI]
- ldr r3,[r0,#8+LO]
- ldr r4,[r0,#8+HI]
- ldr r9, [r0,#16+LO]
- ldr r10, [r0,#16+HI]
- ldr r11, [r0,#24+LO]
- ldr r12, [r0,#24+HI]
- str r3,[sp,#8+0]
- str r4,[sp,#8+4]
- str r9, [sp,#16+0]
- str r10, [sp,#16+4]
- str r11, [sp,#24+0]
- str r12, [sp,#24+4]
- ldr r3,[r0,#40+LO]
- ldr r4,[r0,#40+HI]
- str r3,[sp,#40+0]
- str r4,[sp,#40+4]
-
-.L00_15:
-#if __ARM_ARCH__<7
- ldrb r3,[r1,#7]
- ldrb r9, [r1,#6]
- ldrb r10, [r1,#5]
- ldrb r11, [r1,#4]
- ldrb r4,[r1,#3]
- ldrb r12, [r1,#2]
- orr r3,r3,r9,lsl#8
- ldrb r9, [r1,#1]
- orr r3,r3,r10,lsl#16
- ldrb r10, [r1],#8
- orr r3,r3,r11,lsl#24
- orr r4,r4,r12,lsl#8
- orr r4,r4,r9,lsl#16
- orr r4,r4,r10,lsl#24
-#else
- ldr r3,[r1,#4]
- ldr r4,[r1],#8
-#ifdef __ARMEL__
- rev r3,r3
- rev r4,r4
-#endif
-#endif
- @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
- @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
- @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
- mov r9,r7,lsr#14
- str r3,[sp,#64+0]
- mov r10,r8,lsr#14
- str r4,[sp,#64+4]
- eor r9,r9,r8,lsl#18
- ldr r11,[sp,#56+0] @ h.lo
- eor r10,r10,r7,lsl#18
- ldr r12,[sp,#56+4] @ h.hi
- eor r9,r9,r7,lsr#18
- eor r10,r10,r8,lsr#18
- eor r9,r9,r8,lsl#14
- eor r10,r10,r7,lsl#14
- eor r9,r9,r8,lsr#9
- eor r10,r10,r7,lsr#9
- eor r9,r9,r7,lsl#23
- eor r10,r10,r8,lsl#23 @ Sigma1(e)
- adds r3,r3,r9
- ldr r9,[sp,#40+0] @ f.lo
- adc r4,r4,r10 @ T += Sigma1(e)
- ldr r10,[sp,#40+4] @ f.hi
- adds r3,r3,r11
- ldr r11,[sp,#48+0] @ g.lo
- adc r4,r4,r12 @ T += h
- ldr r12,[sp,#48+4] @ g.hi
-
- eor r9,r9,r11
- str r7,[sp,#32+0]
- eor r10,r10,r12
- str r8,[sp,#32+4]
- and r9,r9,r7
- str r5,[sp,#0+0]
- and r10,r10,r8
- str r6,[sp,#0+4]
- eor r9,r9,r11
- ldr r11,[r14,#LO] @ K[i].lo
- eor r10,r10,r12 @ Ch(e,f,g)
- ldr r12,[r14,#HI] @ K[i].hi
-
- adds r3,r3,r9
- ldr r7,[sp,#24+0] @ d.lo
- adc r4,r4,r10 @ T += Ch(e,f,g)
- ldr r8,[sp,#24+4] @ d.hi
- adds r3,r3,r11
- and r9,r11,#0xff
- adc r4,r4,r12 @ T += K[i]
- adds r7,r7,r3
- ldr r11,[sp,#8+0] @ b.lo
- adc r8,r8,r4 @ d += T
- teq r9,#148
-
- ldr r12,[sp,#16+0] @ c.lo
-#if __ARM_ARCH__>=7
- it eq @ Thumb2 thing, sanity check in ARM
-#endif
- orreq r14,r14,#1
- @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
- @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
- @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
- mov r9,r5,lsr#28
- mov r10,r6,lsr#28
- eor r9,r9,r6,lsl#4
- eor r10,r10,r5,lsl#4
- eor r9,r9,r6,lsr#2
- eor r10,r10,r5,lsr#2
- eor r9,r9,r5,lsl#30
- eor r10,r10,r6,lsl#30
- eor r9,r9,r6,lsr#7
- eor r10,r10,r5,lsr#7
- eor r9,r9,r5,lsl#25
- eor r10,r10,r6,lsl#25 @ Sigma0(a)
- adds r3,r3,r9
- and r9,r5,r11
- adc r4,r4,r10 @ T += Sigma0(a)
-
- ldr r10,[sp,#8+4] @ b.hi
- orr r5,r5,r11
- ldr r11,[sp,#16+4] @ c.hi
- and r5,r5,r12
- and r12,r6,r10
- orr r6,r6,r10
- orr r5,r5,r9 @ Maj(a,b,c).lo
- and r6,r6,r11
- adds r5,r5,r3
- orr r6,r6,r12 @ Maj(a,b,c).hi
- sub sp,sp,#8
- adc r6,r6,r4 @ h += T
- tst r14,#1
- add r14,r14,#8
- tst r14,#1
- beq .L00_15
- ldr r9,[sp,#184+0]
- ldr r10,[sp,#184+4]
- bic r14,r14,#1
-.L16_79:
- @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
- @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
- @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
- mov r3,r9,lsr#1
- ldr r11,[sp,#80+0]
- mov r4,r10,lsr#1
- ldr r12,[sp,#80+4]
- eor r3,r3,r10,lsl#31
- eor r4,r4,r9,lsl#31
- eor r3,r3,r9,lsr#8
- eor r4,r4,r10,lsr#8
- eor r3,r3,r10,lsl#24
- eor r4,r4,r9,lsl#24
- eor r3,r3,r9,lsr#7
- eor r4,r4,r10,lsr#7
- eor r3,r3,r10,lsl#25
-
- @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
- @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
- @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
- mov r9,r11,lsr#19
- mov r10,r12,lsr#19
- eor r9,r9,r12,lsl#13
- eor r10,r10,r11,lsl#13
- eor r9,r9,r12,lsr#29
- eor r10,r10,r11,lsr#29
- eor r9,r9,r11,lsl#3
- eor r10,r10,r12,lsl#3
- eor r9,r9,r11,lsr#6
- eor r10,r10,r12,lsr#6
- ldr r11,[sp,#120+0]
- eor r9,r9,r12,lsl#26
-
- ldr r12,[sp,#120+4]
- adds r3,r3,r9
- ldr r9,[sp,#192+0]
- adc r4,r4,r10
-
- ldr r10,[sp,#192+4]
- adds r3,r3,r11
- adc r4,r4,r12
- adds r3,r3,r9
- adc r4,r4,r10
- @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
- @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
- @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
- mov r9,r7,lsr#14
- str r3,[sp,#64+0]
- mov r10,r8,lsr#14
- str r4,[sp,#64+4]
- eor r9,r9,r8,lsl#18
- ldr r11,[sp,#56+0] @ h.lo
- eor r10,r10,r7,lsl#18
- ldr r12,[sp,#56+4] @ h.hi
- eor r9,r9,r7,lsr#18
- eor r10,r10,r8,lsr#18
- eor r9,r9,r8,lsl#14
- eor r10,r10,r7,lsl#14
- eor r9,r9,r8,lsr#9
- eor r10,r10,r7,lsr#9
- eor r9,r9,r7,lsl#23
- eor r10,r10,r8,lsl#23 @ Sigma1(e)
- adds r3,r3,r9
- ldr r9,[sp,#40+0] @ f.lo
- adc r4,r4,r10 @ T += Sigma1(e)
- ldr r10,[sp,#40+4] @ f.hi
- adds r3,r3,r11
- ldr r11,[sp,#48+0] @ g.lo
- adc r4,r4,r12 @ T += h
- ldr r12,[sp,#48+4] @ g.hi
-
- eor r9,r9,r11
- str r7,[sp,#32+0]
- eor r10,r10,r12
- str r8,[sp,#32+4]
- and r9,r9,r7
- str r5,[sp,#0+0]
- and r10,r10,r8
- str r6,[sp,#0+4]
- eor r9,r9,r11
- ldr r11,[r14,#LO] @ K[i].lo
- eor r10,r10,r12 @ Ch(e,f,g)
- ldr r12,[r14,#HI] @ K[i].hi
-
- adds r3,r3,r9
- ldr r7,[sp,#24+0] @ d.lo
- adc r4,r4,r10 @ T += Ch(e,f,g)
- ldr r8,[sp,#24+4] @ d.hi
- adds r3,r3,r11
- and r9,r11,#0xff
- adc r4,r4,r12 @ T += K[i]
- adds r7,r7,r3
- ldr r11,[sp,#8+0] @ b.lo
- adc r8,r8,r4 @ d += T
- teq r9,#23
-
- ldr r12,[sp,#16+0] @ c.lo
-#if __ARM_ARCH__>=7
- it eq @ Thumb2 thing, sanity check in ARM
-#endif
- orreq r14,r14,#1
- @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
- @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
- @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
- mov r9,r5,lsr#28
- mov r10,r6,lsr#28
- eor r9,r9,r6,lsl#4
- eor r10,r10,r5,lsl#4
- eor r9,r9,r6,lsr#2
- eor r10,r10,r5,lsr#2
- eor r9,r9,r5,lsl#30
- eor r10,r10,r6,lsl#30
- eor r9,r9,r6,lsr#7
- eor r10,r10,r5,lsr#7
- eor r9,r9,r5,lsl#25
- eor r10,r10,r6,lsl#25 @ Sigma0(a)
- adds r3,r3,r9
- and r9,r5,r11
- adc r4,r4,r10 @ T += Sigma0(a)
-
- ldr r10,[sp,#8+4] @ b.hi
- orr r5,r5,r11
- ldr r11,[sp,#16+4] @ c.hi
- and r5,r5,r12
- and r12,r6,r10
- orr r6,r6,r10
- orr r5,r5,r9 @ Maj(a,b,c).lo
- and r6,r6,r11
- adds r5,r5,r3
- orr r6,r6,r12 @ Maj(a,b,c).hi
- sub sp,sp,#8
- adc r6,r6,r4 @ h += T
- tst r14,#1
- add r14,r14,#8
-#if __ARM_ARCH__>=7
- ittt eq @ Thumb2 thing, sanity check in ARM
-#endif
- ldreq r9,[sp,#184+0]
- ldreq r10,[sp,#184+4]
- beq .L16_79
- bic r14,r14,#1
-
- ldr r3,[sp,#8+0]
- ldr r4,[sp,#8+4]
- ldr r9, [r0,#0+LO]
- ldr r10, [r0,#0+HI]
- ldr r11, [r0,#8+LO]
- ldr r12, [r0,#8+HI]
- adds r9,r5,r9
- str r9, [r0,#0+LO]
- adc r10,r6,r10
- str r10, [r0,#0+HI]
- adds r11,r3,r11
- str r11, [r0,#8+LO]
- adc r12,r4,r12
- str r12, [r0,#8+HI]
-
- ldr r5,[sp,#16+0]
- ldr r6,[sp,#16+4]
- ldr r3,[sp,#24+0]
- ldr r4,[sp,#24+4]
- ldr r9, [r0,#16+LO]
- ldr r10, [r0,#16+HI]
- ldr r11, [r0,#24+LO]
- ldr r12, [r0,#24+HI]
- adds r9,r5,r9
- str r9, [r0,#16+LO]
- adc r10,r6,r10
- str r10, [r0,#16+HI]
- adds r11,r3,r11
- str r11, [r0,#24+LO]
- adc r12,r4,r12
- str r12, [r0,#24+HI]
-
- ldr r3,[sp,#40+0]
- ldr r4,[sp,#40+4]
- ldr r9, [r0,#32+LO]
- ldr r10, [r0,#32+HI]
- ldr r11, [r0,#40+LO]
- ldr r12, [r0,#40+HI]
- adds r7,r7,r9
- str r7,[r0,#32+LO]
- adc r8,r8,r10
- str r8,[r0,#32+HI]
- adds r11,r3,r11
- str r11, [r0,#40+LO]
- adc r12,r4,r12
- str r12, [r0,#40+HI]
-
- ldr r5,[sp,#48+0]
- ldr r6,[sp,#48+4]
- ldr r3,[sp,#56+0]
- ldr r4,[sp,#56+4]
- ldr r9, [r0,#48+LO]
- ldr r10, [r0,#48+HI]
- ldr r11, [r0,#56+LO]
- ldr r12, [r0,#56+HI]
- adds r9,r5,r9
- str r9, [r0,#48+LO]
- adc r10,r6,r10
- str r10, [r0,#48+HI]
- adds r11,r3,r11
- str r11, [r0,#56+LO]
- adc r12,r4,r12
- str r12, [r0,#56+HI]
-
- add sp,sp,#640
- sub r14,r14,#640
-
- teq r1,r2
- bne .Loop
-
- add sp,sp,#8*9 @ destroy frame
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4-r12,pc}
-#else
- ldmia sp!,{r4-r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-.size sha512_block_data_order,.-sha512_block_data_order
-#if __ARM_MAX_ARCH__>=7
-.arch armv7-a
-.fpu neon
-
-.global sha512_block_data_order_neon
-.type sha512_block_data_order_neon,%function
-.align 4
-sha512_block_data_order_neon:
-.LNEON:
- dmb @ errata #451034 on early Cortex A8
- add r2,r1,r2,lsl#7 @ len to point at the end of inp
- VFP_ABI_PUSH
- adrl r3,K512
- vldmia r0,{d16-d23} @ load context
-.Loop_neon:
- vshr.u64 d24,d20,#14 @ 0
-#if 0<16
- vld1.64 {d0},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d20,#18
-#if 0>0
- vadd.i64 d16,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d20,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d20,#50
- vsli.64 d25,d20,#46
- vmov d29,d20
- vsli.64 d26,d20,#23
-#if 0<16 && defined(__ARMEL__)
- vrev64.8 d0,d0
-#endif
- veor d25,d24
- vbsl d29,d21,d22 @ Ch(e,f,g)
- vshr.u64 d24,d16,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d23
- vshr.u64 d25,d16,#34
- vsli.64 d24,d16,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d16,#39
- vadd.i64 d28,d0
- vsli.64 d25,d16,#30
- veor d30,d16,d17
- vsli.64 d26,d16,#25
- veor d23,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d18,d17 @ Maj(a,b,c)
- veor d23,d26 @ Sigma0(a)
- vadd.i64 d19,d27
- vadd.i64 d30,d27
- @ vadd.i64 d23,d30
- vshr.u64 d24,d19,#14 @ 1
-#if 1<16
- vld1.64 {d1},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d19,#18
-#if 1>0
- vadd.i64 d23,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d19,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d19,#50
- vsli.64 d25,d19,#46
- vmov d29,d19
- vsli.64 d26,d19,#23
-#if 1<16 && defined(__ARMEL__)
- vrev64.8 d1,d1
-#endif
- veor d25,d24
- vbsl d29,d20,d21 @ Ch(e,f,g)
- vshr.u64 d24,d23,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d22
- vshr.u64 d25,d23,#34
- vsli.64 d24,d23,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d23,#39
- vadd.i64 d28,d1
- vsli.64 d25,d23,#30
- veor d30,d23,d16
- vsli.64 d26,d23,#25
- veor d22,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d17,d16 @ Maj(a,b,c)
- veor d22,d26 @ Sigma0(a)
- vadd.i64 d18,d27
- vadd.i64 d30,d27
- @ vadd.i64 d22,d30
- vshr.u64 d24,d18,#14 @ 2
-#if 2<16
- vld1.64 {d2},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d18,#18
-#if 2>0
- vadd.i64 d22,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d18,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d18,#50
- vsli.64 d25,d18,#46
- vmov d29,d18
- vsli.64 d26,d18,#23
-#if 2<16 && defined(__ARMEL__)
- vrev64.8 d2,d2
-#endif
- veor d25,d24
- vbsl d29,d19,d20 @ Ch(e,f,g)
- vshr.u64 d24,d22,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d21
- vshr.u64 d25,d22,#34
- vsli.64 d24,d22,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d22,#39
- vadd.i64 d28,d2
- vsli.64 d25,d22,#30
- veor d30,d22,d23
- vsli.64 d26,d22,#25
- veor d21,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d16,d23 @ Maj(a,b,c)
- veor d21,d26 @ Sigma0(a)
- vadd.i64 d17,d27
- vadd.i64 d30,d27
- @ vadd.i64 d21,d30
- vshr.u64 d24,d17,#14 @ 3
-#if 3<16
- vld1.64 {d3},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d17,#18
-#if 3>0
- vadd.i64 d21,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d17,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d17,#50
- vsli.64 d25,d17,#46
- vmov d29,d17
- vsli.64 d26,d17,#23
-#if 3<16 && defined(__ARMEL__)
- vrev64.8 d3,d3
-#endif
- veor d25,d24
- vbsl d29,d18,d19 @ Ch(e,f,g)
- vshr.u64 d24,d21,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d20
- vshr.u64 d25,d21,#34
- vsli.64 d24,d21,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d21,#39
- vadd.i64 d28,d3
- vsli.64 d25,d21,#30
- veor d30,d21,d22
- vsli.64 d26,d21,#25
- veor d20,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d23,d22 @ Maj(a,b,c)
- veor d20,d26 @ Sigma0(a)
- vadd.i64 d16,d27
- vadd.i64 d30,d27
- @ vadd.i64 d20,d30
- vshr.u64 d24,d16,#14 @ 4
-#if 4<16
- vld1.64 {d4},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d16,#18
-#if 4>0
- vadd.i64 d20,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d16,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d16,#50
- vsli.64 d25,d16,#46
- vmov d29,d16
- vsli.64 d26,d16,#23
-#if 4<16 && defined(__ARMEL__)
- vrev64.8 d4,d4
-#endif
- veor d25,d24
- vbsl d29,d17,d18 @ Ch(e,f,g)
- vshr.u64 d24,d20,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d19
- vshr.u64 d25,d20,#34
- vsli.64 d24,d20,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d20,#39
- vadd.i64 d28,d4
- vsli.64 d25,d20,#30
- veor d30,d20,d21
- vsli.64 d26,d20,#25
- veor d19,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d22,d21 @ Maj(a,b,c)
- veor d19,d26 @ Sigma0(a)
- vadd.i64 d23,d27
- vadd.i64 d30,d27
- @ vadd.i64 d19,d30
- vshr.u64 d24,d23,#14 @ 5
-#if 5<16
- vld1.64 {d5},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d23,#18
-#if 5>0
- vadd.i64 d19,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d23,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d23,#50
- vsli.64 d25,d23,#46
- vmov d29,d23
- vsli.64 d26,d23,#23
-#if 5<16 && defined(__ARMEL__)
- vrev64.8 d5,d5
-#endif
- veor d25,d24
- vbsl d29,d16,d17 @ Ch(e,f,g)
- vshr.u64 d24,d19,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d18
- vshr.u64 d25,d19,#34
- vsli.64 d24,d19,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d19,#39
- vadd.i64 d28,d5
- vsli.64 d25,d19,#30
- veor d30,d19,d20
- vsli.64 d26,d19,#25
- veor d18,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d21,d20 @ Maj(a,b,c)
- veor d18,d26 @ Sigma0(a)
- vadd.i64 d22,d27
- vadd.i64 d30,d27
- @ vadd.i64 d18,d30
- vshr.u64 d24,d22,#14 @ 6
-#if 6<16
- vld1.64 {d6},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d22,#18
-#if 6>0
- vadd.i64 d18,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d22,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d22,#50
- vsli.64 d25,d22,#46
- vmov d29,d22
- vsli.64 d26,d22,#23
-#if 6<16 && defined(__ARMEL__)
- vrev64.8 d6,d6
-#endif
- veor d25,d24
- vbsl d29,d23,d16 @ Ch(e,f,g)
- vshr.u64 d24,d18,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d17
- vshr.u64 d25,d18,#34
- vsli.64 d24,d18,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d18,#39
- vadd.i64 d28,d6
- vsli.64 d25,d18,#30
- veor d30,d18,d19
- vsli.64 d26,d18,#25
- veor d17,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d20,d19 @ Maj(a,b,c)
- veor d17,d26 @ Sigma0(a)
- vadd.i64 d21,d27
- vadd.i64 d30,d27
- @ vadd.i64 d17,d30
- vshr.u64 d24,d21,#14 @ 7
-#if 7<16
- vld1.64 {d7},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d21,#18
-#if 7>0
- vadd.i64 d17,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d21,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d21,#50
- vsli.64 d25,d21,#46
- vmov d29,d21
- vsli.64 d26,d21,#23
-#if 7<16 && defined(__ARMEL__)
- vrev64.8 d7,d7
-#endif
- veor d25,d24
- vbsl d29,d22,d23 @ Ch(e,f,g)
- vshr.u64 d24,d17,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d16
- vshr.u64 d25,d17,#34
- vsli.64 d24,d17,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d17,#39
- vadd.i64 d28,d7
- vsli.64 d25,d17,#30
- veor d30,d17,d18
- vsli.64 d26,d17,#25
- veor d16,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d19,d18 @ Maj(a,b,c)
- veor d16,d26 @ Sigma0(a)
- vadd.i64 d20,d27
- vadd.i64 d30,d27
- @ vadd.i64 d16,d30
- vshr.u64 d24,d20,#14 @ 8
-#if 8<16
- vld1.64 {d8},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d20,#18
-#if 8>0
- vadd.i64 d16,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d20,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d20,#50
- vsli.64 d25,d20,#46
- vmov d29,d20
- vsli.64 d26,d20,#23
-#if 8<16 && defined(__ARMEL__)
- vrev64.8 d8,d8
-#endif
- veor d25,d24
- vbsl d29,d21,d22 @ Ch(e,f,g)
- vshr.u64 d24,d16,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d23
- vshr.u64 d25,d16,#34
- vsli.64 d24,d16,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d16,#39
- vadd.i64 d28,d8
- vsli.64 d25,d16,#30
- veor d30,d16,d17
- vsli.64 d26,d16,#25
- veor d23,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d18,d17 @ Maj(a,b,c)
- veor d23,d26 @ Sigma0(a)
- vadd.i64 d19,d27
- vadd.i64 d30,d27
- @ vadd.i64 d23,d30
- vshr.u64 d24,d19,#14 @ 9
-#if 9<16
- vld1.64 {d9},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d19,#18
-#if 9>0
- vadd.i64 d23,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d19,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d19,#50
- vsli.64 d25,d19,#46
- vmov d29,d19
- vsli.64 d26,d19,#23
-#if 9<16 && defined(__ARMEL__)
- vrev64.8 d9,d9
-#endif
- veor d25,d24
- vbsl d29,d20,d21 @ Ch(e,f,g)
- vshr.u64 d24,d23,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d22
- vshr.u64 d25,d23,#34
- vsli.64 d24,d23,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d23,#39
- vadd.i64 d28,d9
- vsli.64 d25,d23,#30
- veor d30,d23,d16
- vsli.64 d26,d23,#25
- veor d22,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d17,d16 @ Maj(a,b,c)
- veor d22,d26 @ Sigma0(a)
- vadd.i64 d18,d27
- vadd.i64 d30,d27
- @ vadd.i64 d22,d30
- vshr.u64 d24,d18,#14 @ 10
-#if 10<16
- vld1.64 {d10},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d18,#18
-#if 10>0
- vadd.i64 d22,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d18,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d18,#50
- vsli.64 d25,d18,#46
- vmov d29,d18
- vsli.64 d26,d18,#23
-#if 10<16 && defined(__ARMEL__)
- vrev64.8 d10,d10
-#endif
- veor d25,d24
- vbsl d29,d19,d20 @ Ch(e,f,g)
- vshr.u64 d24,d22,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d21
- vshr.u64 d25,d22,#34
- vsli.64 d24,d22,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d22,#39
- vadd.i64 d28,d10
- vsli.64 d25,d22,#30
- veor d30,d22,d23
- vsli.64 d26,d22,#25
- veor d21,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d16,d23 @ Maj(a,b,c)
- veor d21,d26 @ Sigma0(a)
- vadd.i64 d17,d27
- vadd.i64 d30,d27
- @ vadd.i64 d21,d30
- vshr.u64 d24,d17,#14 @ 11
-#if 11<16
- vld1.64 {d11},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d17,#18
-#if 11>0
- vadd.i64 d21,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d17,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d17,#50
- vsli.64 d25,d17,#46
- vmov d29,d17
- vsli.64 d26,d17,#23
-#if 11<16 && defined(__ARMEL__)
- vrev64.8 d11,d11
-#endif
- veor d25,d24
- vbsl d29,d18,d19 @ Ch(e,f,g)
- vshr.u64 d24,d21,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d20
- vshr.u64 d25,d21,#34
- vsli.64 d24,d21,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d21,#39
- vadd.i64 d28,d11
- vsli.64 d25,d21,#30
- veor d30,d21,d22
- vsli.64 d26,d21,#25
- veor d20,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d23,d22 @ Maj(a,b,c)
- veor d20,d26 @ Sigma0(a)
- vadd.i64 d16,d27
- vadd.i64 d30,d27
- @ vadd.i64 d20,d30
- vshr.u64 d24,d16,#14 @ 12
-#if 12<16
- vld1.64 {d12},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d16,#18
-#if 12>0
- vadd.i64 d20,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d16,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d16,#50
- vsli.64 d25,d16,#46
- vmov d29,d16
- vsli.64 d26,d16,#23
-#if 12<16 && defined(__ARMEL__)
- vrev64.8 d12,d12
-#endif
- veor d25,d24
- vbsl d29,d17,d18 @ Ch(e,f,g)
- vshr.u64 d24,d20,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d19
- vshr.u64 d25,d20,#34
- vsli.64 d24,d20,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d20,#39
- vadd.i64 d28,d12
- vsli.64 d25,d20,#30
- veor d30,d20,d21
- vsli.64 d26,d20,#25
- veor d19,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d22,d21 @ Maj(a,b,c)
- veor d19,d26 @ Sigma0(a)
- vadd.i64 d23,d27
- vadd.i64 d30,d27
- @ vadd.i64 d19,d30
- vshr.u64 d24,d23,#14 @ 13
-#if 13<16
- vld1.64 {d13},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d23,#18
-#if 13>0
- vadd.i64 d19,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d23,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d23,#50
- vsli.64 d25,d23,#46
- vmov d29,d23
- vsli.64 d26,d23,#23
-#if 13<16 && defined(__ARMEL__)
- vrev64.8 d13,d13
-#endif
- veor d25,d24
- vbsl d29,d16,d17 @ Ch(e,f,g)
- vshr.u64 d24,d19,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d18
- vshr.u64 d25,d19,#34
- vsli.64 d24,d19,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d19,#39
- vadd.i64 d28,d13
- vsli.64 d25,d19,#30
- veor d30,d19,d20
- vsli.64 d26,d19,#25
- veor d18,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d21,d20 @ Maj(a,b,c)
- veor d18,d26 @ Sigma0(a)
- vadd.i64 d22,d27
- vadd.i64 d30,d27
- @ vadd.i64 d18,d30
- vshr.u64 d24,d22,#14 @ 14
-#if 14<16
- vld1.64 {d14},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d22,#18
-#if 14>0
- vadd.i64 d18,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d22,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d22,#50
- vsli.64 d25,d22,#46
- vmov d29,d22
- vsli.64 d26,d22,#23
-#if 14<16 && defined(__ARMEL__)
- vrev64.8 d14,d14
-#endif
- veor d25,d24
- vbsl d29,d23,d16 @ Ch(e,f,g)
- vshr.u64 d24,d18,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d17
- vshr.u64 d25,d18,#34
- vsli.64 d24,d18,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d18,#39
- vadd.i64 d28,d14
- vsli.64 d25,d18,#30
- veor d30,d18,d19
- vsli.64 d26,d18,#25
- veor d17,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d20,d19 @ Maj(a,b,c)
- veor d17,d26 @ Sigma0(a)
- vadd.i64 d21,d27
- vadd.i64 d30,d27
- @ vadd.i64 d17,d30
- vshr.u64 d24,d21,#14 @ 15
-#if 15<16
- vld1.64 {d15},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d21,#18
-#if 15>0
- vadd.i64 d17,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d21,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d21,#50
- vsli.64 d25,d21,#46
- vmov d29,d21
- vsli.64 d26,d21,#23
-#if 15<16 && defined(__ARMEL__)
- vrev64.8 d15,d15
-#endif
- veor d25,d24
- vbsl d29,d22,d23 @ Ch(e,f,g)
- vshr.u64 d24,d17,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d16
- vshr.u64 d25,d17,#34
- vsli.64 d24,d17,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d17,#39
- vadd.i64 d28,d15
- vsli.64 d25,d17,#30
- veor d30,d17,d18
- vsli.64 d26,d17,#25
- veor d16,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d19,d18 @ Maj(a,b,c)
- veor d16,d26 @ Sigma0(a)
- vadd.i64 d20,d27
- vadd.i64 d30,d27
- @ vadd.i64 d16,d30
- mov r12,#4
-.L16_79_neon:
- subs r12,#1
- vshr.u64 q12,q7,#19
- vshr.u64 q13,q7,#61
- vadd.i64 d16,d30 @ h+=Maj from the past
- vshr.u64 q15,q7,#6
- vsli.64 q12,q7,#45
- vext.8 q14,q0,q1,#8 @ X[i+1]
- vsli.64 q13,q7,#3
- veor q15,q12
- vshr.u64 q12,q14,#1
- veor q15,q13 @ sigma1(X[i+14])
- vshr.u64 q13,q14,#8
- vadd.i64 q0,q15
- vshr.u64 q15,q14,#7
- vsli.64 q12,q14,#63
- vsli.64 q13,q14,#56
- vext.8 q14,q4,q5,#8 @ X[i+9]
- veor q15,q12
- vshr.u64 d24,d20,#14 @ from NEON_00_15
- vadd.i64 q0,q14
- vshr.u64 d25,d20,#18 @ from NEON_00_15
- veor q15,q13 @ sigma0(X[i+1])
- vshr.u64 d26,d20,#41 @ from NEON_00_15
- vadd.i64 q0,q15
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d20,#50
- vsli.64 d25,d20,#46
- vmov d29,d20
- vsli.64 d26,d20,#23
-#if 16<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d21,d22 @ Ch(e,f,g)
- vshr.u64 d24,d16,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d23
- vshr.u64 d25,d16,#34
- vsli.64 d24,d16,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d16,#39
- vadd.i64 d28,d0
- vsli.64 d25,d16,#30
- veor d30,d16,d17
- vsli.64 d26,d16,#25
- veor d23,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d18,d17 @ Maj(a,b,c)
- veor d23,d26 @ Sigma0(a)
- vadd.i64 d19,d27
- vadd.i64 d30,d27
- @ vadd.i64 d23,d30
- vshr.u64 d24,d19,#14 @ 17
-#if 17<16
- vld1.64 {d1},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d19,#18
-#if 17>0
- vadd.i64 d23,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d19,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d19,#50
- vsli.64 d25,d19,#46
- vmov d29,d19
- vsli.64 d26,d19,#23
-#if 17<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d20,d21 @ Ch(e,f,g)
- vshr.u64 d24,d23,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d22
- vshr.u64 d25,d23,#34
- vsli.64 d24,d23,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d23,#39
- vadd.i64 d28,d1
- vsli.64 d25,d23,#30
- veor d30,d23,d16
- vsli.64 d26,d23,#25
- veor d22,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d17,d16 @ Maj(a,b,c)
- veor d22,d26 @ Sigma0(a)
- vadd.i64 d18,d27
- vadd.i64 d30,d27
- @ vadd.i64 d22,d30
- vshr.u64 q12,q0,#19
- vshr.u64 q13,q0,#61
- vadd.i64 d22,d30 @ h+=Maj from the past
- vshr.u64 q15,q0,#6
- vsli.64 q12,q0,#45
- vext.8 q14,q1,q2,#8 @ X[i+1]
- vsli.64 q13,q0,#3
- veor q15,q12
- vshr.u64 q12,q14,#1
- veor q15,q13 @ sigma1(X[i+14])
- vshr.u64 q13,q14,#8
- vadd.i64 q1,q15
- vshr.u64 q15,q14,#7
- vsli.64 q12,q14,#63
- vsli.64 q13,q14,#56
- vext.8 q14,q5,q6,#8 @ X[i+9]
- veor q15,q12
- vshr.u64 d24,d18,#14 @ from NEON_00_15
- vadd.i64 q1,q14
- vshr.u64 d25,d18,#18 @ from NEON_00_15
- veor q15,q13 @ sigma0(X[i+1])
- vshr.u64 d26,d18,#41 @ from NEON_00_15
- vadd.i64 q1,q15
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d18,#50
- vsli.64 d25,d18,#46
- vmov d29,d18
- vsli.64 d26,d18,#23
-#if 18<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d19,d20 @ Ch(e,f,g)
- vshr.u64 d24,d22,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d21
- vshr.u64 d25,d22,#34
- vsli.64 d24,d22,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d22,#39
- vadd.i64 d28,d2
- vsli.64 d25,d22,#30
- veor d30,d22,d23
- vsli.64 d26,d22,#25
- veor d21,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d16,d23 @ Maj(a,b,c)
- veor d21,d26 @ Sigma0(a)
- vadd.i64 d17,d27
- vadd.i64 d30,d27
- @ vadd.i64 d21,d30
- vshr.u64 d24,d17,#14 @ 19
-#if 19<16
- vld1.64 {d3},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d17,#18
-#if 19>0
- vadd.i64 d21,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d17,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d17,#50
- vsli.64 d25,d17,#46
- vmov d29,d17
- vsli.64 d26,d17,#23
-#if 19<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d18,d19 @ Ch(e,f,g)
- vshr.u64 d24,d21,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d20
- vshr.u64 d25,d21,#34
- vsli.64 d24,d21,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d21,#39
- vadd.i64 d28,d3
- vsli.64 d25,d21,#30
- veor d30,d21,d22
- vsli.64 d26,d21,#25
- veor d20,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d23,d22 @ Maj(a,b,c)
- veor d20,d26 @ Sigma0(a)
- vadd.i64 d16,d27
- vadd.i64 d30,d27
- @ vadd.i64 d20,d30
- vshr.u64 q12,q1,#19
- vshr.u64 q13,q1,#61
- vadd.i64 d20,d30 @ h+=Maj from the past
- vshr.u64 q15,q1,#6
- vsli.64 q12,q1,#45
- vext.8 q14,q2,q3,#8 @ X[i+1]
- vsli.64 q13,q1,#3
- veor q15,q12
- vshr.u64 q12,q14,#1
- veor q15,q13 @ sigma1(X[i+14])
- vshr.u64 q13,q14,#8
- vadd.i64 q2,q15
- vshr.u64 q15,q14,#7
- vsli.64 q12,q14,#63
- vsli.64 q13,q14,#56
- vext.8 q14,q6,q7,#8 @ X[i+9]
- veor q15,q12
- vshr.u64 d24,d16,#14 @ from NEON_00_15
- vadd.i64 q2,q14
- vshr.u64 d25,d16,#18 @ from NEON_00_15
- veor q15,q13 @ sigma0(X[i+1])
- vshr.u64 d26,d16,#41 @ from NEON_00_15
- vadd.i64 q2,q15
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d16,#50
- vsli.64 d25,d16,#46
- vmov d29,d16
- vsli.64 d26,d16,#23
-#if 20<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d17,d18 @ Ch(e,f,g)
- vshr.u64 d24,d20,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d19
- vshr.u64 d25,d20,#34
- vsli.64 d24,d20,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d20,#39
- vadd.i64 d28,d4
- vsli.64 d25,d20,#30
- veor d30,d20,d21
- vsli.64 d26,d20,#25
- veor d19,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d22,d21 @ Maj(a,b,c)
- veor d19,d26 @ Sigma0(a)
- vadd.i64 d23,d27
- vadd.i64 d30,d27
- @ vadd.i64 d19,d30
- vshr.u64 d24,d23,#14 @ 21
-#if 21<16
- vld1.64 {d5},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d23,#18
-#if 21>0
- vadd.i64 d19,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d23,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d23,#50
- vsli.64 d25,d23,#46
- vmov d29,d23
- vsli.64 d26,d23,#23
-#if 21<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d16,d17 @ Ch(e,f,g)
- vshr.u64 d24,d19,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d18
- vshr.u64 d25,d19,#34
- vsli.64 d24,d19,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d19,#39
- vadd.i64 d28,d5
- vsli.64 d25,d19,#30
- veor d30,d19,d20
- vsli.64 d26,d19,#25
- veor d18,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d21,d20 @ Maj(a,b,c)
- veor d18,d26 @ Sigma0(a)
- vadd.i64 d22,d27
- vadd.i64 d30,d27
- @ vadd.i64 d18,d30
- vshr.u64 q12,q2,#19
- vshr.u64 q13,q2,#61
- vadd.i64 d18,d30 @ h+=Maj from the past
- vshr.u64 q15,q2,#6
- vsli.64 q12,q2,#45
- vext.8 q14,q3,q4,#8 @ X[i+1]
- vsli.64 q13,q2,#3
- veor q15,q12
- vshr.u64 q12,q14,#1
- veor q15,q13 @ sigma1(X[i+14])
- vshr.u64 q13,q14,#8
- vadd.i64 q3,q15
- vshr.u64 q15,q14,#7
- vsli.64 q12,q14,#63
- vsli.64 q13,q14,#56
- vext.8 q14,q7,q0,#8 @ X[i+9]
- veor q15,q12
- vshr.u64 d24,d22,#14 @ from NEON_00_15
- vadd.i64 q3,q14
- vshr.u64 d25,d22,#18 @ from NEON_00_15
- veor q15,q13 @ sigma0(X[i+1])
- vshr.u64 d26,d22,#41 @ from NEON_00_15
- vadd.i64 q3,q15
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d22,#50
- vsli.64 d25,d22,#46
- vmov d29,d22
- vsli.64 d26,d22,#23
-#if 22<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d23,d16 @ Ch(e,f,g)
- vshr.u64 d24,d18,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d17
- vshr.u64 d25,d18,#34
- vsli.64 d24,d18,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d18,#39
- vadd.i64 d28,d6
- vsli.64 d25,d18,#30
- veor d30,d18,d19
- vsli.64 d26,d18,#25
- veor d17,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d20,d19 @ Maj(a,b,c)
- veor d17,d26 @ Sigma0(a)
- vadd.i64 d21,d27
- vadd.i64 d30,d27
- @ vadd.i64 d17,d30
- vshr.u64 d24,d21,#14 @ 23
-#if 23<16
- vld1.64 {d7},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d21,#18
-#if 23>0
- vadd.i64 d17,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d21,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d21,#50
- vsli.64 d25,d21,#46
- vmov d29,d21
- vsli.64 d26,d21,#23
-#if 23<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d22,d23 @ Ch(e,f,g)
- vshr.u64 d24,d17,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d16
- vshr.u64 d25,d17,#34
- vsli.64 d24,d17,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d17,#39
- vadd.i64 d28,d7
- vsli.64 d25,d17,#30
- veor d30,d17,d18
- vsli.64 d26,d17,#25
- veor d16,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d19,d18 @ Maj(a,b,c)
- veor d16,d26 @ Sigma0(a)
- vadd.i64 d20,d27
- vadd.i64 d30,d27
- @ vadd.i64 d16,d30
- vshr.u64 q12,q3,#19
- vshr.u64 q13,q3,#61
- vadd.i64 d16,d30 @ h+=Maj from the past
- vshr.u64 q15,q3,#6
- vsli.64 q12,q3,#45
- vext.8 q14,q4,q5,#8 @ X[i+1]
- vsli.64 q13,q3,#3
- veor q15,q12
- vshr.u64 q12,q14,#1
- veor q15,q13 @ sigma1(X[i+14])
- vshr.u64 q13,q14,#8
- vadd.i64 q4,q15
- vshr.u64 q15,q14,#7
- vsli.64 q12,q14,#63
- vsli.64 q13,q14,#56
- vext.8 q14,q0,q1,#8 @ X[i+9]
- veor q15,q12
- vshr.u64 d24,d20,#14 @ from NEON_00_15
- vadd.i64 q4,q14
- vshr.u64 d25,d20,#18 @ from NEON_00_15
- veor q15,q13 @ sigma0(X[i+1])
- vshr.u64 d26,d20,#41 @ from NEON_00_15
- vadd.i64 q4,q15
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d20,#50
- vsli.64 d25,d20,#46
- vmov d29,d20
- vsli.64 d26,d20,#23
-#if 24<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d21,d22 @ Ch(e,f,g)
- vshr.u64 d24,d16,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d23
- vshr.u64 d25,d16,#34
- vsli.64 d24,d16,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d16,#39
- vadd.i64 d28,d8
- vsli.64 d25,d16,#30
- veor d30,d16,d17
- vsli.64 d26,d16,#25
- veor d23,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d18,d17 @ Maj(a,b,c)
- veor d23,d26 @ Sigma0(a)
- vadd.i64 d19,d27
- vadd.i64 d30,d27
- @ vadd.i64 d23,d30
- vshr.u64 d24,d19,#14 @ 25
-#if 25<16
- vld1.64 {d9},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d19,#18
-#if 25>0
- vadd.i64 d23,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d19,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d19,#50
- vsli.64 d25,d19,#46
- vmov d29,d19
- vsli.64 d26,d19,#23
-#if 25<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d20,d21 @ Ch(e,f,g)
- vshr.u64 d24,d23,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d22
- vshr.u64 d25,d23,#34
- vsli.64 d24,d23,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d23,#39
- vadd.i64 d28,d9
- vsli.64 d25,d23,#30
- veor d30,d23,d16
- vsli.64 d26,d23,#25
- veor d22,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d17,d16 @ Maj(a,b,c)
- veor d22,d26 @ Sigma0(a)
- vadd.i64 d18,d27
- vadd.i64 d30,d27
- @ vadd.i64 d22,d30
- vshr.u64 q12,q4,#19
- vshr.u64 q13,q4,#61
- vadd.i64 d22,d30 @ h+=Maj from the past
- vshr.u64 q15,q4,#6
- vsli.64 q12,q4,#45
- vext.8 q14,q5,q6,#8 @ X[i+1]
- vsli.64 q13,q4,#3
- veor q15,q12
- vshr.u64 q12,q14,#1
- veor q15,q13 @ sigma1(X[i+14])
- vshr.u64 q13,q14,#8
- vadd.i64 q5,q15
- vshr.u64 q15,q14,#7
- vsli.64 q12,q14,#63
- vsli.64 q13,q14,#56
- vext.8 q14,q1,q2,#8 @ X[i+9]
- veor q15,q12
- vshr.u64 d24,d18,#14 @ from NEON_00_15
- vadd.i64 q5,q14
- vshr.u64 d25,d18,#18 @ from NEON_00_15
- veor q15,q13 @ sigma0(X[i+1])
- vshr.u64 d26,d18,#41 @ from NEON_00_15
- vadd.i64 q5,q15
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d18,#50
- vsli.64 d25,d18,#46
- vmov d29,d18
- vsli.64 d26,d18,#23
-#if 26<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d19,d20 @ Ch(e,f,g)
- vshr.u64 d24,d22,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d21
- vshr.u64 d25,d22,#34
- vsli.64 d24,d22,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d22,#39
- vadd.i64 d28,d10
- vsli.64 d25,d22,#30
- veor d30,d22,d23
- vsli.64 d26,d22,#25
- veor d21,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d16,d23 @ Maj(a,b,c)
- veor d21,d26 @ Sigma0(a)
- vadd.i64 d17,d27
- vadd.i64 d30,d27
- @ vadd.i64 d21,d30
- vshr.u64 d24,d17,#14 @ 27
-#if 27<16
- vld1.64 {d11},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d17,#18
-#if 27>0
- vadd.i64 d21,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d17,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d17,#50
- vsli.64 d25,d17,#46
- vmov d29,d17
- vsli.64 d26,d17,#23
-#if 27<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d18,d19 @ Ch(e,f,g)
- vshr.u64 d24,d21,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d20
- vshr.u64 d25,d21,#34
- vsli.64 d24,d21,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d21,#39
- vadd.i64 d28,d11
- vsli.64 d25,d21,#30
- veor d30,d21,d22
- vsli.64 d26,d21,#25
- veor d20,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d23,d22 @ Maj(a,b,c)
- veor d20,d26 @ Sigma0(a)
- vadd.i64 d16,d27
- vadd.i64 d30,d27
- @ vadd.i64 d20,d30
- vshr.u64 q12,q5,#19
- vshr.u64 q13,q5,#61
- vadd.i64 d20,d30 @ h+=Maj from the past
- vshr.u64 q15,q5,#6
- vsli.64 q12,q5,#45
- vext.8 q14,q6,q7,#8 @ X[i+1]
- vsli.64 q13,q5,#3
- veor q15,q12
- vshr.u64 q12,q14,#1
- veor q15,q13 @ sigma1(X[i+14])
- vshr.u64 q13,q14,#8
- vadd.i64 q6,q15
- vshr.u64 q15,q14,#7
- vsli.64 q12,q14,#63
- vsli.64 q13,q14,#56
- vext.8 q14,q2,q3,#8 @ X[i+9]
- veor q15,q12
- vshr.u64 d24,d16,#14 @ from NEON_00_15
- vadd.i64 q6,q14
- vshr.u64 d25,d16,#18 @ from NEON_00_15
- veor q15,q13 @ sigma0(X[i+1])
- vshr.u64 d26,d16,#41 @ from NEON_00_15
- vadd.i64 q6,q15
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d16,#50
- vsli.64 d25,d16,#46
- vmov d29,d16
- vsli.64 d26,d16,#23
-#if 28<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d17,d18 @ Ch(e,f,g)
- vshr.u64 d24,d20,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d19
- vshr.u64 d25,d20,#34
- vsli.64 d24,d20,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d20,#39
- vadd.i64 d28,d12
- vsli.64 d25,d20,#30
- veor d30,d20,d21
- vsli.64 d26,d20,#25
- veor d19,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d22,d21 @ Maj(a,b,c)
- veor d19,d26 @ Sigma0(a)
- vadd.i64 d23,d27
- vadd.i64 d30,d27
- @ vadd.i64 d19,d30
- vshr.u64 d24,d23,#14 @ 29
-#if 29<16
- vld1.64 {d13},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d23,#18
-#if 29>0
- vadd.i64 d19,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d23,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d23,#50
- vsli.64 d25,d23,#46
- vmov d29,d23
- vsli.64 d26,d23,#23
-#if 29<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d16,d17 @ Ch(e,f,g)
- vshr.u64 d24,d19,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d18
- vshr.u64 d25,d19,#34
- vsli.64 d24,d19,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d19,#39
- vadd.i64 d28,d13
- vsli.64 d25,d19,#30
- veor d30,d19,d20
- vsli.64 d26,d19,#25
- veor d18,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d21,d20 @ Maj(a,b,c)
- veor d18,d26 @ Sigma0(a)
- vadd.i64 d22,d27
- vadd.i64 d30,d27
- @ vadd.i64 d18,d30
- vshr.u64 q12,q6,#19
- vshr.u64 q13,q6,#61
- vadd.i64 d18,d30 @ h+=Maj from the past
- vshr.u64 q15,q6,#6
- vsli.64 q12,q6,#45
- vext.8 q14,q7,q0,#8 @ X[i+1]
- vsli.64 q13,q6,#3
- veor q15,q12
- vshr.u64 q12,q14,#1
- veor q15,q13 @ sigma1(X[i+14])
- vshr.u64 q13,q14,#8
- vadd.i64 q7,q15
- vshr.u64 q15,q14,#7
- vsli.64 q12,q14,#63
- vsli.64 q13,q14,#56
- vext.8 q14,q3,q4,#8 @ X[i+9]
- veor q15,q12
- vshr.u64 d24,d22,#14 @ from NEON_00_15
- vadd.i64 q7,q14
- vshr.u64 d25,d22,#18 @ from NEON_00_15
- veor q15,q13 @ sigma0(X[i+1])
- vshr.u64 d26,d22,#41 @ from NEON_00_15
- vadd.i64 q7,q15
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d22,#50
- vsli.64 d25,d22,#46
- vmov d29,d22
- vsli.64 d26,d22,#23
-#if 30<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d23,d16 @ Ch(e,f,g)
- vshr.u64 d24,d18,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d17
- vshr.u64 d25,d18,#34
- vsli.64 d24,d18,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d18,#39
- vadd.i64 d28,d14
- vsli.64 d25,d18,#30
- veor d30,d18,d19
- vsli.64 d26,d18,#25
- veor d17,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d20,d19 @ Maj(a,b,c)
- veor d17,d26 @ Sigma0(a)
- vadd.i64 d21,d27
- vadd.i64 d30,d27
- @ vadd.i64 d17,d30
- vshr.u64 d24,d21,#14 @ 31
-#if 31<16
- vld1.64 {d15},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d21,#18
-#if 31>0
- vadd.i64 d17,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d21,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d21,#50
- vsli.64 d25,d21,#46
- vmov d29,d21
- vsli.64 d26,d21,#23
-#if 31<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d22,d23 @ Ch(e,f,g)
- vshr.u64 d24,d17,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d16
- vshr.u64 d25,d17,#34
- vsli.64 d24,d17,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d17,#39
- vadd.i64 d28,d15
- vsli.64 d25,d17,#30
- veor d30,d17,d18
- vsli.64 d26,d17,#25
- veor d16,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d19,d18 @ Maj(a,b,c)
- veor d16,d26 @ Sigma0(a)
- vadd.i64 d20,d27
- vadd.i64 d30,d27
- @ vadd.i64 d16,d30
- bne .L16_79_neon
-
- vadd.i64 d16,d30 @ h+=Maj from the past
- vldmia r0,{d24-d31} @ load context to temp
- vadd.i64 q8,q12 @ vectorized accumulate
- vadd.i64 q9,q13
- vadd.i64 q10,q14
- vadd.i64 q11,q15
- vstmia r0,{d16-d23} @ save context
- teq r1,r2
- sub r3,#640 @ rewind K512
- bne .Loop_neon
-
- VFP_ABI_POP
- bx lr @ .word 0xe12fff1e
-.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
-#endif
-.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
-.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm OPENSSL_armcap_P,4,4
-#endif
diff --git a/arch/arm/crypto/sha512-glue.c b/arch/arm/crypto/sha512-glue.c
index 8775aa42bbbe..0635a65aa488 100644
--- a/arch/arm/crypto/sha512-glue.c
+++ b/arch/arm/crypto/sha512-glue.c
@@ -6,7 +6,7 @@
*/
#include <crypto/internal/hash.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <crypto/sha512_base.h>
#include <linux/crypto.h>
#include <linux/module.h>
diff --git a/arch/arm/crypto/sha512-neon-glue.c b/arch/arm/crypto/sha512-neon-glue.c
index 96cb94403540..c879ad32db51 100644
--- a/arch/arm/crypto/sha512-neon-glue.c
+++ b/arch/arm/crypto/sha512-neon-glue.c
@@ -7,7 +7,7 @@
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <crypto/sha512_base.h>
#include <linux/crypto.h>
#include <linux/module.h>