diff options
author: Eric Biggers <ebiggers@google.com>, 2019-02-22 22:54:07 -0800
committer: Herbert Xu <herbert@gondor.apana.org.au>, 2019-02-28 14:37:48 +0800
commit: 4b6d196c9cec548a6b1cf5bb07b4a8b8d375829d (patch)
parent: crypto: sha512/arm - fix crash bug in Thumb2 build (diff)
crypto: arm64/chacha - fix chacha_4block_xor_neon() for big endian
The change to encrypt a fifth ChaCha block using scalar instructions caused the chacha20-neon, xchacha20-neon, and xchacha12-neon self-tests to start failing on big endian arm64 kernels. The bug is that the keystream block produced in 32-bit scalar registers is directly XOR'd with the data words, which are loaded and stored in native endianness. Thus in big endian mode the data bytes end up XOR'd with the wrong bytes. Fix it by byte-swapping the keystream words in big endian mode.

Fixes: 2fe55987b262 ("crypto: arm64/chacha - use combined SIMD/ALU routine for more speed")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
1 files changed, 16 insertions, 0 deletions
diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S
index 021bb9e9784b..bfb80e10ff7b 100644
--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -532,6 +532,10 @@ ENTRY(chacha_4block_xor_neon)
add v3.4s, v3.4s, v19.4s
add a2, a2, w8
add a3, a3, w9
+CPU_BE( rev a0, a0 )
+CPU_BE( rev a1, a1 )
+CPU_BE( rev a2, a2 )
+CPU_BE( rev a3, a3 )
ld4r {v24.4s-v27.4s}, [x0], #16
ld4r {v28.4s-v31.4s}, [x0]
@@ -552,6 +556,10 @@ ENTRY(chacha_4block_xor_neon)
add v7.4s, v7.4s, v23.4s
add a6, a6, w8
add a7, a7, w9
+CPU_BE( rev a4, a4 )
+CPU_BE( rev a5, a5 )
+CPU_BE( rev a6, a6 )
+CPU_BE( rev a7, a7 )
// x8[0-3] += s2[0]
// x9[0-3] += s2[1]
@@ -569,6 +577,10 @@ ENTRY(chacha_4block_xor_neon)
add v11.4s, v11.4s, v27.4s
add a10, a10, w8
add a11, a11, w9
+CPU_BE( rev a8, a8 )
+CPU_BE( rev a9, a9 )
+CPU_BE( rev a10, a10 )
+CPU_BE( rev a11, a11 )
// x12[0-3] += s3[0]
// x13[0-3] += s3[1]
@@ -586,6 +598,10 @@ ENTRY(chacha_4block_xor_neon)
add v15.4s, v15.4s, v31.4s
add a14, a14, w8
add a15, a15, w9
+CPU_BE( rev a12, a12 )
+CPU_BE( rev a13, a13 )
+CPU_BE( rev a14, a14 )
+CPU_BE( rev a15, a15 )
// interleave 32-bit words in state n, n+1
ldp w6, w7, [x2], #64