From 4b6d196c9cec548a6b1cf5bb07b4a8b8d375829d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 22 Feb 2019 22:54:07 -0800 Subject: crypto: arm64/chacha - fix chacha_4block_xor_neon() for big endian The change to encrypt a fifth ChaCha block using scalar instructions caused the chacha20-neon, xchacha20-neon, and xchacha12-neon self-tests to start failing on big endian arm64 kernels. The bug is that the keystream block produced in 32-bit scalar registers is directly XOR'd with the data words, which are loaded and stored in native endianness. Thus in big endian mode the data bytes end up XOR'd with the wrong bytes. Fix it by byte-swapping the keystream words in big endian mode. Fixes: 2fe55987b262 ("crypto: arm64/chacha - use combined SIMD/ALU routine for more speed") Signed-off-by: Eric Biggers Reviewed-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/chacha-neon-core.S | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S index 021bb9e9784b..bfb80e10ff7b 100644 --- a/arch/arm64/crypto/chacha-neon-core.S +++ b/arch/arm64/crypto/chacha-neon-core.S @@ -532,6 +532,10 @@ ENTRY(chacha_4block_xor_neon) add v3.4s, v3.4s, v19.4s add a2, a2, w8 add a3, a3, w9 +CPU_BE( rev a0, a0 ) +CPU_BE( rev a1, a1 ) +CPU_BE( rev a2, a2 ) +CPU_BE( rev a3, a3 ) ld4r {v24.4s-v27.4s}, [x0], #16 ld4r {v28.4s-v31.4s}, [x0] @@ -552,6 +556,10 @@ ENTRY(chacha_4block_xor_neon) add v7.4s, v7.4s, v23.4s add a6, a6, w8 add a7, a7, w9 +CPU_BE( rev a4, a4 ) +CPU_BE( rev a5, a5 ) +CPU_BE( rev a6, a6 ) +CPU_BE( rev a7, a7 ) // x8[0-3] += s2[0] // x9[0-3] += s2[1] @@ -569,6 +577,10 @@ ENTRY(chacha_4block_xor_neon) add v11.4s, v11.4s, v27.4s add a10, a10, w8 add a11, a11, w9 +CPU_BE( rev a8, a8 ) +CPU_BE( rev a9, a9 ) +CPU_BE( rev a10, a10 ) +CPU_BE( rev a11, a11 ) // x12[0-3] += s3[0] // x13[0-3] += s3[1] @@ -586,6 +598,10 @@ ENTRY(chacha_4block_xor_neon) add v15.4s, v15.4s, v31.4s add a14, a14, w8 add a15, a15, w9 +CPU_BE( rev a12, a12 ) +CPU_BE( rev a13, a13 ) +CPU_BE( rev a14, a14 ) +CPU_BE( rev a15, a15 ) // interleave 32-bit words in state n, n+1 ldp w6, w7, [x2], #64 -- cgit v1.2.3-59-g8ed1b From f86d17e9efe010b894db231329ee36b24bcc1b24 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 22 Feb 2019 22:54:08 -0800 Subject: crypto: arm64/chacha - fix hchacha_block_neon() for big endian On big endian arm64 kernels, the xchacha20-neon and xchacha12-neon self-tests fail because hchacha_block_neon() outputs little endian words but the C code expects native endianness. Fix it to output the words in native endianness (which also makes it match the arm32 version). Fixes: cc7cf991e9eb ("crypto: arm64/chacha20 - add XChaCha20 support") Signed-off-by: Eric Biggers Reviewed-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/chacha-neon-core.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S index bfb80e10ff7b..706c4e10e9e2 100644 --- a/arch/arm64/crypto/chacha-neon-core.S +++ b/arch/arm64/crypto/chacha-neon-core.S @@ -158,8 +158,8 @@ ENTRY(hchacha_block_neon) mov w3, w2 bl chacha_permute - st1 {v0.16b}, [x1], #16 - st1 {v3.16b}, [x1] + st1 {v0.4s}, [x1], #16 + st1 {v3.4s}, [x1] ldp x29, x30, [sp], #16 ret -- cgit v1.2.3-59-g8ed1b