aboutsummaryrefslogtreecommitdiffstats
path: root/arch/arm64/crypto/crct10dif-ce-core.S
diff options
context:
space:
mode:
authorArd Biesheuvel <ard.biesheuvel@linaro.org>2018-04-30 18:18:28 +0200
committerHerbert Xu <herbert@gondor.apana.org.au>2018-05-12 00:13:11 +0800
commit5b3da651776338eaf6f37b32bc29f1998807cba4 (patch)
tree3b742c44729fae720b6ee79267beb5de0fc92e6e /arch/arm64/crypto/crct10dif-ce-core.S
parentcrypto: arm64/crc32-ce - yield NEON after every block of input (diff)
downloadlinux-dev-5b3da651776338eaf6f37b32bc29f1998807cba4.tar.xz
linux-dev-5b3da651776338eaf6f37b32bc29f1998807cba4.zip
crypto: arm64/crct10dif-ce - yield NEON after every block of input
Avoid excessive scheduling delays under a preemptible kernel by yielding the NEON after every block of input. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/arm64/crypto/crct10dif-ce-core.S')
-rw-r--r--arch/arm64/crypto/crct10dif-ce-core.S32
1 files changed, 28 insertions, 4 deletions
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
index f179c01bd55c..663ea71cdb38 100644
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -74,13 +74,19 @@
.text
.cpu generic+crypto
- arg1_low32 .req w0
- arg2 .req x1
- arg3 .req x2
+ arg1_low32 .req w19
+ arg2 .req x20
+ arg3 .req x21
vzr .req v13
ENTRY(crc_t10dif_pmull)
+ frame_push 3, 128
+
+ mov arg1_low32, w0
+ mov arg2, x1
+ mov arg3, x2
+
movi vzr.16b, #0 // init zero register
// adjust the 16-bit initial_crc value, scale it to 32 bits
@@ -175,8 +181,25 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
subs arg3, arg3, #128
// check if there is another 64B in the buffer to be able to fold
- b.ge _fold_64_B_loop
+ b.lt _fold_64_B_end
+
+ if_will_cond_yield_neon
+ stp q0, q1, [sp, #.Lframe_local_offset]
+ stp q2, q3, [sp, #.Lframe_local_offset + 32]
+ stp q4, q5, [sp, #.Lframe_local_offset + 64]
+ stp q6, q7, [sp, #.Lframe_local_offset + 96]
+ do_cond_yield_neon
+ ldp q0, q1, [sp, #.Lframe_local_offset]
+ ldp q2, q3, [sp, #.Lframe_local_offset + 32]
+ ldp q4, q5, [sp, #.Lframe_local_offset + 64]
+ ldp q6, q7, [sp, #.Lframe_local_offset + 96]
+ ldr_l q10, rk3, x8
+ movi vzr.16b, #0 // init zero register
+ endif_yield_neon
+
+ b _fold_64_B_loop
+_fold_64_B_end:
// at this point, the buffer pointer is pointing at the last y Bytes
// of the buffer the 64B of folded data is in 4 of the vector
// registers: v0, v1, v2, v3
@@ -304,6 +327,7 @@ _barrett:
_cleanup:
// scale the result back to 16 bits
lsr x0, x0, #16
+ frame_pop
ret
_less_than_128: