author    Ard Biesheuvel <ard.biesheuvel@linaro.org>  2018-07-30 23:06:40 +0200
committer Herbert Xu <herbert@gondor.apana.org.au>    2018-08-07 17:38:04 +0800
commit    71e52c278c54db10e368c54687234390357b08d6 (patch)
tree      f17035f6b122c1de560652d67ccc93044a00ff77  /arch/arm64/crypto/ghash-ce-core.S
parent    Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6 (diff)
crypto: arm64/aes-ce-gcm - operate on two input blocks at a time
Update the core AES/GCM transform and the associated plumbing to operate on 2 AES/GHASH blocks at a time. By itself, this is not expected to result in a noticeable speedup, but it paves the way for reimplementing the GHASH component using 2-way aggregation.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
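As background on the 2-way aggregation the commit message points to, a minimal Python sketch of the GHASH arithmetic (not taken from the kernel sources; function names are illustrative) shows the identity it relies on: because multiplication in GF(2^128) distributes over XOR, (Y xor B0)*H^2 xor B1*H equals ((Y xor B0)*H xor B1)*H, so two blocks can be folded per step using a precomputed H^2. This patch itself still folds each block with a single power of H; the sketch only illustrates what the follow-up change can exploit.

  # Minimal GHASH sketch over GF(2^128), for illustration only; the field
  # multiplication follows the MSB-first definition in NIST SP 800-38D.
  import secrets

  R = 0xE1000000000000000000000000000000  # x^128 + x^7 + x^2 + x + 1

  def gf128_mul(x, y):
      z, v = 0, y
      for i in range(128):
          if (x >> (127 - i)) & 1:
              z ^= v
          v = (v >> 1) ^ R if v & 1 else v >> 1
      return z

  def ghash_serial(h, blocks):
      # One block per step: Y = (Y xor B) * H
      y = 0
      for b in blocks:
          y = gf128_mul(y ^ b, h)
      return y

  def ghash_2way(h, blocks):
      # Two blocks per step: Y = (Y xor B0) * H^2 xor B1 * H
      h2 = gf128_mul(h, h)
      y = 0
      for i in range(0, len(blocks), 2):
          y = gf128_mul(y ^ blocks[i], h2) ^ gf128_mul(blocks[i + 1], h)
      return y

  h = secrets.randbits(128)
  blocks = [secrets.randbits(128) for _ in range(8)]
  assert ghash_serial(h, blocks) == ghash_2way(h, blocks)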
Diffstat (limited to 'arch/arm64/crypto/ghash-ce-core.S')
-rw-r--r--  arch/arm64/crypto/ghash-ce-core.S | 127
1 file changed, 97 insertions(+), 30 deletions(-)
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index c723647b37db..dac0df29d194 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -286,9 +286,10 @@ ENTRY(pmull_ghash_update_p8)
__pmull_ghash p8
ENDPROC(pmull_ghash_update_p8)
- KS .req v8
- CTR .req v9
- INP .req v10
+ KS0 .req v8
+ KS1 .req v9
+ INP0 .req v10
+ INP1 .req v11
.macro load_round_keys, rounds, rk
cmp \rounds, #12
@@ -336,84 +337,146 @@ CPU_LE( rev x8, x8 )
.if \enc == 1
ldr x10, [sp]
- ld1 {KS.16b}, [x10]
+ ld1 {KS0.16b-KS1.16b}, [x10]
.endif
-0: ld1 {CTR.8b}, [x5] // load upper counter
- ld1 {INP.16b}, [x3], #16
+0: ld1 {INP0.16b-INP1.16b}, [x3], #32
+
rev x9, x8
- add x8, x8, #1
- sub w0, w0, #1
- ins CTR.d[1], x9 // set lower counter
+ add x11, x8, #1
+ add x8, x8, #2
.if \enc == 1
- eor INP.16b, INP.16b, KS.16b // encrypt input
- st1 {INP.16b}, [x2], #16
+ eor INP0.16b, INP0.16b, KS0.16b // encrypt input
+ eor INP1.16b, INP1.16b, KS1.16b
.endif
- rev64 T1.16b, INP.16b
+ ld1 {KS0.8b}, [x5] // load upper counter
+ rev x11, x11
+ sub w0, w0, #2
+ mov KS1.8b, KS0.8b
+ ins KS0.d[1], x9 // set lower counter
+ ins KS1.d[1], x11
+
+ rev64 T1.16b, INP0.16b
cmp w7, #12
b.ge 2f // AES-192/256?
-1: enc_round CTR, v21
+1: enc_round KS0, v21
ext T2.16b, XL.16b, XL.16b, #8
ext IN1.16b, T1.16b, T1.16b, #8
- enc_round CTR, v22
+ enc_round KS1, v21
eor T1.16b, T1.16b, T2.16b
eor XL.16b, XL.16b, IN1.16b
- enc_round CTR, v23
+ enc_round KS0, v22
pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
eor T1.16b, T1.16b, XL.16b
- enc_round CTR, v24
+ enc_round KS1, v22
pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
- enc_round CTR, v25
+ enc_round KS0, v23
ext T1.16b, XL.16b, XH.16b, #8
eor T2.16b, XL.16b, XH.16b
eor XM.16b, XM.16b, T1.16b
- enc_round CTR, v26
+ enc_round KS1, v23
eor XM.16b, XM.16b, T2.16b
pmull T2.1q, XL.1d, MASK.1d
- enc_round CTR, v27
+ enc_round KS0, v24
mov XH.d[0], XM.d[1]
mov XM.d[1], XL.d[0]
- enc_round CTR, v28
+ enc_round KS1, v24
eor XL.16b, XM.16b, T2.16b
- enc_round CTR, v29
+ enc_round KS0, v25
ext T2.16b, XL.16b, XL.16b, #8
- aese CTR.16b, v30.16b
+ enc_round KS1, v25
pmull XL.1q, XL.1d, MASK.1d
eor T2.16b, T2.16b, XH.16b
- eor KS.16b, CTR.16b, v31.16b
+ enc_round KS0, v26
+
+ eor XL.16b, XL.16b, T2.16b
+ rev64 T1.16b, INP1.16b
+
+ enc_round KS1, v26
+
+ ext T2.16b, XL.16b, XL.16b, #8
+ ext IN1.16b, T1.16b, T1.16b, #8
+
+ enc_round KS0, v27
+
+ eor T1.16b, T1.16b, T2.16b
+ eor XL.16b, XL.16b, IN1.16b
+
+ enc_round KS1, v27
+
+ pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
+ eor T1.16b, T1.16b, XL.16b
+
+ enc_round KS0, v28
+
+ pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
+ pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
+
+ enc_round KS1, v28
+
+ ext T1.16b, XL.16b, XH.16b, #8
+ eor T2.16b, XL.16b, XH.16b
+ eor XM.16b, XM.16b, T1.16b
+
+ enc_round KS0, v29
+
+ eor XM.16b, XM.16b, T2.16b
+ pmull T2.1q, XL.1d, MASK.1d
+
+ enc_round KS1, v29
+
+ mov XH.d[0], XM.d[1]
+ mov XM.d[1], XL.d[0]
+
+ aese KS0.16b, v30.16b
+
+ eor XL.16b, XM.16b, T2.16b
+
+ aese KS1.16b, v30.16b
+
+ ext T2.16b, XL.16b, XL.16b, #8
+
+ eor KS0.16b, KS0.16b, v31.16b
+
+ pmull XL.1q, XL.1d, MASK.1d
+ eor T2.16b, T2.16b, XH.16b
+
+ eor KS1.16b, KS1.16b, v31.16b
eor XL.16b, XL.16b, T2.16b
.if \enc == 0
- eor INP.16b, INP.16b, KS.16b
- st1 {INP.16b}, [x2], #16
+ eor INP0.16b, INP0.16b, KS0.16b
+ eor INP1.16b, INP1.16b, KS1.16b
.endif
+ st1 {INP0.16b-INP1.16b}, [x2], #32
+
cbnz w0, 0b
CPU_LE( rev x8, x8 )
@@ -421,16 +484,20 @@ CPU_LE( rev x8, x8 )
str x8, [x5, #8] // store lower counter
.if \enc == 1
- st1 {KS.16b}, [x10]
+ st1 {KS0.16b-KS1.16b}, [x10]
.endif
ret
2: b.eq 3f // AES-192?
- enc_round CTR, v17
- enc_round CTR, v18
-3: enc_round CTR, v19
- enc_round CTR, v20
+ enc_round KS0, v17
+ enc_round KS1, v17
+ enc_round KS0, v18
+ enc_round KS1, v18
+3: enc_round KS0, v19
+ enc_round KS1, v19
+ enc_round KS0, v20
+ enc_round KS1, v20
b 1b
.endm
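For readers following the counter handling in the hunk above: each pass through the loop now consumes two counter blocks, keeping the upper 64 bits fixed (loaded from [x5] into KS0/KS1) and inserting the big-endian lower-counter values for ctr and ctr+1 (x9 and x11) before advancing the counter by two. A rough Python illustration of that per-iteration bookkeeping, with hypothetical names and assuming a wrapping 64-bit lower counter:

  import struct

  def next_two_counter_blocks(upper8, ctr):
      # Illustrative only: build the two 16-byte counter blocks used per
      # iteration. The upper 64 bits stay fixed; the lower 64 bits are the
      # big-endian values ctr and ctr + 1, and the caller advances by 2.
      mask = (1 << 64) - 1
      b0 = upper8 + struct.pack(">Q", ctr & mask)
      b1 = upper8 + struct.pack(">Q", (ctr + 1) & mask)
      return b0, b1, (ctr + 2) & mask

  # Example: produce the counter blocks for one loop iteration.
  blk0, blk1, ctr = next_two_counter_blocks(b"\x00" * 8, 2)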