aboutsummaryrefslogtreecommitdiffstats
path: root/arch/arm64/crypto/aes-modes.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/crypto/aes-modes.S')
-rw-r--r--arch/arm64/crypto/aes-modes.S1002
1 files changed, 662 insertions, 340 deletions
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 2674d43d1384..5abc834271f4 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
*
* Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
/* included by aes-ce.S and aes-neon.S */
@@ -13,127 +10,66 @@
.text
.align 4
-/*
- * There are several ways to instantiate this code:
- * - no interleave, all inline
- * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
- * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
- * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
- * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
- *
- * Macros imported by this code:
- * - enc_prepare - setup NEON registers for encryption
- * - dec_prepare - setup NEON registers for decryption
- * - enc_switch_key - change to new key after having prepared for encryption
- * - encrypt_block - encrypt a single block
- * - decrypt block - decrypt a single block
- * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
- * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
- * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
- * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
- */
-
-#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
-#define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp
-#define FRAME_POP ldp x29, x30, [sp],#16
+#ifndef MAX_STRIDE
+#define MAX_STRIDE 4
+#endif
-#if INTERLEAVE == 2
+#if MAX_STRIDE == 4
+#define ST4(x...) x
+#define ST5(x...)
+#else
+#define ST4(x...)
+#define ST5(x...) x
+#endif
-aes_encrypt_block2x:
- encrypt_block2x v0, v1, w3, x2, x6, w7
+SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
+ encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret
-ENDPROC(aes_encrypt_block2x)
+SYM_FUNC_END(aes_encrypt_block4x)
-aes_decrypt_block2x:
- decrypt_block2x v0, v1, w3, x2, x6, w7
+SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
+ decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret
-ENDPROC(aes_decrypt_block2x)
-
-#elif INTERLEAVE == 4
+SYM_FUNC_END(aes_decrypt_block4x)
-aes_encrypt_block4x:
- encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
+#if MAX_STRIDE == 5
+SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
+ encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
ret
-ENDPROC(aes_encrypt_block4x)
+SYM_FUNC_END(aes_encrypt_block5x)
-aes_decrypt_block4x:
- decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
+SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
+ decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
ret
-ENDPROC(aes_decrypt_block4x)
-
-#else
-#error INTERLEAVE should equal 2 or 4
-#endif
-
- .macro do_encrypt_block2x
- bl aes_encrypt_block2x
- .endm
-
- .macro do_decrypt_block2x
- bl aes_decrypt_block2x
- .endm
-
- .macro do_encrypt_block4x
- bl aes_encrypt_block4x
- .endm
-
- .macro do_decrypt_block4x
- bl aes_decrypt_block4x
- .endm
-
-#else
-#define FRAME_PUSH
-#define FRAME_POP
-
- .macro do_encrypt_block2x
- encrypt_block2x v0, v1, w3, x2, x6, w7
- .endm
-
- .macro do_decrypt_block2x
- decrypt_block2x v0, v1, w3, x2, x6, w7
- .endm
-
- .macro do_encrypt_block4x
- encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
- .endm
-
- .macro do_decrypt_block4x
- decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
- .endm
-
+SYM_FUNC_END(aes_decrypt_block5x)
#endif
/*
* aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, int first)
+ * int blocks)
* aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, int first)
+ * int blocks)
*/
-AES_ENTRY(aes_ecb_encrypt)
- FRAME_PUSH
- cbz w5, .LecbencloopNx
+AES_FUNC_START(aes_ecb_encrypt)
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
enc_prepare w3, x2, x5
.LecbencloopNx:
-#if INTERLEAVE >= 2
- subs w4, w4, #INTERLEAVE
+ subs w4, w4, #MAX_STRIDE
bmi .Lecbenc1x
-#if INTERLEAVE == 2
- ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
- do_encrypt_block2x
- st1 {v0.16b-v1.16b}, [x0], #32
-#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
- do_encrypt_block4x
+ST4( bl aes_encrypt_block4x )
+ST5( ld1 {v4.16b}, [x1], #16 )
+ST5( bl aes_encrypt_block5x )
st1 {v0.16b-v3.16b}, [x0], #64
-#endif
+ST5( st1 {v4.16b}, [x0], #16 )
b .LecbencloopNx
.Lecbenc1x:
- adds w4, w4, #INTERLEAVE
+ adds w4, w4, #MAX_STRIDE
beq .Lecbencout
-#endif
.Lecbencloop:
ld1 {v0.16b}, [x1], #16 /* get next pt block */
encrypt_block v0, w3, x2, x5, w6
@@ -141,35 +77,30 @@ AES_ENTRY(aes_ecb_encrypt)
subs w4, w4, #1
bne .Lecbencloop
.Lecbencout:
- FRAME_POP
+ ldp x29, x30, [sp], #16
ret
-AES_ENDPROC(aes_ecb_encrypt)
+AES_FUNC_END(aes_ecb_encrypt)
-AES_ENTRY(aes_ecb_decrypt)
- FRAME_PUSH
- cbz w5, .LecbdecloopNx
+AES_FUNC_START(aes_ecb_decrypt)
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
dec_prepare w3, x2, x5
.LecbdecloopNx:
-#if INTERLEAVE >= 2
- subs w4, w4, #INTERLEAVE
+ subs w4, w4, #MAX_STRIDE
bmi .Lecbdec1x
-#if INTERLEAVE == 2
- ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
- do_decrypt_block2x
- st1 {v0.16b-v1.16b}, [x0], #32
-#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
- do_decrypt_block4x
+ST4( bl aes_decrypt_block4x )
+ST5( ld1 {v4.16b}, [x1], #16 )
+ST5( bl aes_decrypt_block5x )
st1 {v0.16b-v3.16b}, [x0], #64
-#endif
+ST5( st1 {v4.16b}, [x0], #16 )
b .LecbdecloopNx
.Lecbdec1x:
- adds w4, w4, #INTERLEAVE
+ adds w4, w4, #MAX_STRIDE
beq .Lecbdecout
-#endif
.Lecbdecloop:
ld1 {v0.16b}, [x1], #16 /* get next ct block */
decrypt_block v0, w3, x2, x5, w6
@@ -177,327 +108,665 @@ AES_ENTRY(aes_ecb_decrypt)
subs w4, w4, #1
bne .Lecbdecloop
.Lecbdecout:
- FRAME_POP
+ ldp x29, x30, [sp], #16
ret
-AES_ENDPROC(aes_ecb_decrypt)
+AES_FUNC_END(aes_ecb_decrypt)
/*
* aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, u8 iv[], int first)
+ * int blocks, u8 iv[])
* aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, u8 iv[], int first)
+ * int blocks, u8 iv[])
+ * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
+ * int rounds, int blocks, u8 iv[],
+ * u32 const rk2[]);
+ * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
+ * int rounds, int blocks, u8 iv[],
+ * u32 const rk2[]);
*/
-AES_ENTRY(aes_cbc_encrypt)
- cbz w6, .Lcbcencloop
+AES_FUNC_START(aes_essiv_cbc_encrypt)
+ ld1 {v4.16b}, [x5] /* get iv */
- ld1 {v0.16b}, [x5] /* get iv */
+ mov w8, #14 /* AES-256: 14 rounds */
+ enc_prepare w8, x6, x7
+ encrypt_block v4, w8, x6, x7, w9
+ enc_switch_key w3, x2, x6
+ b .Lcbcencloop4x
+
+AES_FUNC_START(aes_cbc_encrypt)
+ ld1 {v4.16b}, [x5] /* get iv */
enc_prepare w3, x2, x6
-.Lcbcencloop:
- ld1 {v1.16b}, [x1], #16 /* get next pt block */
- eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */
+.Lcbcencloop4x:
+ subs w4, w4, #4
+ bmi .Lcbcenc1x
+ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
+ eor v0.16b, v0.16b, v4.16b /* ..and xor with iv */
encrypt_block v0, w3, x2, x6, w7
- st1 {v0.16b}, [x0], #16
+ eor v1.16b, v1.16b, v0.16b
+ encrypt_block v1, w3, x2, x6, w7
+ eor v2.16b, v2.16b, v1.16b
+ encrypt_block v2, w3, x2, x6, w7
+ eor v3.16b, v3.16b, v2.16b
+ encrypt_block v3, w3, x2, x6, w7
+ st1 {v0.16b-v3.16b}, [x0], #64
+ mov v4.16b, v3.16b
+ b .Lcbcencloop4x
+.Lcbcenc1x:
+ adds w4, w4, #4
+ beq .Lcbcencout
+.Lcbcencloop:
+ ld1 {v0.16b}, [x1], #16 /* get next pt block */
+ eor v4.16b, v4.16b, v0.16b /* ..and xor with iv */
+ encrypt_block v4, w3, x2, x6, w7
+ st1 {v4.16b}, [x0], #16
subs w4, w4, #1
bne .Lcbcencloop
- st1 {v0.16b}, [x5] /* return iv */
+.Lcbcencout:
+ st1 {v4.16b}, [x5] /* return iv */
ret
-AES_ENDPROC(aes_cbc_encrypt)
+AES_FUNC_END(aes_cbc_encrypt)
+AES_FUNC_END(aes_essiv_cbc_encrypt)
+
+AES_FUNC_START(aes_essiv_cbc_decrypt)
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ ld1 {cbciv.16b}, [x5] /* get iv */
-AES_ENTRY(aes_cbc_decrypt)
- FRAME_PUSH
- cbz w6, .LcbcdecloopNx
+ mov w8, #14 /* AES-256: 14 rounds */
+ enc_prepare w8, x6, x7
+ encrypt_block cbciv, w8, x6, x7, w9
+ b .Lessivcbcdecstart
- ld1 {v7.16b}, [x5] /* get iv */
+AES_FUNC_START(aes_cbc_decrypt)
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ ld1 {cbciv.16b}, [x5] /* get iv */
+.Lessivcbcdecstart:
dec_prepare w3, x2, x6
.LcbcdecloopNx:
-#if INTERLEAVE >= 2
- subs w4, w4, #INTERLEAVE
+ subs w4, w4, #MAX_STRIDE
bmi .Lcbcdec1x
-#if INTERLEAVE == 2
- ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
- mov v2.16b, v0.16b
- mov v3.16b, v1.16b
- do_decrypt_block2x
- eor v0.16b, v0.16b, v7.16b
- eor v1.16b, v1.16b, v2.16b
- mov v7.16b, v3.16b
- st1 {v0.16b-v1.16b}, [x0], #32
-#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
+#if MAX_STRIDE == 5
+ ld1 {v4.16b}, [x1], #16 /* get 1 ct block */
+ mov v5.16b, v0.16b
+ mov v6.16b, v1.16b
+ mov v7.16b, v2.16b
+ bl aes_decrypt_block5x
+ sub x1, x1, #32
+ eor v0.16b, v0.16b, cbciv.16b
+ eor v1.16b, v1.16b, v5.16b
+ ld1 {v5.16b}, [x1], #16 /* reload 1 ct block */
+ ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */
+ eor v2.16b, v2.16b, v6.16b
+ eor v3.16b, v3.16b, v7.16b
+ eor v4.16b, v4.16b, v5.16b
+#else
mov v4.16b, v0.16b
mov v5.16b, v1.16b
mov v6.16b, v2.16b
- do_decrypt_block4x
+ bl aes_decrypt_block4x
sub x1, x1, #16
- eor v0.16b, v0.16b, v7.16b
+ eor v0.16b, v0.16b, cbciv.16b
eor v1.16b, v1.16b, v4.16b
- ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
+ ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */
eor v2.16b, v2.16b, v5.16b
eor v3.16b, v3.16b, v6.16b
- st1 {v0.16b-v3.16b}, [x0], #64
#endif
+ st1 {v0.16b-v3.16b}, [x0], #64
+ST5( st1 {v4.16b}, [x0], #16 )
b .LcbcdecloopNx
.Lcbcdec1x:
- adds w4, w4, #INTERLEAVE
+ adds w4, w4, #MAX_STRIDE
beq .Lcbcdecout
-#endif
.Lcbcdecloop:
ld1 {v1.16b}, [x1], #16 /* get next ct block */
mov v0.16b, v1.16b /* ...and copy to v0 */
decrypt_block v0, w3, x2, x6, w7
- eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
- mov v7.16b, v1.16b /* ct is next iv */
+ eor v0.16b, v0.16b, cbciv.16b /* xor with iv => pt */
+ mov cbciv.16b, v1.16b /* ct is next iv */
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
bne .Lcbcdecloop
.Lcbcdecout:
- FRAME_POP
- st1 {v7.16b}, [x5] /* return iv */
+ st1 {cbciv.16b}, [x5] /* return iv */
+ ldp x29, x30, [sp], #16
ret
-AES_ENDPROC(aes_cbc_decrypt)
+AES_FUNC_END(aes_cbc_decrypt)
+AES_FUNC_END(aes_essiv_cbc_decrypt)
/*
- * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, u8 ctr[], int first)
+ * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
+ * int rounds, int bytes, u8 const iv[])
+ * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
+ * int rounds, int bytes, u8 const iv[])
*/
-AES_ENTRY(aes_ctr_encrypt)
- FRAME_PUSH
- cbz w6, .Lctrnotfirst /* 1st time around? */
+AES_FUNC_START(aes_cbc_cts_encrypt)
+ adr_l x8, .Lcts_permute_table
+ sub x4, x4, #16
+ add x9, x8, #32
+ add x8, x8, x4
+ sub x9, x9, x4
+ ld1 {v3.16b}, [x8]
+ ld1 {v4.16b}, [x9]
+
+ ld1 {v0.16b}, [x1], x4 /* overlapping loads */
+ ld1 {v1.16b}, [x1]
+
+ ld1 {v5.16b}, [x5] /* get iv */
enc_prepare w3, x2, x6
- ld1 {v4.16b}, [x5]
-
-.Lctrnotfirst:
- umov x8, v4.d[1] /* keep swabbed ctr in reg */
- rev x8, x8
-#if INTERLEAVE >= 2
- cmn w8, w4 /* 32 bit overflow? */
- bcs .Lctrloop
-.LctrloopNx:
- subs w4, w4, #INTERLEAVE
- bmi .Lctr1x
-#if INTERLEAVE == 2
- mov v0.8b, v4.8b
- mov v1.8b, v4.8b
- rev x7, x8
- add x8, x8, #1
- ins v0.d[1], x7
- rev x7, x8
- add x8, x8, #1
- ins v1.d[1], x7
- ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */
- do_encrypt_block2x
- eor v0.16b, v0.16b, v2.16b
- eor v1.16b, v1.16b, v3.16b
- st1 {v0.16b-v1.16b}, [x0], #32
-#else
- ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */
- dup v7.4s, w8
- mov v0.16b, v4.16b
- add v7.4s, v7.4s, v8.4s
- mov v1.16b, v4.16b
- rev32 v8.16b, v7.16b
- mov v2.16b, v4.16b
- mov v3.16b, v4.16b
- mov v1.s[3], v8.s[0]
- mov v2.s[3], v8.s[1]
- mov v3.s[3], v8.s[2]
- ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
- do_encrypt_block4x
- eor v0.16b, v5.16b, v0.16b
- ld1 {v5.16b}, [x1], #16 /* get 1 input block */
- eor v1.16b, v6.16b, v1.16b
- eor v2.16b, v7.16b, v2.16b
- eor v3.16b, v5.16b, v3.16b
- st1 {v0.16b-v3.16b}, [x0], #64
- add x8, x8, #INTERLEAVE
-#endif
- rev x7, x8
- ins v4.d[1], x7
- cbz w4, .Lctrout
- b .LctrloopNx
-.Lctr1x:
- adds w4, w4, #INTERLEAVE
- beq .Lctrout
-#endif
-.Lctrloop:
- mov v0.16b, v4.16b
+
+ eor v0.16b, v0.16b, v5.16b /* xor with iv */
+ tbl v1.16b, {v1.16b}, v4.16b
encrypt_block v0, w3, x2, x6, w7
- adds x8, x8, #1 /* increment BE ctr */
- rev x7, x8
- ins v4.d[1], x7
- bcs .Lctrcarry /* overflow? */
+ eor v1.16b, v1.16b, v0.16b
+ tbl v0.16b, {v0.16b}, v3.16b
+ encrypt_block v1, w3, x2, x6, w7
-.Lctrcarrydone:
- subs w4, w4, #1
- bmi .Lctrtailblock /* blocks <0 means tail block */
- ld1 {v3.16b}, [x1], #16
- eor v3.16b, v0.16b, v3.16b
- st1 {v3.16b}, [x0], #16
- bne .Lctrloop
-
-.Lctrout:
- st1 {v4.16b}, [x5] /* return next CTR value */
- FRAME_POP
+ add x4, x0, x4
+ st1 {v0.16b}, [x4] /* overlapping stores */
+ st1 {v1.16b}, [x0]
ret
+AES_FUNC_END(aes_cbc_cts_encrypt)
+
+AES_FUNC_START(aes_cbc_cts_decrypt)
+ adr_l x8, .Lcts_permute_table
+ sub x4, x4, #16
+ add x9, x8, #32
+ add x8, x8, x4
+ sub x9, x9, x4
+ ld1 {v3.16b}, [x8]
+ ld1 {v4.16b}, [x9]
-.Lctrtailblock:
+ ld1 {v0.16b}, [x1], x4 /* overlapping loads */
+ ld1 {v1.16b}, [x1]
+
+ ld1 {v5.16b}, [x5] /* get iv */
+ dec_prepare w3, x2, x6
+
+ decrypt_block v0, w3, x2, x6, w7
+ tbl v2.16b, {v0.16b}, v3.16b
+ eor v2.16b, v2.16b, v1.16b
+
+ tbx v0.16b, {v1.16b}, v4.16b
+ decrypt_block v0, w3, x2, x6, w7
+ eor v0.16b, v0.16b, v5.16b /* xor with iv */
+
+ add x4, x0, x4
+ st1 {v2.16b}, [x4] /* overlapping stores */
st1 {v0.16b}, [x0]
- FRAME_POP
ret
+AES_FUNC_END(aes_cbc_cts_decrypt)
+
+ .section ".rodata", "a"
+ .align 6
+.Lcts_permute_table:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .previous
-.Lctrcarry:
- umov x7, v4.d[0] /* load upper word of ctr */
- rev x7, x7 /* ... to handle the carry */
- add x7, x7, #1
- rev x7, x7
- ins v4.d[0], x7
- b .Lctrcarrydone
-AES_ENDPROC(aes_ctr_encrypt)
- .ltorg
+ /*
+ * This macro generates the code for CTR and XCTR mode.
+ */
+.macro ctr_encrypt xctr
+ // Arguments
+ OUT .req x0
+ IN .req x1
+ KEY .req x2
+ ROUNDS_W .req w3
+ BYTES_W .req w4
+ IV .req x5
+ BYTE_CTR_W .req w6 // XCTR only
+ // Intermediate values
+ CTR_W .req w11 // XCTR only
+ CTR .req x11 // XCTR only
+ IV_PART .req x12
+ BLOCKS .req x13
+ BLOCKS_W .req w13
+
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ enc_prepare ROUNDS_W, KEY, IV_PART
+ ld1 {vctr.16b}, [IV]
+ /*
+ * Keep 64 bits of the IV in a register. For CTR mode this lets us
+ * easily increment the IV. For XCTR mode this lets us efficiently XOR
+ * the 64-bit counter with the IV.
+ */
+ .if \xctr
+ umov IV_PART, vctr.d[0]
+ lsr CTR_W, BYTE_CTR_W, #4
+ .else
+ umov IV_PART, vctr.d[1]
+ rev IV_PART, IV_PART
+ .endif
+
+.LctrloopNx\xctr:
+ add BLOCKS_W, BYTES_W, #15
+ sub BYTES_W, BYTES_W, #MAX_STRIDE << 4
+ lsr BLOCKS_W, BLOCKS_W, #4
+ mov w8, #MAX_STRIDE
+ cmp BLOCKS_W, w8
+ csel BLOCKS_W, BLOCKS_W, w8, lt
/*
+ * Set up the counter values in v0-v{MAX_STRIDE-1}.
+ *
+ * If we are encrypting less than MAX_STRIDE blocks, the tail block
+ * handling code expects the last keystream block to be in
+ * v{MAX_STRIDE-1}. For example: if encrypting two blocks with
+ * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
+ */
+ .if \xctr
+ add CTR, CTR, BLOCKS
+ .else
+ adds IV_PART, IV_PART, BLOCKS
+ .endif
+ mov v0.16b, vctr.16b
+ mov v1.16b, vctr.16b
+ mov v2.16b, vctr.16b
+ mov v3.16b, vctr.16b
+ST5( mov v4.16b, vctr.16b )
+ .if \xctr
+ sub x6, CTR, #MAX_STRIDE - 1
+ sub x7, CTR, #MAX_STRIDE - 2
+ sub x8, CTR, #MAX_STRIDE - 3
+ sub x9, CTR, #MAX_STRIDE - 4
+ST5( sub x10, CTR, #MAX_STRIDE - 5 )
+ eor x6, x6, IV_PART
+ eor x7, x7, IV_PART
+ eor x8, x8, IV_PART
+ eor x9, x9, IV_PART
+ST5( eor x10, x10, IV_PART )
+ mov v0.d[0], x6
+ mov v1.d[0], x7
+ mov v2.d[0], x8
+ mov v3.d[0], x9
+ST5( mov v4.d[0], x10 )
+ .else
+ bcs 0f
+ .subsection 1
+ /*
+ * This subsection handles carries.
+ *
+ * Conditional branching here is allowed with respect to time
+ * invariance since the branches are dependent on the IV instead
+ * of the plaintext or key. This code is rarely executed in
+ * practice anyway.
+ */
+
+ /* Apply carry to outgoing counter. */
+0: umov x8, vctr.d[0]
+ rev x8, x8
+ add x8, x8, #1
+ rev x8, x8
+ ins vctr.d[0], x8
+
+ /*
+ * Apply carry to counter blocks if needed.
+ *
+ * Since the carry flag was set, we know 0 <= IV_PART <
+ * MAX_STRIDE. Using the value of IV_PART we can determine how
+ * many counter blocks need to be updated.
+ */
+ cbz IV_PART, 2f
+ adr x16, 1f
+ sub x16, x16, IV_PART, lsl #3
+ br x16
+ bti c
+ mov v0.d[0], vctr.d[0]
+ bti c
+ mov v1.d[0], vctr.d[0]
+ bti c
+ mov v2.d[0], vctr.d[0]
+ bti c
+ mov v3.d[0], vctr.d[0]
+ST5( bti c )
+ST5( mov v4.d[0], vctr.d[0] )
+1: b 2f
+ .previous
+
+2: rev x7, IV_PART
+ ins vctr.d[1], x7
+ sub x7, IV_PART, #MAX_STRIDE - 1
+ sub x8, IV_PART, #MAX_STRIDE - 2
+ sub x9, IV_PART, #MAX_STRIDE - 3
+ rev x7, x7
+ rev x8, x8
+ mov v1.d[1], x7
+ rev x9, x9
+ST5( sub x10, IV_PART, #MAX_STRIDE - 4 )
+ mov v2.d[1], x8
+ST5( rev x10, x10 )
+ mov v3.d[1], x9
+ST5( mov v4.d[1], x10 )
+ .endif
+
+ /*
+ * If there are at least MAX_STRIDE blocks left, XOR the data with
+ * keystream and store. Otherwise jump to tail handling.
+ */
+ tbnz BYTES_W, #31, .Lctrtail\xctr
+ ld1 {v5.16b-v7.16b}, [IN], #48
+ST4( bl aes_encrypt_block4x )
+ST5( bl aes_encrypt_block5x )
+ eor v0.16b, v5.16b, v0.16b
+ST4( ld1 {v5.16b}, [IN], #16 )
+ eor v1.16b, v6.16b, v1.16b
+ST5( ld1 {v5.16b-v6.16b}, [IN], #32 )
+ eor v2.16b, v7.16b, v2.16b
+ eor v3.16b, v5.16b, v3.16b
+ST5( eor v4.16b, v6.16b, v4.16b )
+ st1 {v0.16b-v3.16b}, [OUT], #64
+ST5( st1 {v4.16b}, [OUT], #16 )
+ cbz BYTES_W, .Lctrout\xctr
+ b .LctrloopNx\xctr
+
+.Lctrout\xctr:
+ .if !\xctr
+ st1 {vctr.16b}, [IV] /* return next CTR value */
+ .endif
+ ldp x29, x30, [sp], #16
+ ret
+
+.Lctrtail\xctr:
+ /*
+ * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
+ *
+ * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
+ * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
+ * v4 should have the next two counter blocks.
+ *
+ * This allows us to store the ciphertext by writing to overlapping
+ * regions of memory. Any invalid ciphertext blocks get overwritten by
+ * correctly computed blocks. This approach greatly simplifies the
+ * logic for storing the ciphertext.
+ */
+ mov x16, #16
+ ands w7, BYTES_W, #0xf
+ csel x13, x7, x16, ne
+
+ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4))
+ST5( csel x14, x16, xzr, gt )
+ cmp BYTES_W, #48 - (MAX_STRIDE << 4)
+ csel x15, x16, xzr, gt
+ cmp BYTES_W, #32 - (MAX_STRIDE << 4)
+ csel x16, x16, xzr, gt
+ cmp BYTES_W, #16 - (MAX_STRIDE << 4)
+
+ adr_l x9, .Lcts_permute_table
+ add x9, x9, x13
+ ble .Lctrtail1x\xctr
+
+ST5( ld1 {v5.16b}, [IN], x14 )
+ ld1 {v6.16b}, [IN], x15
+ ld1 {v7.16b}, [IN], x16
+
+ST4( bl aes_encrypt_block4x )
+ST5( bl aes_encrypt_block5x )
+
+ ld1 {v8.16b}, [IN], x13
+ ld1 {v9.16b}, [IN]
+ ld1 {v10.16b}, [x9]
+
+ST4( eor v6.16b, v6.16b, v0.16b )
+ST4( eor v7.16b, v7.16b, v1.16b )
+ST4( tbl v3.16b, {v3.16b}, v10.16b )
+ST4( eor v8.16b, v8.16b, v2.16b )
+ST4( eor v9.16b, v9.16b, v3.16b )
+
+ST5( eor v5.16b, v5.16b, v0.16b )
+ST5( eor v6.16b, v6.16b, v1.16b )
+ST5( tbl v4.16b, {v4.16b}, v10.16b )
+ST5( eor v7.16b, v7.16b, v2.16b )
+ST5( eor v8.16b, v8.16b, v3.16b )
+ST5( eor v9.16b, v9.16b, v4.16b )
+
+ST5( st1 {v5.16b}, [OUT], x14 )
+ st1 {v6.16b}, [OUT], x15
+ st1 {v7.16b}, [OUT], x16
+ add x13, x13, OUT
+ st1 {v9.16b}, [x13] // overlapping stores
+ st1 {v8.16b}, [OUT]
+ b .Lctrout\xctr
+
+.Lctrtail1x\xctr:
+ /*
+ * Handle <= 16 bytes of plaintext
+ *
+ * This code always reads and writes 16 bytes. To avoid out of bounds
+ * accesses, XCTR and CTR modes must use a temporary buffer when
+ * encrypting/decrypting less than 16 bytes.
+ *
+ * This code is unusual in that it loads the input and stores the output
+ * relative to the end of the buffers rather than relative to the start.
+ * This causes unusual behaviour when encrypting/decrypting less than 16
+ * bytes; the end of the data is expected to be at the end of the
+ * temporary buffer rather than the start of the data being at the start
+ * of the temporary buffer.
+ */
+ sub x8, x7, #16
+ csel x7, x7, x8, eq
+ add IN, IN, x7
+ add OUT, OUT, x7
+ ld1 {v5.16b}, [IN]
+ ld1 {v6.16b}, [OUT]
+ST5( mov v3.16b, v4.16b )
+ encrypt_block v3, ROUNDS_W, KEY, x8, w7
+ ld1 {v10.16b-v11.16b}, [x9]
+ tbl v3.16b, {v3.16b}, v10.16b
+ sshr v11.16b, v11.16b, #7
+ eor v5.16b, v5.16b, v3.16b
+ bif v5.16b, v6.16b, v11.16b
+ st1 {v5.16b}, [OUT]
+ b .Lctrout\xctr
+
+ // Arguments
+ .unreq OUT
+ .unreq IN
+ .unreq KEY
+ .unreq ROUNDS_W
+ .unreq BYTES_W
+ .unreq IV
+ .unreq BYTE_CTR_W // XCTR only
+ // Intermediate values
+ .unreq CTR_W // XCTR only
+ .unreq CTR // XCTR only
+ .unreq IV_PART
+ .unreq BLOCKS
+ .unreq BLOCKS_W
+.endm
+
+ /*
+ * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int bytes, u8 ctr[])
+ *
+ * The input and output buffers must always be at least 16 bytes even if
+ * encrypting/decrypting less than 16 bytes. Otherwise out of bounds
+ * accesses will occur. The data to be encrypted/decrypted is expected
+ * to be at the end of this 16-byte temporary buffer rather than the
+ * start.
+ */
+
+AES_FUNC_START(aes_ctr_encrypt)
+ ctr_encrypt 0
+AES_FUNC_END(aes_ctr_encrypt)
+
+ /*
+ * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int bytes, u8 const iv[], int byte_ctr)
+ *
+ * The input and output buffers must always be at least 16 bytes even if
+ * encrypting/decrypting less than 16 bytes. Otherwise out of bounds
+ * accesses will occur. The data to be encrypted/decrypted is expected
+ * to be at the end of this 16-byte temporary buffer rather than the
+ * start.
+ */
+
+AES_FUNC_START(aes_xctr_encrypt)
+ ctr_encrypt 1
+AES_FUNC_END(aes_xctr_encrypt)
+
+
+ /*
+ * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
+ * int bytes, u8 const rk2[], u8 iv[], int first)
* aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
- * int blocks, u8 const rk2[], u8 iv[], int first)
- * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
- * int blocks, u8 const rk2[], u8 iv[], int first)
+ * int bytes, u8 const rk2[], u8 iv[], int first)
*/
- .macro next_tweak, out, in, const, tmp
+ .macro next_tweak, out, in, tmp
sshr \tmp\().2d, \in\().2d, #63
- and \tmp\().16b, \tmp\().16b, \const\().16b
+ and \tmp\().16b, \tmp\().16b, xtsmask.16b
add \out\().2d, \in\().2d, \in\().2d
ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
eor \out\().16b, \out\().16b, \tmp\().16b
.endm
-.Lxts_mul_x:
-CPU_LE( .quad 1, 0x87 )
-CPU_BE( .quad 0x87, 1 )
+ .macro xts_load_mask, tmp
+ movi xtsmask.2s, #0x1
+ movi \tmp\().2s, #0x87
+ uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s
+ .endm
-AES_ENTRY(aes_xts_encrypt)
- FRAME_PUSH
- cbz w7, .LxtsencloopNx
+AES_FUNC_START(aes_xts_encrypt)
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
ld1 {v4.16b}, [x6]
- enc_prepare w3, x5, x6
- encrypt_block v4, w3, x5, x6, w7 /* first tweak */
- enc_switch_key w3, x2, x6
- ldr q7, .Lxts_mul_x
+ xts_load_mask v8
+ cbz w7, .Lxtsencnotfirst
+
+ enc_prepare w3, x5, x8
+ xts_cts_skip_tw w7, .LxtsencNx
+ encrypt_block v4, w3, x5, x8, w7 /* first tweak */
+ enc_switch_key w3, x2, x8
b .LxtsencNx
+.Lxtsencnotfirst:
+ enc_prepare w3, x2, x8
.LxtsencloopNx:
- ldr q7, .Lxts_mul_x
- next_tweak v4, v4, v7, v8
+ next_tweak v4, v4, v8
.LxtsencNx:
-#if INTERLEAVE >= 2
- subs w4, w4, #INTERLEAVE
+ subs w4, w4, #64
bmi .Lxtsenc1x
-#if INTERLEAVE == 2
- ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
- next_tweak v5, v4, v7, v8
- eor v0.16b, v0.16b, v4.16b
- eor v1.16b, v1.16b, v5.16b
- do_encrypt_block2x
- eor v0.16b, v0.16b, v4.16b
- eor v1.16b, v1.16b, v5.16b
- st1 {v0.16b-v1.16b}, [x0], #32
- cbz w4, .LxtsencoutNx
- next_tweak v4, v5, v7, v8
- b .LxtsencNx
-.LxtsencoutNx:
- mov v4.16b, v5.16b
- b .Lxtsencout
-#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
- next_tweak v5, v4, v7, v8
+ next_tweak v5, v4, v8
eor v0.16b, v0.16b, v4.16b
- next_tweak v6, v5, v7, v8
+ next_tweak v6, v5, v8
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
- next_tweak v7, v6, v7, v8
+ next_tweak v7, v6, v8
eor v3.16b, v3.16b, v7.16b
- do_encrypt_block4x
+ bl aes_encrypt_block4x
eor v3.16b, v3.16b, v7.16b
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v7.16b
- cbz w4, .Lxtsencout
+ cbz w4, .Lxtsencret
+ xts_reload_mask v8
b .LxtsencloopNx
-#endif
.Lxtsenc1x:
- adds w4, w4, #INTERLEAVE
+ adds w4, w4, #64
beq .Lxtsencout
-#endif
+ subs w4, w4, #16
+ bmi .LxtsencctsNx
.Lxtsencloop:
- ld1 {v1.16b}, [x1], #16
- eor v0.16b, v1.16b, v4.16b
- encrypt_block v0, w3, x2, x6, w7
+ ld1 {v0.16b}, [x1], #16
+.Lxtsencctsout:
+ eor v0.16b, v0.16b, v4.16b
+ encrypt_block v0, w3, x2, x8, w7
eor v0.16b, v0.16b, v4.16b
+ cbz w4, .Lxtsencout
+ subs w4, w4, #16
+ next_tweak v4, v4, v8
+ bmi .Lxtsenccts
st1 {v0.16b}, [x0], #16
- subs w4, w4, #1
- beq .Lxtsencout
- next_tweak v4, v4, v7, v8
b .Lxtsencloop
.Lxtsencout:
- FRAME_POP
+ st1 {v0.16b}, [x0]
+.Lxtsencret:
+ st1 {v4.16b}, [x6]
+ ldp x29, x30, [sp], #16
ret
-AES_ENDPROC(aes_xts_encrypt)
-
-AES_ENTRY(aes_xts_decrypt)
- FRAME_PUSH
- cbz w7, .LxtsdecloopNx
+.LxtsencctsNx:
+ mov v0.16b, v3.16b
+ sub x0, x0, #16
+.Lxtsenccts:
+ adr_l x8, .Lcts_permute_table
+
+ add x1, x1, w4, sxtw /* rewind input pointer */
+ add w4, w4, #16 /* # bytes in final block */
+ add x9, x8, #32
+ add x8, x8, x4
+ sub x9, x9, x4
+ add x4, x0, x4 /* output address of final block */
+
+ ld1 {v1.16b}, [x1] /* load final block */
+ ld1 {v2.16b}, [x8]
+ ld1 {v3.16b}, [x9]
+
+ tbl v2.16b, {v0.16b}, v2.16b
+ tbx v0.16b, {v1.16b}, v3.16b
+ st1 {v2.16b}, [x4] /* overlapping stores */
+ mov w4, wzr
+ b .Lxtsencctsout
+AES_FUNC_END(aes_xts_encrypt)
+
+AES_FUNC_START(aes_xts_decrypt)
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ /* subtract 16 bytes if we are doing CTS */
+ sub w8, w4, #0x10
+ tst w4, #0xf
+ csel w4, w4, w8, eq
ld1 {v4.16b}, [x6]
- enc_prepare w3, x5, x6
- encrypt_block v4, w3, x5, x6, w7 /* first tweak */
- dec_prepare w3, x2, x6
- ldr q7, .Lxts_mul_x
+ xts_load_mask v8
+ xts_cts_skip_tw w7, .Lxtsdecskiptw
+ cbz w7, .Lxtsdecnotfirst
+
+ enc_prepare w3, x5, x8
+ encrypt_block v4, w3, x5, x8, w7 /* first tweak */
+.Lxtsdecskiptw:
+ dec_prepare w3, x2, x8
b .LxtsdecNx
+.Lxtsdecnotfirst:
+ dec_prepare w3, x2, x8
.LxtsdecloopNx:
- ldr q7, .Lxts_mul_x
- next_tweak v4, v4, v7, v8
+ next_tweak v4, v4, v8
.LxtsdecNx:
-#if INTERLEAVE >= 2
- subs w4, w4, #INTERLEAVE
+ subs w4, w4, #64
bmi .Lxtsdec1x
-#if INTERLEAVE == 2
- ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
- next_tweak v5, v4, v7, v8
- eor v0.16b, v0.16b, v4.16b
- eor v1.16b, v1.16b, v5.16b
- do_decrypt_block2x
- eor v0.16b, v0.16b, v4.16b
- eor v1.16b, v1.16b, v5.16b
- st1 {v0.16b-v1.16b}, [x0], #32
- cbz w4, .LxtsdecoutNx
- next_tweak v4, v5, v7, v8
- b .LxtsdecNx
-.LxtsdecoutNx:
- mov v4.16b, v5.16b
- b .Lxtsdecout
-#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
- next_tweak v5, v4, v7, v8
+ next_tweak v5, v4, v8
eor v0.16b, v0.16b, v4.16b
- next_tweak v6, v5, v7, v8
+ next_tweak v6, v5, v8
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
- next_tweak v7, v6, v7, v8
+ next_tweak v7, v6, v8
eor v3.16b, v3.16b, v7.16b
- do_decrypt_block4x
+ bl aes_decrypt_block4x
eor v3.16b, v3.16b, v7.16b
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
@@ -505,36 +774,88 @@ AES_ENTRY(aes_xts_decrypt)
st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v7.16b
cbz w4, .Lxtsdecout
+ xts_reload_mask v8
b .LxtsdecloopNx
-#endif
.Lxtsdec1x:
- adds w4, w4, #INTERLEAVE
+ adds w4, w4, #64
beq .Lxtsdecout
-#endif
+ subs w4, w4, #16
.Lxtsdecloop:
- ld1 {v1.16b}, [x1], #16
- eor v0.16b, v1.16b, v4.16b
- decrypt_block v0, w3, x2, x6, w7
+ ld1 {v0.16b}, [x1], #16
+ bmi .Lxtsdeccts
+.Lxtsdecctsout:
+ eor v0.16b, v0.16b, v4.16b
+ decrypt_block v0, w3, x2, x8, w7
eor v0.16b, v0.16b, v4.16b
st1 {v0.16b}, [x0], #16
- subs w4, w4, #1
- beq .Lxtsdecout
- next_tweak v4, v4, v7, v8
+ cbz w4, .Lxtsdecout
+ subs w4, w4, #16
+ next_tweak v4, v4, v8
b .Lxtsdecloop
.Lxtsdecout:
- FRAME_POP
+ st1 {v4.16b}, [x6]
+ ldp x29, x30, [sp], #16
ret
-AES_ENDPROC(aes_xts_decrypt)
+
+.Lxtsdeccts:
+ adr_l x8, .Lcts_permute_table
+
+ add x1, x1, w4, sxtw /* rewind input pointer */
+ add w4, w4, #16 /* # bytes in final block */
+ add x9, x8, #32
+ add x8, x8, x4
+ sub x9, x9, x4
+ add x4, x0, x4 /* output address of final block */
+
+ next_tweak v5, v4, v8
+
+ ld1 {v1.16b}, [x1] /* load final block */
+ ld1 {v2.16b}, [x8]
+ ld1 {v3.16b}, [x9]
+
+ eor v0.16b, v0.16b, v5.16b
+ decrypt_block v0, w3, x2, x8, w7
+ eor v0.16b, v0.16b, v5.16b
+
+ tbl v2.16b, {v0.16b}, v2.16b
+ tbx v0.16b, {v1.16b}, v3.16b
+
+ st1 {v2.16b}, [x4] /* overlapping stores */
+ mov w4, wzr
+ b .Lxtsdecctsout
+AES_FUNC_END(aes_xts_decrypt)
/*
* aes_mac_update(u8 const in[], u32 const rk[], int rounds,
* int blocks, u8 dg[], int enc_before, int enc_after)
*/
-AES_ENTRY(aes_mac_update)
+AES_FUNC_START(aes_mac_update)
ld1 {v0.16b}, [x4] /* get dg */
enc_prepare w2, x1, x7
- cbnz w5, .Lmacenc
+ cbz w5, .Lmacloop4x
+
+ encrypt_block v0, w2, x1, x7, w8
+.Lmacloop4x:
+ subs w3, w3, #4
+ bmi .Lmac1x
+ ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */
+ eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */
+ encrypt_block v0, w2, x1, x7, w8
+ eor v0.16b, v0.16b, v2.16b
+ encrypt_block v0, w2, x1, x7, w8
+ eor v0.16b, v0.16b, v3.16b
+ encrypt_block v0, w2, x1, x7, w8
+ eor v0.16b, v0.16b, v4.16b
+ cmp w3, wzr
+ csinv x5, x6, xzr, eq
+ cbz w5, .Lmacout
+ encrypt_block v0, w2, x1, x7, w8
+ st1 {v0.16b}, [x4] /* return dg */
+ cond_yield .Lmacout, x7, x8
+ b .Lmacloop4x
+.Lmac1x:
+ add w3, w3, #4
.Lmacloop:
cbz w3, .Lmacout
ld1 {v1.16b}, [x0], #16 /* get next pt block */
@@ -550,5 +871,6 @@ AES_ENTRY(aes_mac_update)
.Lmacout:
st1 {v0.16b}, [x4] /* return dg */
+ mov w0, w3
ret
-AES_ENDPROC(aes_mac_update)
+AES_FUNC_END(aes_mac_update)