aboutsummaryrefslogtreecommitdiffstats
path: root/arch/arm64
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-03-05 09:09:55 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2019-03-05 09:09:55 -0800
commit63bdf4284c38a48af21745ceb148a087b190cd21 (patch)
treeffbf9e69ed457e776db0317903ccb0addbd1b276 /arch/arm64
parentMerge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next (diff)
parentcrypto: s5p - add AES support for Exynos5433 (diff)
downloadlinux-dev-63bdf4284c38a48af21745ceb148a087b190cd21.tar.xz
linux-dev-63bdf4284c38a48af21745ceb148a087b190cd21.zip
Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu: "API: - Add helper for simple skcipher modes. - Add helper to register multiple templates. - Set CRYPTO_TFM_NEED_KEY when setkey fails. - Require neither or both of export/import in shash. - AEAD decryption test vectors are now generated from encryption ones. - New option CONFIG_CRYPTO_MANAGER_EXTRA_TESTS that includes random fuzzing. Algorithms: - Conversions to skcipher and helper for many templates. - Add more test vectors for nhpoly1305 and adiantum. Drivers: - Add crypto4xx prng support. - Add xcbc/cmac/ecb support in caam. - Add AES support for Exynos5433 in s5p. - Remove sha384/sha512 from artpec7 as hardware cannot do partial hash" [ There is a merge of the Freescale SoC tree in order to pull in changes required by patches to the caam/qi2 driver. ] * 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (174 commits) crypto: s5p - add AES support for Exynos5433 dt-bindings: crypto: document Exynos5433 SlimSSS crypto: crypto4xx - add missing of_node_put after of_device_is_available crypto: cavium/zip - fix collision with generic cra_driver_name crypto: af_alg - use struct_size() in sock_kfree_s() crypto: caam - remove redundant likely/unlikely annotation crypto: s5p - update iv after AES-CBC op end crypto: x86/poly1305 - Clear key material from stack in SSE2 variant crypto: caam - generate hash keys in-place crypto: caam - fix DMA mapping xcbc key twice crypto: caam - fix hash context DMA unmap size hwrng: bcm2835 - fix probe as platform device crypto: s5p-sss - Use AES_BLOCK_SIZE define instead of number crypto: stm32 - drop pointless static qualifier in stm32_hash_remove() crypto: chelsio - Fixed Traffic Stall crypto: marvell - Remove set but not used variable 'ivsize' crypto: ccp - Update driver messages to remove some confusion crypto: adiantum - add 1536 and 4096-byte test vectors crypto: nhpoly1305 - add a test vector with len % 16 != 0 crypto: arm/aes-ce - update IV after partial final CTR block ...
Diffstat (limited to 'arch/arm64')
-rw-r--r--arch/arm64/crypto/aes-ce-ccm-core.S5
-rw-r--r--arch/arm64/crypto/aes-ce-ccm-glue.c8
-rw-r--r--arch/arm64/crypto/aes-modes.S3
-rw-r--r--arch/arm64/crypto/aes-neonbs-core.S8
-rw-r--r--arch/arm64/crypto/crct10dif-ce-core.S513
-rw-r--r--arch/arm64/crypto/crct10dif-ce-glue.c75
-rw-r--r--arch/arm64/crypto/ghash-ce-glue.c118
7 files changed, 383 insertions, 347 deletions
diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
index e3a375c4cb83..1b151442dac1 100644
--- a/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -74,12 +74,13 @@ ENTRY(ce_aes_ccm_auth_data)
beq 10f
ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */
b 7b
-8: mov w7, w8
+8: cbz w8, 91f
+ mov w7, w8
add w8, w8, #16
9: ext v1.16b, v1.16b, v1.16b, #1
adds w7, w7, #1
bne 9b
- eor v0.16b, v0.16b, v1.16b
+91: eor v0.16b, v0.16b, v1.16b
st1 {v0.16b}, [x0]
10: str w8, [x3]
ret
diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
index 68b11aa690e4..5fc6f51908fd 100644
--- a/arch/arm64/crypto/aes-ce-ccm-glue.c
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -125,7 +125,7 @@ static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
abytes -= added;
}
- while (abytes > AES_BLOCK_SIZE) {
+ while (abytes >= AES_BLOCK_SIZE) {
__aes_arm64_encrypt(key->key_enc, mac, mac,
num_rounds(key));
crypto_xor(mac, in, AES_BLOCK_SIZE);
@@ -139,8 +139,6 @@ static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
num_rounds(key));
crypto_xor(mac, in, abytes);
*macp = abytes;
- } else {
- *macp = 0;
}
}
}
@@ -255,7 +253,7 @@ static int ccm_encrypt(struct aead_request *req)
/* preserve the original iv for the final round */
memcpy(buf, req->iv, AES_BLOCK_SIZE);
- err = skcipher_walk_aead_encrypt(&walk, req, true);
+ err = skcipher_walk_aead_encrypt(&walk, req, false);
if (may_use_simd()) {
while (walk.nbytes) {
@@ -313,7 +311,7 @@ static int ccm_decrypt(struct aead_request *req)
/* preserve the original iv for the final round */
memcpy(buf, req->iv, AES_BLOCK_SIZE);
- err = skcipher_walk_aead_decrypt(&walk, req, true);
+ err = skcipher_walk_aead_decrypt(&walk, req, false);
if (may_use_simd()) {
while (walk.nbytes) {
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 67700045a0e0..4c7ce231963c 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -320,8 +320,7 @@ AES_ENTRY(aes_ctr_encrypt)
.Lctrtailblock:
st1 {v0.16b}, [x0]
- ldp x29, x30, [sp], #16
- ret
+ b .Lctrout
.Lctrcarry:
umov x7, v4.d[0] /* load upper word of ctr */
diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S
index e613a87f8b53..8432c8d0dea6 100644
--- a/arch/arm64/crypto/aes-neonbs-core.S
+++ b/arch/arm64/crypto/aes-neonbs-core.S
@@ -971,18 +971,22 @@ CPU_LE( rev x8, x8 )
8: next_ctr v0
st1 {v0.16b}, [x24]
- cbz x23, 0f
+ cbz x23, .Lctr_done
cond_yield_neon 98b
b 99b
-0: frame_pop
+.Lctr_done:
+ frame_pop
ret
/*
* If we are handling the tail of the input (x6 != NULL), return the
* final keystream block back to the caller.
*/
+0: cbz x25, 8b
+ st1 {v0.16b}, [x25]
+ b 8b
1: cbz x25, 8b
st1 {v1.16b}, [x25]
b 8b
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
index 9e82e8e8ed05..e545b42e6a46 100644
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -2,12 +2,14 @@
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//
+// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
@@ -54,19 +56,11 @@
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
-// Function API:
-// UINT16 crc_t10dif_pcl(
-// UINT16 init_crc, //initial CRC value, 16 bits
-// const unsigned char *buf, //buffer pointer to calculate CRC on
-// UINT64 len //buffer length in bytes (64-bit data)
-// );
-//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
-//
#include <linux/linkage.h>
#include <asm/assembler.h>
@@ -74,14 +68,14 @@
.text
.cpu generic+crypto
- arg1_low32 .req w19
- arg2 .req x20
- arg3 .req x21
+ init_crc .req w19
+ buf .req x20
+ len .req x21
+ fold_consts_ptr .req x22
- vzr .req v13
+ fold_consts .req v10
ad .req v14
- bd .req v10
k00_16 .req v15
k32_48 .req v16
@@ -143,11 +137,11 @@ __pmull_p8_core:
ext t5.8b, ad.8b, ad.8b, #2 // A2
ext t6.8b, ad.8b, ad.8b, #3 // A3
- pmull t4.8h, t4.8b, bd.8b // F = A1*B
+ pmull t4.8h, t4.8b, fold_consts.8b // F = A1*B
pmull t8.8h, ad.8b, bd1.8b // E = A*B1
- pmull t5.8h, t5.8b, bd.8b // H = A2*B
+ pmull t5.8h, t5.8b, fold_consts.8b // H = A2*B
pmull t7.8h, ad.8b, bd2.8b // G = A*B2
- pmull t6.8h, t6.8b, bd.8b // J = A3*B
+ pmull t6.8h, t6.8b, fold_consts.8b // J = A3*B
pmull t9.8h, ad.8b, bd3.8b // I = A*B3
pmull t3.8h, ad.8b, bd4.8b // K = A*B4
b 0f
@@ -157,11 +151,11 @@ __pmull_p8_core:
tbl t5.16b, {ad.16b}, perm2.16b // A2
tbl t6.16b, {ad.16b}, perm3.16b // A3
- pmull2 t4.8h, t4.16b, bd.16b // F = A1*B
+ pmull2 t4.8h, t4.16b, fold_consts.16b // F = A1*B
pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1
- pmull2 t5.8h, t5.16b, bd.16b // H = A2*B
+ pmull2 t5.8h, t5.16b, fold_consts.16b // H = A2*B
pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2
- pmull2 t6.8h, t6.16b, bd.16b // J = A3*B
+ pmull2 t6.8h, t6.16b, fold_consts.16b // J = A3*B
pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3
pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4
@@ -203,14 +197,14 @@ __pmull_p8_core:
ENDPROC(__pmull_p8_core)
.macro __pmull_p8, rq, ad, bd, i
- .ifnc \bd, v10
+ .ifnc \bd, fold_consts
.err
.endif
mov ad.16b, \ad\().16b
.ifb \i
- pmull \rq\().8h, \ad\().8b, bd.8b // D = A*B
+ pmull \rq\().8h, \ad\().8b, \bd\().8b // D = A*B
.else
- pmull2 \rq\().8h, \ad\().16b, bd.16b // D = A*B
+ pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B
.endif
bl .L__pmull_p8_core\i
@@ -219,17 +213,19 @@ ENDPROC(__pmull_p8_core)
eor \rq\().16b, \rq\().16b, t6.16b
.endm
- .macro fold64, p, reg1, reg2
- ldp q11, q12, [arg2], #0x20
+ // Fold reg1, reg2 into the next 32 data bytes, storing the result back
+ // into reg1, reg2.
+ .macro fold_32_bytes, p, reg1, reg2
+ ldp q11, q12, [buf], #0x20
- __pmull_\p v8, \reg1, v10, 2
- __pmull_\p \reg1, \reg1, v10
+ __pmull_\p v8, \reg1, fold_consts, 2
+ __pmull_\p \reg1, \reg1, fold_consts
CPU_LE( rev64 v11.16b, v11.16b )
CPU_LE( rev64 v12.16b, v12.16b )
- __pmull_\p v9, \reg2, v10, 2
- __pmull_\p \reg2, \reg2, v10
+ __pmull_\p v9, \reg2, fold_consts, 2
+ __pmull_\p \reg2, \reg2, fold_consts
CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
@@ -240,15 +236,16 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
eor \reg2\().16b, \reg2\().16b, v12.16b
.endm
- .macro fold16, p, reg, rk
- __pmull_\p v8, \reg, v10
- __pmull_\p \reg, \reg, v10, 2
- .ifnb \rk
- ldr_l q10, \rk, x8
- __pmull_pre_\p v10
+ // Fold src_reg into dst_reg, optionally loading the next fold constants
+ .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts
+ __pmull_\p v8, \src_reg, fold_consts
+ __pmull_\p \src_reg, \src_reg, fold_consts, 2
+ .ifnb \load_next_consts
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+ __pmull_pre_\p fold_consts
.endif
- eor v7.16b, v7.16b, v8.16b
- eor v7.16b, v7.16b, \reg\().16b
+ eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
+ eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
.endm
.macro __pmull_p64, rd, rn, rm, n
@@ -260,40 +257,27 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
.endm
.macro crc_t10dif_pmull, p
- frame_push 3, 128
+ frame_push 4, 128
- mov arg1_low32, w0
- mov arg2, x1
- mov arg3, x2
-
- movi vzr.16b, #0 // init zero register
+ mov init_crc, w0
+ mov buf, x1
+ mov len, x2
__pmull_init_\p
- // adjust the 16-bit initial_crc value, scale it to 32 bits
- lsl arg1_low32, arg1_low32, #16
-
- // check if smaller than 256
- cmp arg3, #256
-
- // for sizes less than 128, we can't fold 64B at a time...
- b.lt .L_less_than_128_\@
+ // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+ cmp len, #256
+ b.lt .Lless_than_256_bytes_\@
- // load the initial crc value
- // crc value does not need to be byte-reflected, but it needs
- // to be moved to the high part of the register.
- // because data will be byte-reflected and will align with
- // initial crc at correct place.
- movi v10.16b, #0
- mov v10.s[3], arg1_low32 // initial crc
-
- // receive the initial 64B data, xor the initial crc value
- ldp q0, q1, [arg2]
- ldp q2, q3, [arg2, #0x20]
- ldp q4, q5, [arg2, #0x40]
- ldp q6, q7, [arg2, #0x60]
- add arg2, arg2, #0x80
+ adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts
+ // Load the first 128 data bytes. Byte swapping is necessary to make
+ // the bit order match the polynomial coefficient order.
+ ldp q0, q1, [buf]
+ ldp q2, q3, [buf, #0x20]
+ ldp q4, q5, [buf, #0x40]
+ ldp q6, q7, [buf, #0x60]
+ add buf, buf, #0x80
CPU_LE( rev64 v0.16b, v0.16b )
CPU_LE( rev64 v1.16b, v1.16b )
CPU_LE( rev64 v2.16b, v2.16b )
@@ -302,7 +286,6 @@ CPU_LE( rev64 v4.16b, v4.16b )
CPU_LE( rev64 v5.16b, v5.16b )
CPU_LE( rev64 v6.16b, v6.16b )
CPU_LE( rev64 v7.16b, v7.16b )
-
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
@@ -312,36 +295,29 @@ CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
- // XOR the initial_crc value
- eor v0.16b, v0.16b, v10.16b
-
- ldr_l q10, rk3, x8 // xmm10 has rk3 and rk4
- // type of pmull instruction
- // will determine which constant to use
- __pmull_pre_\p v10
+ // XOR the first 16 data *bits* with the initial CRC value.
+ movi v8.16b, #0
+ mov v8.h[7], init_crc
+ eor v0.16b, v0.16b, v8.16b
- //
- // we subtract 256 instead of 128 to save one instruction from the loop
- //
- sub arg3, arg3, #256
+ // Load the constants for folding across 128 bytes.
+ ld1 {fold_consts.2d}, [fold_consts_ptr]
+ __pmull_pre_\p fold_consts
- // at this section of the code, there is 64*x+y (0<=y<64) bytes of
- // buffer. The _fold_64_B_loop will fold 64B at a time
- // until we have 64+y Bytes of buffer
+ // Subtract 128 for the 128 data bytes just consumed. Subtract another
+ // 128 to simplify the termination condition of the following loop.
+ sub len, len, #256
- // fold 64B at a time. This section of the code folds 4 vector
- // registers in parallel
-.L_fold_64_B_loop_\@:
+ // While >= 128 data bytes remain (not counting v0-v7), fold the 128
+ // bytes v0-v7 into them, storing the result back into v0-v7.
+.Lfold_128_bytes_loop_\@:
+ fold_32_bytes \p, v0, v1
+ fold_32_bytes \p, v2, v3
+ fold_32_bytes \p, v4, v5
+ fold_32_bytes \p, v6, v7
- fold64 \p, v0, v1
- fold64 \p, v2, v3
- fold64 \p, v4, v5
- fold64 \p, v6, v7
-
- subs arg3, arg3, #128
-
- // check if there is another 64B in the buffer to be able to fold
- b.lt .L_fold_64_B_end_\@
+ subs len, len, #128
+ b.lt .Lfold_128_bytes_loop_done_\@
if_will_cond_yield_neon
stp q0, q1, [sp, #.Lframe_local_offset]
@@ -353,228 +329,207 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
ldp q2, q3, [sp, #.Lframe_local_offset + 32]
ldp q4, q5, [sp, #.Lframe_local_offset + 64]
ldp q6, q7, [sp, #.Lframe_local_offset + 96]
- ldr_l q10, rk3, x8
- movi vzr.16b, #0 // init zero register
+ ld1 {fold_consts.2d}, [fold_consts_ptr]
__pmull_init_\p
- __pmull_pre_\p v10
+ __pmull_pre_\p fold_consts
endif_yield_neon
- b .L_fold_64_B_loop_\@
-
-.L_fold_64_B_end_\@:
- // at this point, the buffer pointer is pointing at the last y Bytes
- // of the buffer the 64B of folded data is in 4 of the vector
- // registers: v0, v1, v2, v3
-
- // fold the 8 vector registers to 1 vector register with different
- // constants
-
- ldr_l q10, rk9, x8
- __pmull_pre_\p v10
-
- fold16 \p, v0, rk11
- fold16 \p, v1, rk13
- fold16 \p, v2, rk15
- fold16 \p, v3, rk17
- fold16 \p, v4, rk19
- fold16 \p, v5, rk1
- fold16 \p, v6
-
- // instead of 64, we add 48 to the loop counter to save 1 instruction
- // from the loop instead of a cmp instruction, we use the negative
- // flag with the jl instruction
- adds arg3, arg3, #(128-16)
- b.lt .L_final_reduction_for_128_\@
-
- // now we have 16+y bytes left to reduce. 16 Bytes is in register v7
- // and the rest is in memory. We can fold 16 bytes at a time if y>=16
- // continue folding 16B at a time
-
-.L_16B_reduction_loop_\@:
- __pmull_\p v8, v7, v10
- __pmull_\p v7, v7, v10, 2
+ b .Lfold_128_bytes_loop_\@
+
+.Lfold_128_bytes_loop_done_\@:
+
+ // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
+
+ // Fold across 64 bytes.
+ add fold_consts_ptr, fold_consts_ptr, #16
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+ __pmull_pre_\p fold_consts
+ fold_16_bytes \p, v0, v4
+ fold_16_bytes \p, v1, v5
+ fold_16_bytes \p, v2, v6
+ fold_16_bytes \p, v3, v7, 1
+ // Fold across 32 bytes.
+ fold_16_bytes \p, v4, v6
+ fold_16_bytes \p, v5, v7, 1
+ // Fold across 16 bytes.
+ fold_16_bytes \p, v6, v7
+
+ // Add 128 to get the correct number of data bytes remaining in 0...127
+ // (not counting v7), following the previous extra subtraction by 128.
+ // Then subtract 16 to simplify the termination condition of the
+ // following loop.
+ adds len, len, #(128-16)
+
+ // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
+ // into them, storing the result back into v7.
+ b.lt .Lfold_16_bytes_loop_done_\@
+.Lfold_16_bytes_loop_\@:
+ __pmull_\p v8, v7, fold_consts
+ __pmull_\p v7, v7, fold_consts, 2
eor v7.16b, v7.16b, v8.16b
-
- ldr q0, [arg2], #16
+ ldr q0, [buf], #16
CPU_LE( rev64 v0.16b, v0.16b )
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
eor v7.16b, v7.16b, v0.16b
- subs arg3, arg3, #16
-
- // instead of a cmp instruction, we utilize the flags with the
- // jge instruction equivalent of: cmp arg3, 16-16
- // check if there is any more 16B in the buffer to be able to fold
- b.ge .L_16B_reduction_loop_\@
-
- // now we have 16+z bytes left to reduce, where 0<= z < 16.
- // first, we reduce the data in the xmm7 register
-
-.L_final_reduction_for_128_\@:
- // check if any more data to fold. If not, compute the CRC of
- // the final 128 bits
- adds arg3, arg3, #16
- b.eq .L_128_done_\@
-
- // here we are getting data that is less than 16 bytes.
- // since we know that there was data before the pointer, we can
- // offset the input pointer before the actual point, to receive
- // exactly 16 bytes. after that the registers need to be adjusted.
-.L_get_last_two_regs_\@:
- add arg2, arg2, arg3
- ldr q1, [arg2, #-16]
-CPU_LE( rev64 v1.16b, v1.16b )
-CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
-
- // get rid of the extra data that was loaded before
- // load the shift constant
- adr_l x4, tbl_shf_table + 16
- sub x4, x4, arg3
- ld1 {v0.16b}, [x4]
-
- // shift v2 to the left by arg3 bytes
- tbl v2.16b, {v7.16b}, v0.16b
-
- // shift v7 to the right by 16-arg3 bytes
- movi v9.16b, #0x80
- eor v0.16b, v0.16b, v9.16b
- tbl v7.16b, {v7.16b}, v0.16b
-
- // blend
- sshr v0.16b, v0.16b, #7 // convert to 8-bit mask
- bsl v0.16b, v2.16b, v1.16b
-
- // fold 16 Bytes
- __pmull_\p v8, v7, v10
- __pmull_\p v7, v7, v10, 2
- eor v7.16b, v7.16b, v8.16b
- eor v7.16b, v7.16b, v0.16b
+ subs len, len, #16
+ b.ge .Lfold_16_bytes_loop_\@
+
+.Lfold_16_bytes_loop_done_\@:
+ // Add 16 to get the correct number of data bytes remaining in 0...15
+ // (not counting v7), following the previous extra subtraction by 16.
+ adds len, len, #16
+ b.eq .Lreduce_final_16_bytes_\@
+
+.Lhandle_partial_segment_\@:
+ // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+ // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To
+ // do this without needing a fold constant for each possible 'len',
+ // redivide the bytes into a first chunk of 'len' bytes and a second
+ // chunk of 16 bytes, then fold the first chunk into the second.
+
+ // v0 = last 16 original data bytes
+ add buf, buf, len
+ ldr q0, [buf, #-16]
+CPU_LE( rev64 v0.16b, v0.16b )
+CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
-.L_128_done_\@:
- // compute crc of a 128-bit value
- ldr_l q10, rk5, x8 // rk5 and rk6 in xmm10
- __pmull_pre_\p v10
+ // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
+ adr_l x4, .Lbyteshift_table + 16
+ sub x4, x4, len
+ ld1 {v2.16b}, [x4]
+ tbl v1.16b, {v7.16b}, v2.16b
- // 64b fold
- ext v0.16b, vzr.16b, v7.16b, #8
- mov v7.d[0], v7.d[1]
- __pmull_\p v7, v7, v10
- eor v7.16b, v7.16b, v0.16b
+ // v3 = first chunk: v7 right-shifted by '16-len' bytes.
+ movi v3.16b, #0x80
+ eor v2.16b, v2.16b, v3.16b
+ tbl v3.16b, {v7.16b}, v2.16b
- // 32b fold
- ext v0.16b, v7.16b, vzr.16b, #4
- mov v7.s[3], vzr.s[0]
- __pmull_\p v0, v0, v10, 2
- eor v7.16b, v7.16b, v0.16b
+ // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+ sshr v2.16b, v2.16b, #7
- // barrett reduction
- ldr_l q10, rk7, x8
- __pmull_pre_\p v10
- mov v0.d[0], v7.d[1]
+ // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
+ // then '16-len' bytes from v1 (high-order bytes).
+ bsl v2.16b, v1.16b, v0.16b
- __pmull_\p v0, v0, v10
- ext v0.16b, vzr.16b, v0.16b, #12
- __pmull_\p v0, v0, v10, 2
- ext v0.16b, vzr.16b, v0.16b, #12
+ // Fold the first chunk into the second chunk, storing the result in v7.
+ __pmull_\p v0, v3, fold_consts
+ __pmull_\p v7, v3, fold_consts, 2
eor v7.16b, v7.16b, v0.16b
- mov w0, v7.s[1]
-
-.L_cleanup_\@:
- // scale the result back to 16 bits
- lsr x0, x0, #16
+ eor v7.16b, v7.16b, v2.16b
+
+.Lreduce_final_16_bytes_\@:
+ // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
+
+ movi v2.16b, #0 // init zero register
+
+ // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+ __pmull_pre_\p fold_consts
+
+ // Fold the high 64 bits into the low 64 bits, while also multiplying by
+ // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+ // whose low 48 bits are 0.
+ ext v0.16b, v2.16b, v7.16b, #8
+ __pmull_\p v7, v7, fold_consts, 2 // high bits * x^48 * (x^80 mod G(x))
+ eor v0.16b, v0.16b, v7.16b // + low bits * x^64
+
+ // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+ // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+ ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
+ mov v0.s[3], v2.s[0] // zero high 32 bits
+ __pmull_\p v1, v1, fold_consts // high 32 bits * x^48 * (x^48 mod G(x))
+ eor v0.16b, v0.16b, v1.16b // + low bits
+
+ // Load G(x) and floor(x^48 / G(x)).
+ ld1 {fold_consts.2d}, [fold_consts_ptr]
+ __pmull_pre_\p fold_consts
+
+ // Use Barrett reduction to compute the final CRC value.
+ __pmull_\p v1, v0, fold_consts, 2 // high 32 bits * floor(x^48 / G(x))
+ ushr v1.2d, v1.2d, #32 // /= x^32
+ __pmull_\p v1, v1, fold_consts // *= G(x)
+ ushr v0.2d, v0.2d, #48
+ eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
+ // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+ umov w0, v0.h[0]
frame_pop
ret
-.L_less_than_128_\@:
- cbz arg3, .L_cleanup_\@
+.Lless_than_256_bytes_\@:
+ // Checksumming a buffer of length 16...255 bytes
- movi v0.16b, #0
- mov v0.s[3], arg1_low32 // get the initial crc value
+ adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts
- ldr q7, [arg2], #0x10
+ // Load the first 16 data bytes.
+ ldr q7, [buf], #0x10
CPU_LE( rev64 v7.16b, v7.16b )
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
- eor v7.16b, v7.16b, v0.16b // xor the initial crc value
-
- cmp arg3, #16
- b.eq .L_128_done_\@ // exactly 16 left
- b.lt .L_less_than_16_left_\@
-
- ldr_l q10, rk1, x8 // rk1 and rk2 in xmm10
- __pmull_pre_\p v10
-
- // update the counter. subtract 32 instead of 16 to save one
- // instruction from the loop
- subs arg3, arg3, #32
- b.ge .L_16B_reduction_loop_\@
-
- add arg3, arg3, #16
- b .L_get_last_two_regs_\@
-
-.L_less_than_16_left_\@:
- // shl r9, 4
- adr_l x0, tbl_shf_table + 16
- sub x0, x0, arg3
- ld1 {v0.16b}, [x0]
- movi v9.16b, #0x80
- eor v0.16b, v0.16b, v9.16b
- tbl v7.16b, {v7.16b}, v0.16b
- b .L_128_done_\@
+
+ // XOR the first 16 data *bits* with the initial CRC value.
+ movi v0.16b, #0
+ mov v0.h[7], init_crc
+ eor v7.16b, v7.16b, v0.16b
+
+ // Load the fold-across-16-bytes constants.
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+ __pmull_pre_\p fold_consts
+
+ cmp len, #16
+ b.eq .Lreduce_final_16_bytes_\@ // len == 16
+ subs len, len, #32
+ b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
+ add len, len, #16
+ b .Lhandle_partial_segment_\@ // 17 <= len <= 31
.endm
+//
+// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
ENTRY(crc_t10dif_pmull_p8)
crc_t10dif_pmull p8
ENDPROC(crc_t10dif_pmull_p8)
.align 5
+//
+// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
ENTRY(crc_t10dif_pmull_p64)
crc_t10dif_pmull p64
ENDPROC(crc_t10dif_pmull_p64)
-// precomputed constants
-// these constants are precomputed from the poly:
-// 0x8bb70000 (0x8bb7 scaled to 32 bits)
.section ".rodata", "a"
.align 4
-// Q = 0x18BB70000
-// rk1 = 2^(32*3) mod Q << 32
-// rk2 = 2^(32*5) mod Q << 32
-// rk3 = 2^(32*15) mod Q << 32
-// rk4 = 2^(32*17) mod Q << 32
-// rk5 = 2^(32*3) mod Q << 32
-// rk6 = 2^(32*2) mod Q << 32
-// rk7 = floor(2^64/Q)
-// rk8 = Q
-
-rk1: .octa 0x06df0000000000002d56000000000000
-rk3: .octa 0x7cf50000000000009d9d000000000000
-rk5: .octa 0x13680000000000002d56000000000000
-rk7: .octa 0x000000018bb7000000000001f65a57f8
-rk9: .octa 0xbfd6000000000000ceae000000000000
-rk11: .octa 0x713c0000000000001e16000000000000
-rk13: .octa 0x80a6000000000000f7f9000000000000
-rk15: .octa 0xe658000000000000044c000000000000
-rk17: .octa 0xa497000000000000ad18000000000000
-rk19: .octa 0xe7b50000000000006ee3000000000000
-
-tbl_shf_table:
-// use these values for shift constants for the tbl/tbx instruction
-// different alignments result in values as shown:
-// DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
-// DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
-// DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
-// DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
-// DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
-// DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
-// DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
-// DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
-// DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
-// DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
-// DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
-// DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
-// DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
-// DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
-// DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+ .quad 0x0000000000006123 // x^(8*128) mod G(x)
+ .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
+// .Lfold_across_64_bytes_consts:
+ .quad 0x0000000000001069 // x^(4*128) mod G(x)
+ .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
+// .Lfold_across_32_bytes_consts:
+ .quad 0x000000000000857d // x^(2*128) mod G(x)
+ .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+ .quad 0x000000000000a010 // x^(1*128) mod G(x)
+ .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
+// .Lfinal_fold_consts:
+ .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
+ .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+ .quad 0x0000000000018bb7 // G(x)
+ .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
+
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
.byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
.byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
index b461d62023f2..dd325829ee44 100644
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -22,10 +22,8 @@
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
-asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 buf[], u64 len);
-asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 buf[], u64 len);
-
-static u16 (*crc_t10dif_pmull)(u16 init_crc, const u8 buf[], u64 len);
+asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
static int crct10dif_init(struct shash_desc *desc)
{
@@ -35,30 +33,33 @@ static int crct10dif_init(struct shash_desc *desc)
return 0;
}
-static int crct10dif_update(struct shash_desc *desc, const u8 *data,
+static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u16 *crc = shash_desc_ctx(desc);
- unsigned int l;
- if (unlikely((u64)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
- l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
- ((u64)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
+ if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
+ kernel_neon_begin();
+ *crc = crc_t10dif_pmull_p8(*crc, data, length);
+ kernel_neon_end();
+ } else {
+ *crc = crc_t10dif_generic(*crc, data, length);
+ }
- *crc = crc_t10dif_generic(*crc, data, l);
+ return 0;
+}
- length -= l;
- data += l;
- }
+static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
+{
+ u16 *crc = shash_desc_ctx(desc);
- if (length > 0) {
- if (may_use_simd()) {
- kernel_neon_begin();
- *crc = crc_t10dif_pmull(*crc, data, length);
- kernel_neon_end();
- } else {
- *crc = crc_t10dif_generic(*crc, data, length);
- }
+ if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
+ kernel_neon_begin();
+ *crc = crc_t10dif_pmull_p64(*crc, data, length);
+ kernel_neon_end();
+ } else {
+ *crc = crc_t10dif_generic(*crc, data, length);
}
return 0;
@@ -72,10 +73,22 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out)
return 0;
}
-static struct shash_alg crc_t10dif_alg = {
+static struct shash_alg crc_t10dif_alg[] = {{
.digestsize = CRC_T10DIF_DIGEST_SIZE,
.init = crct10dif_init,
- .update = crct10dif_update,
+ .update = crct10dif_update_pmull_p8,
+ .final = crct10dif_final,
+ .descsize = CRC_T10DIF_DIGEST_SIZE,
+
+ .base.cra_name = "crct10dif",
+ .base.cra_driver_name = "crct10dif-arm64-neon",
+ .base.cra_priority = 100,
+ .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+}, {
+ .digestsize = CRC_T10DIF_DIGEST_SIZE,
+ .init = crct10dif_init,
+ .update = crct10dif_update_pmull_p64,
.final = crct10dif_final,
.descsize = CRC_T10DIF_DIGEST_SIZE,
@@ -84,21 +97,25 @@ static struct shash_alg crc_t10dif_alg = {
.base.cra_priority = 200,
.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
-};
+}};
static int __init crc_t10dif_mod_init(void)
{
if (elf_hwcap & HWCAP_PMULL)
- crc_t10dif_pmull = crc_t10dif_pmull_p64;
+ return crypto_register_shashes(crc_t10dif_alg,
+ ARRAY_SIZE(crc_t10dif_alg));
else
- crc_t10dif_pmull = crc_t10dif_pmull_p8;
-
- return crypto_register_shash(&crc_t10dif_alg);
+ /* only register the first array element */
+ return crypto_register_shash(crc_t10dif_alg);
}
static void __exit crc_t10dif_mod_exit(void)
{
- crypto_unregister_shash(&crc_t10dif_alg);
+ if (elf_hwcap & HWCAP_PMULL)
+ crypto_unregister_shashes(crc_t10dif_alg,
+ ARRAY_SIZE(crc_t10dif_alg));
+ else
+ crypto_unregister_shash(crc_t10dif_alg);
}
module_cpu_feature_match(ASIMD, crc_t10dif_mod_init);
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 067d8937d5af..791ad422c427 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -60,10 +60,6 @@ asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
struct ghash_key const *k,
const char *head);
-static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k,
- const char *head);
-
asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[],
const u8 src[], struct ghash_key const *k,
u8 ctr[], u32 const rk[], int rounds,
@@ -87,11 +83,15 @@ static int ghash_init(struct shash_desc *desc)
}
static void ghash_do_update(int blocks, u64 dg[], const char *src,
- struct ghash_key *key, const char *head)
+ struct ghash_key *key, const char *head,
+ void (*simd_update)(int blocks, u64 dg[],
+ const char *src,
+ struct ghash_key const *k,
+ const char *head))
{
if (likely(may_use_simd())) {
kernel_neon_begin();
- pmull_ghash_update(blocks, dg, src, key, head);
+ simd_update(blocks, dg, src, key, head);
kernel_neon_end();
} else {
be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };
@@ -119,8 +119,12 @@ static void ghash_do_update(int blocks, u64 dg[], const char *src,
/* avoid hogging the CPU for too long */
#define MAX_BLOCKS (SZ_64K / GHASH_BLOCK_SIZE)
-static int ghash_update(struct shash_desc *desc, const u8 *src,
- unsigned int len)
+static int __ghash_update(struct shash_desc *desc, const u8 *src,
+ unsigned int len,
+ void (*simd_update)(int blocks, u64 dg[],
+ const char *src,
+ struct ghash_key const *k,
+ const char *head))
{
struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
@@ -146,7 +150,8 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
int chunk = min(blocks, MAX_BLOCKS);
ghash_do_update(chunk, ctx->digest, src, key,
- partial ? ctx->buf : NULL);
+ partial ? ctx->buf : NULL,
+ simd_update);
blocks -= chunk;
src += chunk * GHASH_BLOCK_SIZE;
@@ -158,7 +163,19 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
return 0;
}
-static int ghash_final(struct shash_desc *desc, u8 *dst)
+static int ghash_update_p8(struct shash_desc *desc, const u8 *src,
+ unsigned int len)
+{
+ return __ghash_update(desc, src, len, pmull_ghash_update_p8);
+}
+
+static int ghash_update_p64(struct shash_desc *desc, const u8 *src,
+ unsigned int len)
+{
+ return __ghash_update(desc, src, len, pmull_ghash_update_p64);
+}
+
+static int ghash_final_p8(struct shash_desc *desc, u8 *dst)
{
struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
@@ -168,7 +185,28 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
- ghash_do_update(1, ctx->digest, ctx->buf, key, NULL);
+ ghash_do_update(1, ctx->digest, ctx->buf, key, NULL,
+ pmull_ghash_update_p8);
+ }
+ put_unaligned_be64(ctx->digest[1], dst);
+ put_unaligned_be64(ctx->digest[0], dst + 8);
+
+ *ctx = (struct ghash_desc_ctx){};
+ return 0;
+}
+
+static int ghash_final_p64(struct shash_desc *desc, u8 *dst)
+{
+ struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
+ unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
+
+ if (partial) {
+ struct ghash_key *key = crypto_shash_ctx(desc->tfm);
+
+ memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
+
+ ghash_do_update(1, ctx->digest, ctx->buf, key, NULL,
+ pmull_ghash_update_p64);
}
put_unaligned_be64(ctx->digest[1], dst);
put_unaligned_be64(ctx->digest[0], dst + 8);
@@ -224,7 +262,21 @@ static int ghash_setkey(struct crypto_shash *tfm,
return __ghash_setkey(key, inkey, keylen);
}
-static struct shash_alg ghash_alg = {
+static struct shash_alg ghash_alg[] = {{
+ .base.cra_name = "ghash",
+ .base.cra_driver_name = "ghash-neon",
+ .base.cra_priority = 100,
+ .base.cra_blocksize = GHASH_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct ghash_key),
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = GHASH_DIGEST_SIZE,
+ .init = ghash_init,
+ .update = ghash_update_p8,
+ .final = ghash_final_p8,
+ .setkey = ghash_setkey,
+ .descsize = sizeof(struct ghash_desc_ctx),
+}, {
.base.cra_name = "ghash",
.base.cra_driver_name = "ghash-ce",
.base.cra_priority = 200,
@@ -234,11 +286,11 @@ static struct shash_alg ghash_alg = {
.digestsize = GHASH_DIGEST_SIZE,
.init = ghash_init,
- .update = ghash_update,
- .final = ghash_final,
+ .update = ghash_update_p64,
+ .final = ghash_final_p64,
.setkey = ghash_setkey,
.descsize = sizeof(struct ghash_desc_ctx),
-};
+}};
static int num_rounds(struct crypto_aes_ctx *ctx)
{
@@ -301,7 +353,8 @@ static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[],
int blocks = count / GHASH_BLOCK_SIZE;
ghash_do_update(blocks, dg, src, &ctx->ghash_key,
- *buf_count ? buf : NULL);
+ *buf_count ? buf : NULL,
+ pmull_ghash_update_p64);
src += blocks * GHASH_BLOCK_SIZE;
count %= GHASH_BLOCK_SIZE;
@@ -345,7 +398,8 @@ static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[])
if (buf_count) {
memset(&buf[buf_count], 0, GHASH_BLOCK_SIZE - buf_count);
- ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
+ ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL,
+ pmull_ghash_update_p64);
}
}
@@ -358,7 +412,8 @@ static void gcm_final(struct aead_request *req, struct gcm_aes_ctx *ctx,
lengths.a = cpu_to_be64(req->assoclen * 8);
lengths.b = cpu_to_be64(cryptlen * 8);
- ghash_do_update(1, dg, (void *)&lengths, &ctx->ghash_key, NULL);
+ ghash_do_update(1, dg, (void *)&lengths, &ctx->ghash_key, NULL,
+ pmull_ghash_update_p64);
put_unaligned_be64(dg[1], mac);
put_unaligned_be64(dg[0], mac + 8);
@@ -434,7 +489,7 @@ static int gcm_encrypt(struct aead_request *req)
ghash_do_update(walk.nbytes / AES_BLOCK_SIZE, dg,
walk.dst.virt.addr, &ctx->ghash_key,
- NULL);
+ NULL, pmull_ghash_update_p64);
err = skcipher_walk_done(&walk,
walk.nbytes % (2 * AES_BLOCK_SIZE));
@@ -469,7 +524,8 @@ static int gcm_encrypt(struct aead_request *req)
memcpy(buf, dst, nbytes);
memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes);
- ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head);
+ ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head,
+ pmull_ghash_update_p64);
err = skcipher_walk_done(&walk, 0);
}
@@ -558,7 +614,8 @@ static int gcm_decrypt(struct aead_request *req)
u8 *src = walk.src.virt.addr;
ghash_do_update(blocks, dg, walk.src.virt.addr,
- &ctx->ghash_key, NULL);
+ &ctx->ghash_key, NULL,
+ pmull_ghash_update_p64);
do {
__aes_arm64_encrypt(ctx->aes_key.key_enc,
@@ -602,7 +659,8 @@ static int gcm_decrypt(struct aead_request *req)
memcpy(buf, src, nbytes);
memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes);
- ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head);
+ ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head,
+ pmull_ghash_update_p64);
crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, iv,
walk.nbytes);
@@ -650,26 +708,30 @@ static int __init ghash_ce_mod_init(void)
return -ENODEV;
if (elf_hwcap & HWCAP_PMULL)
- pmull_ghash_update = pmull_ghash_update_p64;
-
+ ret = crypto_register_shashes(ghash_alg,
+ ARRAY_SIZE(ghash_alg));
else
- pmull_ghash_update = pmull_ghash_update_p8;
+ /* only register the first array element */
+ ret = crypto_register_shash(ghash_alg);
- ret = crypto_register_shash(&ghash_alg);
if (ret)
return ret;
if (elf_hwcap & HWCAP_PMULL) {
ret = crypto_register_aead(&gcm_aes_alg);
if (ret)
- crypto_unregister_shash(&ghash_alg);
+ crypto_unregister_shashes(ghash_alg,
+ ARRAY_SIZE(ghash_alg));
}
return ret;
}
static void __exit ghash_ce_mod_exit(void)
{
- crypto_unregister_shash(&ghash_alg);
+ if (elf_hwcap & HWCAP_PMULL)
+ crypto_unregister_shashes(ghash_alg, ARRAY_SIZE(ghash_alg));
+ else
+ crypto_unregister_shash(ghash_alg);
crypto_unregister_aead(&gcm_aes_alg);
}