aboutsummaryrefslogtreecommitdiffstats
path: root/arch/arm64
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-05-27 18:06:49 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-05-27 18:06:49 -0700
commitd075c0c1be279c5f4c6688ac0442fff6494e56bc (patch)
treec3e3ab6b35139229ad0a5096ccea0c00eb97998b /arch/arm64
parentMerge tag '5.19-rc-smb3-client-fixes-updated' of git://git.samba.org/sfrench/cifs-2.6 (diff)
parenthwrng: omap3-rom - fix using wrong clk_disable() in omap_rom_rng_runtime_resume() (diff)
downloadlinux-dev-d075c0c1be279c5f4c6688ac0442fff6494e56bc.tar.xz
linux-dev-d075c0c1be279c5f4c6688ac0442fff6494e56bc.zip
Merge tag 'v5.19-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu: "API: - Test in-place en/decryption with two sglists in testmgr - Fix process vs softirq race in cryptd Algorithms: - Add arm64 acceleration for sm4 - Add s390 acceleration for chacha20 Drivers: - Add polarfire soc hwrng support in mpsf - Add support for TI SoC AM62x in sa2ul - Add support for ATSHA204 cryptochip in atmel-sha204a - Add support for PRNG in caam - Restore support for storage encryption in qat - Restore support for storage encryption in hisilicon/sec" * tag 'v5.19-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (116 commits) hwrng: omap3-rom - fix using wrong clk_disable() in omap_rom_rng_runtime_resume() crypto: hisilicon/sec - delete the flag CRYPTO_ALG_ALLOCATES_MEMORY crypto: qat - add support for 401xx devices crypto: qat - re-enable registration of algorithms crypto: qat - honor CRYPTO_TFM_REQ_MAY_SLEEP flag crypto: qat - add param check for DH crypto: qat - add param check for RSA crypto: qat - remove dma_free_coherent() for DH crypto: qat - remove dma_free_coherent() for RSA crypto: qat - fix memory leak in RSA crypto: qat - add backlog mechanism crypto: qat - refactor submission logic crypto: qat - use pre-allocated buffers in datapath crypto: qat - set to zero DH parameters before free crypto: s390 - add crypto library interface for ChaCha20 crypto: talitos - Uniform coding style with defined variable crypto: octeontx2 - simplify the return expression of otx2_cpt_aead_cbc_aes_sha_setkey() crypto: cryptd - Protect per-CPU resource by disabling BH. crypto: sun8i-ce - do not fallback if cryptlen is less than sg length crypto: sun8i-ce - rework debugging ...
Diffstat (limited to 'arch/arm64')
-rw-r--r--arch/arm64/crypto/Kconfig16
-rw-r--r--arch/arm64/crypto/Makefile8
-rw-r--r--arch/arm64/crypto/sm4-ce-cipher-core.S36
-rw-r--r--arch/arm64/crypto/sm4-ce-cipher-glue.c82
-rw-r--r--arch/arm64/crypto/sm4-ce-core.S688
-rw-r--r--arch/arm64/crypto/sm4-ce-glue.c386
-rw-r--r--arch/arm64/crypto/sm4-neon-core.S487
-rw-r--r--arch/arm64/crypto/sm4-neon-glue.c442
8 files changed, 2062 insertions, 83 deletions
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 2a965aa0188d..ac85682c013c 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -45,13 +45,25 @@ config CRYPTO_SM3_ARM64_CE
tristate "SM3 digest algorithm (ARMv8.2 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
- select CRYPTO_LIB_SM3
+ select CRYPTO_SM3
config CRYPTO_SM4_ARM64_CE
tristate "SM4 symmetric cipher (ARMv8.2 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_ALGAPI
- select CRYPTO_LIB_SM4
+ select CRYPTO_SM4
+
+config CRYPTO_SM4_ARM64_CE_BLK
+ tristate "SM4 in ECB/CBC/CFB/CTR modes using ARMv8 Crypto Extensions"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SM4
+
+config CRYPTO_SM4_ARM64_NEON_BLK
+ tristate "SM4 in ECB/CBC/CFB/CTR modes using NEON instructions"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SM4
config CRYPTO_GHASH_ARM64_CE
tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 09a805cc32d7..bea8995133b1 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -20,9 +20,15 @@ sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o
obj-$(CONFIG_CRYPTO_SM3_ARM64_CE) += sm3-ce.o
sm3-ce-y := sm3-ce-glue.o sm3-ce-core.o
-obj-$(CONFIG_CRYPTO_SM4_ARM64_CE) += sm4-ce.o
+obj-$(CONFIG_CRYPTO_SM4_ARM64_CE) += sm4-ce-cipher.o
+sm4-ce-cipher-y := sm4-ce-cipher-glue.o sm4-ce-cipher-core.o
+
+obj-$(CONFIG_CRYPTO_SM4_ARM64_CE_BLK) += sm4-ce.o
sm4-ce-y := sm4-ce-glue.o sm4-ce-core.o
+obj-$(CONFIG_CRYPTO_SM4_ARM64_NEON_BLK) += sm4-neon.o
+sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o
+
obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
diff --git a/arch/arm64/crypto/sm4-ce-cipher-core.S b/arch/arm64/crypto/sm4-ce-cipher-core.S
new file mode 100644
index 000000000000..4ac6cfbc5797
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce-cipher-core.S
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8
+ .set .Lv\b\().4s, \b
+ .endr
+
+ .macro sm4e, rd, rn
+ .inst 0xcec08400 | .L\rd | (.L\rn << 5)
+ .endm
+
+ /*
+ * void sm4_ce_do_crypt(const u32 *rk, u32 *out, const u32 *in);
+ */
+ .text
+SYM_FUNC_START(sm4_ce_do_crypt)
+ ld1 {v8.4s}, [x2]
+ ld1 {v0.4s-v3.4s}, [x0], #64
+CPU_LE( rev32 v8.16b, v8.16b )
+ ld1 {v4.4s-v7.4s}, [x0]
+ sm4e v8.4s, v0.4s
+ sm4e v8.4s, v1.4s
+ sm4e v8.4s, v2.4s
+ sm4e v8.4s, v3.4s
+ sm4e v8.4s, v4.4s
+ sm4e v8.4s, v5.4s
+ sm4e v8.4s, v6.4s
+ sm4e v8.4s, v7.4s
+ rev64 v8.4s, v8.4s
+ ext v8.16b, v8.16b, v8.16b, #8
+CPU_LE( rev32 v8.16b, v8.16b )
+ st1 {v8.4s}, [x1]
+ ret
+SYM_FUNC_END(sm4_ce_do_crypt)
diff --git a/arch/arm64/crypto/sm4-ce-cipher-glue.c b/arch/arm64/crypto/sm4-ce-cipher-glue.c
new file mode 100644
index 000000000000..76a34ef4abbb
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce-cipher-glue.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/sm4.h>
+#include <crypto/internal/simd.h>
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/types.h>
+
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("sm4-ce");
+MODULE_DESCRIPTION("SM4 symmetric cipher using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+asmlinkage void sm4_ce_do_crypt(const u32 *rk, void *out, const void *in);
+
+static int sm4_ce_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ return sm4_expandkey(ctx, key, key_len);
+}
+
+static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ if (!crypto_simd_usable()) {
+ sm4_crypt_block(ctx->rkey_enc, out, in);
+ } else {
+ kernel_neon_begin();
+ sm4_ce_do_crypt(ctx->rkey_enc, out, in);
+ kernel_neon_end();
+ }
+}
+
+static void sm4_ce_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ if (!crypto_simd_usable()) {
+ sm4_crypt_block(ctx->rkey_dec, out, in);
+ } else {
+ kernel_neon_begin();
+ sm4_ce_do_crypt(ctx->rkey_dec, out, in);
+ kernel_neon_end();
+ }
+}
+
+static struct crypto_alg sm4_ce_alg = {
+ .cra_name = "sm4",
+ .cra_driver_name = "sm4-ce",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_u.cipher = {
+ .cia_min_keysize = SM4_KEY_SIZE,
+ .cia_max_keysize = SM4_KEY_SIZE,
+ .cia_setkey = sm4_ce_setkey,
+ .cia_encrypt = sm4_ce_encrypt,
+ .cia_decrypt = sm4_ce_decrypt
+ }
+};
+
+static int __init sm4_ce_mod_init(void)
+{
+ return crypto_register_alg(&sm4_ce_alg);
+}
+
+static void __exit sm4_ce_mod_fini(void)
+{
+ crypto_unregister_alg(&sm4_ce_alg);
+}
+
+module_cpu_feature_match(SM4, sm4_ce_mod_init);
+module_exit(sm4_ce_mod_fini);
diff --git a/arch/arm64/crypto/sm4-ce-core.S b/arch/arm64/crypto/sm4-ce-core.S
index 4ac6cfbc5797..934e0f093279 100644
--- a/arch/arm64/crypto/sm4-ce-core.S
+++ b/arch/arm64/crypto/sm4-ce-core.S
@@ -1,36 +1,660 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2022, Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
#include <linux/linkage.h>
#include <asm/assembler.h>
- .irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8
- .set .Lv\b\().4s, \b
- .endr
-
- .macro sm4e, rd, rn
- .inst 0xcec08400 | .L\rd | (.L\rn << 5)
- .endm
-
- /*
- * void sm4_ce_do_crypt(const u32 *rk, u32 *out, const u32 *in);
- */
- .text
-SYM_FUNC_START(sm4_ce_do_crypt)
- ld1 {v8.4s}, [x2]
- ld1 {v0.4s-v3.4s}, [x0], #64
-CPU_LE( rev32 v8.16b, v8.16b )
- ld1 {v4.4s-v7.4s}, [x0]
- sm4e v8.4s, v0.4s
- sm4e v8.4s, v1.4s
- sm4e v8.4s, v2.4s
- sm4e v8.4s, v3.4s
- sm4e v8.4s, v4.4s
- sm4e v8.4s, v5.4s
- sm4e v8.4s, v6.4s
- sm4e v8.4s, v7.4s
- rev64 v8.4s, v8.4s
- ext v8.16b, v8.16b, v8.16b, #8
-CPU_LE( rev32 v8.16b, v8.16b )
- st1 {v8.4s}, [x1]
- ret
-SYM_FUNC_END(sm4_ce_do_crypt)
+.arch armv8-a+crypto
+
+.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 20, 24, 25, 26, 27, 28, 29, 30, 31
+ .set .Lv\b\().4s, \b
+.endr
+
+.macro sm4e, vd, vn
+ .inst 0xcec08400 | (.L\vn << 5) | .L\vd
+.endm
+
+.macro sm4ekey, vd, vn, vm
+ .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
+.endm
+
+/* Register macros */
+
+#define RTMP0 v16
+#define RTMP1 v17
+#define RTMP2 v18
+#define RTMP3 v19
+
+#define RIV v20
+
+/* Helper macros. */
+
+#define PREPARE \
+ ld1 {v24.16b-v27.16b}, [x0], #64; \
+ ld1 {v28.16b-v31.16b}, [x0];
+
+#define SM4_CRYPT_BLK(b0) \
+ rev32 b0.16b, b0.16b; \
+ sm4e b0.4s, v24.4s; \
+ sm4e b0.4s, v25.4s; \
+ sm4e b0.4s, v26.4s; \
+ sm4e b0.4s, v27.4s; \
+ sm4e b0.4s, v28.4s; \
+ sm4e b0.4s, v29.4s; \
+ sm4e b0.4s, v30.4s; \
+ sm4e b0.4s, v31.4s; \
+ rev64 b0.4s, b0.4s; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ rev32 b0.16b, b0.16b;
+
+#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ sm4e b0.4s, v24.4s; \
+ sm4e b1.4s, v24.4s; \
+ sm4e b2.4s, v24.4s; \
+ sm4e b3.4s, v24.4s; \
+ sm4e b0.4s, v25.4s; \
+ sm4e b1.4s, v25.4s; \
+ sm4e b2.4s, v25.4s; \
+ sm4e b3.4s, v25.4s; \
+ sm4e b0.4s, v26.4s; \
+ sm4e b1.4s, v26.4s; \
+ sm4e b2.4s, v26.4s; \
+ sm4e b3.4s, v26.4s; \
+ sm4e b0.4s, v27.4s; \
+ sm4e b1.4s, v27.4s; \
+ sm4e b2.4s, v27.4s; \
+ sm4e b3.4s, v27.4s; \
+ sm4e b0.4s, v28.4s; \
+ sm4e b1.4s, v28.4s; \
+ sm4e b2.4s, v28.4s; \
+ sm4e b3.4s, v28.4s; \
+ sm4e b0.4s, v29.4s; \
+ sm4e b1.4s, v29.4s; \
+ sm4e b2.4s, v29.4s; \
+ sm4e b3.4s, v29.4s; \
+ sm4e b0.4s, v30.4s; \
+ sm4e b1.4s, v30.4s; \
+ sm4e b2.4s, v30.4s; \
+ sm4e b3.4s, v30.4s; \
+ sm4e b0.4s, v31.4s; \
+ sm4e b1.4s, v31.4s; \
+ sm4e b2.4s, v31.4s; \
+ sm4e b3.4s, v31.4s; \
+ rev64 b0.4s, b0.4s; \
+ rev64 b1.4s, b1.4s; \
+ rev64 b2.4s, b2.4s; \
+ rev64 b3.4s, b3.4s; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ ext b1.16b, b1.16b, b1.16b, #8; \
+ ext b2.16b, b2.16b, b2.16b, #8; \
+ ext b3.16b, b3.16b, b3.16b, #8; \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b;
+
+#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ rev32 b4.16b, b4.16b; \
+ rev32 b5.16b, b5.16b; \
+ rev32 b6.16b, b6.16b; \
+ rev32 b7.16b, b7.16b; \
+ sm4e b0.4s, v24.4s; \
+ sm4e b1.4s, v24.4s; \
+ sm4e b2.4s, v24.4s; \
+ sm4e b3.4s, v24.4s; \
+ sm4e b4.4s, v24.4s; \
+ sm4e b5.4s, v24.4s; \
+ sm4e b6.4s, v24.4s; \
+ sm4e b7.4s, v24.4s; \
+ sm4e b0.4s, v25.4s; \
+ sm4e b1.4s, v25.4s; \
+ sm4e b2.4s, v25.4s; \
+ sm4e b3.4s, v25.4s; \
+ sm4e b4.4s, v25.4s; \
+ sm4e b5.4s, v25.4s; \
+ sm4e b6.4s, v25.4s; \
+ sm4e b7.4s, v25.4s; \
+ sm4e b0.4s, v26.4s; \
+ sm4e b1.4s, v26.4s; \
+ sm4e b2.4s, v26.4s; \
+ sm4e b3.4s, v26.4s; \
+ sm4e b4.4s, v26.4s; \
+ sm4e b5.4s, v26.4s; \
+ sm4e b6.4s, v26.4s; \
+ sm4e b7.4s, v26.4s; \
+ sm4e b0.4s, v27.4s; \
+ sm4e b1.4s, v27.4s; \
+ sm4e b2.4s, v27.4s; \
+ sm4e b3.4s, v27.4s; \
+ sm4e b4.4s, v27.4s; \
+ sm4e b5.4s, v27.4s; \
+ sm4e b6.4s, v27.4s; \
+ sm4e b7.4s, v27.4s; \
+ sm4e b0.4s, v28.4s; \
+ sm4e b1.4s, v28.4s; \
+ sm4e b2.4s, v28.4s; \
+ sm4e b3.4s, v28.4s; \
+ sm4e b4.4s, v28.4s; \
+ sm4e b5.4s, v28.4s; \
+ sm4e b6.4s, v28.4s; \
+ sm4e b7.4s, v28.4s; \
+ sm4e b0.4s, v29.4s; \
+ sm4e b1.4s, v29.4s; \
+ sm4e b2.4s, v29.4s; \
+ sm4e b3.4s, v29.4s; \
+ sm4e b4.4s, v29.4s; \
+ sm4e b5.4s, v29.4s; \
+ sm4e b6.4s, v29.4s; \
+ sm4e b7.4s, v29.4s; \
+ sm4e b0.4s, v30.4s; \
+ sm4e b1.4s, v30.4s; \
+ sm4e b2.4s, v30.4s; \
+ sm4e b3.4s, v30.4s; \
+ sm4e b4.4s, v30.4s; \
+ sm4e b5.4s, v30.4s; \
+ sm4e b6.4s, v30.4s; \
+ sm4e b7.4s, v30.4s; \
+ sm4e b0.4s, v31.4s; \
+ sm4e b1.4s, v31.4s; \
+ sm4e b2.4s, v31.4s; \
+ sm4e b3.4s, v31.4s; \
+ sm4e b4.4s, v31.4s; \
+ sm4e b5.4s, v31.4s; \
+ sm4e b6.4s, v31.4s; \
+ sm4e b7.4s, v31.4s; \
+ rev64 b0.4s, b0.4s; \
+ rev64 b1.4s, b1.4s; \
+ rev64 b2.4s, b2.4s; \
+ rev64 b3.4s, b3.4s; \
+ rev64 b4.4s, b4.4s; \
+ rev64 b5.4s, b5.4s; \
+ rev64 b6.4s, b6.4s; \
+ rev64 b7.4s, b7.4s; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ ext b1.16b, b1.16b, b1.16b, #8; \
+ ext b2.16b, b2.16b, b2.16b, #8; \
+ ext b3.16b, b3.16b, b3.16b, #8; \
+ ext b4.16b, b4.16b, b4.16b, #8; \
+ ext b5.16b, b5.16b, b5.16b, #8; \
+ ext b6.16b, b6.16b, b6.16b, #8; \
+ ext b7.16b, b7.16b, b7.16b, #8; \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ rev32 b4.16b, b4.16b; \
+ rev32 b5.16b, b5.16b; \
+ rev32 b6.16b, b6.16b; \
+ rev32 b7.16b, b7.16b;
+
+
+.align 3
+SYM_FUNC_START(sm4_ce_expand_key)
+ /* input:
+ * x0: 128-bit key
+ * x1: rkey_enc
+ * x2: rkey_dec
+ * x3: fk array
+ * x4: ck array
+ */
+ ld1 {v0.16b}, [x0];
+ rev32 v0.16b, v0.16b;
+ ld1 {v1.16b}, [x3];
+ /* load ck */
+ ld1 {v24.16b-v27.16b}, [x4], #64;
+ ld1 {v28.16b-v31.16b}, [x4];
+
+ /* input ^ fk */
+ eor v0.16b, v0.16b, v1.16b;
+
+ sm4ekey v0.4s, v0.4s, v24.4s;
+ sm4ekey v1.4s, v0.4s, v25.4s;
+ sm4ekey v2.4s, v1.4s, v26.4s;
+ sm4ekey v3.4s, v2.4s, v27.4s;
+ sm4ekey v4.4s, v3.4s, v28.4s;
+ sm4ekey v5.4s, v4.4s, v29.4s;
+ sm4ekey v6.4s, v5.4s, v30.4s;
+ sm4ekey v7.4s, v6.4s, v31.4s;
+
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ st1 {v4.16b-v7.16b}, [x1];
+ rev64 v7.4s, v7.4s;
+ rev64 v6.4s, v6.4s;
+ rev64 v5.4s, v5.4s;
+ rev64 v4.4s, v4.4s;
+ rev64 v3.4s, v3.4s;
+ rev64 v2.4s, v2.4s;
+ rev64 v1.4s, v1.4s;
+ rev64 v0.4s, v0.4s;
+ ext v7.16b, v7.16b, v7.16b, #8;
+ ext v6.16b, v6.16b, v6.16b, #8;
+ ext v5.16b, v5.16b, v5.16b, #8;
+ ext v4.16b, v4.16b, v4.16b, #8;
+ ext v3.16b, v3.16b, v3.16b, #8;
+ ext v2.16b, v2.16b, v2.16b, #8;
+ ext v1.16b, v1.16b, v1.16b, #8;
+ ext v0.16b, v0.16b, v0.16b, #8;
+ st1 {v7.16b}, [x2], #16;
+ st1 {v6.16b}, [x2], #16;
+ st1 {v5.16b}, [x2], #16;
+ st1 {v4.16b}, [x2], #16;
+ st1 {v3.16b}, [x2], #16;
+ st1 {v2.16b}, [x2], #16;
+ st1 {v1.16b}, [x2], #16;
+ st1 {v0.16b}, [x2];
+
+ ret;
+SYM_FUNC_END(sm4_ce_expand_key)
+
+.align 3
+SYM_FUNC_START(sm4_ce_crypt_block)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ */
+ PREPARE;
+
+ ld1 {v0.16b}, [x2];
+ SM4_CRYPT_BLK(v0);
+ st1 {v0.16b}, [x1];
+
+ ret;
+SYM_FUNC_END(sm4_ce_crypt_block)
+
+.align 3
+SYM_FUNC_START(sm4_ce_crypt)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * w3: nblocks
+ */
+ PREPARE;
+
+.Lcrypt_loop_blk:
+ sub w3, w3, #8;
+ tbnz w3, #31, .Lcrypt_tail8;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ ld1 {v4.16b-v7.16b}, [x2], #64;
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ cbz w3, .Lcrypt_end;
+ b .Lcrypt_loop_blk;
+
+.Lcrypt_tail8:
+ add w3, w3, #8;
+ cmp w3, #4;
+ blt .Lcrypt_tail4;
+
+ sub w3, w3, #4;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ SM4_CRYPT_BLK4(v0, v1, v2, v3);
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ cbz w3, .Lcrypt_end;
+
+.Lcrypt_tail4:
+ sub w3, w3, #1;
+
+ ld1 {v0.16b}, [x2], #16;
+ SM4_CRYPT_BLK(v0);
+ st1 {v0.16b}, [x1], #16;
+
+ cbnz w3, .Lcrypt_tail4;
+
+.Lcrypt_end:
+ ret;
+SYM_FUNC_END(sm4_ce_crypt)
+
+.align 3
+SYM_FUNC_START(sm4_ce_cbc_enc)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks
+ */
+ PREPARE;
+
+ ld1 {RIV.16b}, [x3];
+
+.Lcbc_enc_loop:
+ sub w4, w4, #1;
+
+ ld1 {RTMP0.16b}, [x2], #16;
+ eor RIV.16b, RIV.16b, RTMP0.16b;
+
+ SM4_CRYPT_BLK(RIV);
+
+ st1 {RIV.16b}, [x1], #16;
+
+ cbnz w4, .Lcbc_enc_loop;
+
+ /* store new IV */
+ st1 {RIV.16b}, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_ce_cbc_enc)
+
+.align 3
+SYM_FUNC_START(sm4_ce_cbc_dec)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks
+ */
+ PREPARE;
+
+ ld1 {RIV.16b}, [x3];
+
+.Lcbc_loop_blk:
+ sub w4, w4, #8;
+ tbnz w4, #31, .Lcbc_tail8;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ ld1 {v4.16b-v7.16b}, [x2];
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ sub x2, x2, #64;
+ eor v0.16b, v0.16b, RIV.16b;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v1.16b, v1.16b, RTMP0.16b;
+ eor v2.16b, v2.16b, RTMP1.16b;
+ eor v3.16b, v3.16b, RTMP2.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ eor v4.16b, v4.16b, RTMP3.16b;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v5.16b, v5.16b, RTMP0.16b;
+ eor v6.16b, v6.16b, RTMP1.16b;
+ eor v7.16b, v7.16b, RTMP2.16b;
+
+ mov RIV.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ cbz w4, .Lcbc_end;
+ b .Lcbc_loop_blk;
+
+.Lcbc_tail8:
+ add w4, w4, #8;
+ cmp w4, #4;
+ blt .Lcbc_tail4;
+
+ sub w4, w4, #4;
+
+ ld1 {v0.16b-v3.16b}, [x2];
+
+ SM4_CRYPT_BLK4(v0, v1, v2, v3);
+
+ eor v0.16b, v0.16b, RIV.16b;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v1.16b, v1.16b, RTMP0.16b;
+ eor v2.16b, v2.16b, RTMP1.16b;
+ eor v3.16b, v3.16b, RTMP2.16b;
+
+ mov RIV.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ cbz w4, .Lcbc_end;
+
+.Lcbc_tail4:
+ sub w4, w4, #1;
+
+ ld1 {v0.16b}, [x2];
+
+ SM4_CRYPT_BLK(v0);
+
+ eor v0.16b, v0.16b, RIV.16b;
+ ld1 {RIV.16b}, [x2], #16;
+ st1 {v0.16b}, [x1], #16;
+
+ cbnz w4, .Lcbc_tail4;
+
+.Lcbc_end:
+ /* store new IV */
+ st1 {RIV.16b}, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_ce_cbc_dec)
+
+.align 3
+SYM_FUNC_START(sm4_ce_cfb_enc)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks
+ */
+ PREPARE;
+
+ ld1 {RIV.16b}, [x3];
+
+.Lcfb_enc_loop:
+ sub w4, w4, #1;
+
+ SM4_CRYPT_BLK(RIV);
+
+ ld1 {RTMP0.16b}, [x2], #16;
+ eor RIV.16b, RIV.16b, RTMP0.16b;
+ st1 {RIV.16b}, [x1], #16;
+
+ cbnz w4, .Lcfb_enc_loop;
+
+ /* store new IV */
+ st1 {RIV.16b}, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_ce_cfb_enc)
+
+.align 3
+SYM_FUNC_START(sm4_ce_cfb_dec)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks
+ */
+ PREPARE;
+
+ ld1 {v0.16b}, [x3];
+
+.Lcfb_loop_blk:
+ sub w4, w4, #8;
+ tbnz w4, #31, .Lcfb_tail8;
+
+ ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48;
+ ld1 {v4.16b-v7.16b}, [x2];
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ sub x2, x2, #48;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v4.16b, v4.16b, RTMP0.16b;
+ eor v5.16b, v5.16b, RTMP1.16b;
+ eor v6.16b, v6.16b, RTMP2.16b;
+ eor v7.16b, v7.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ mov v0.16b, RTMP3.16b;
+
+ cbz w4, .Lcfb_end;
+ b .Lcfb_loop_blk;
+
+.Lcfb_tail8:
+ add w4, w4, #8;
+ cmp w4, #4;
+ blt .Lcfb_tail4;
+
+ sub w4, w4, #4;
+
+ ld1 {v1.16b, v2.16b, v3.16b}, [x2];
+
+ SM4_CRYPT_BLK4(v0, v1, v2, v3);
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ mov v0.16b, RTMP3.16b;
+
+ cbz w4, .Lcfb_end;
+
+.Lcfb_tail4:
+ sub w4, w4, #1;
+
+ SM4_CRYPT_BLK(v0);
+
+ ld1 {RTMP0.16b}, [x2], #16;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ st1 {v0.16b}, [x1], #16;
+
+ mov v0.16b, RTMP0.16b;
+
+ cbnz w4, .Lcfb_tail4;
+
+.Lcfb_end:
+ /* store new IV */
+ st1 {v0.16b}, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_ce_cfb_dec)
+
+.align 3
+SYM_FUNC_START(sm4_ce_ctr_enc)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: ctr (big endian, 128 bit)
+ * w4: nblocks
+ */
+ PREPARE;
+
+ ldp x7, x8, [x3];
+ rev x7, x7;
+ rev x8, x8;
+
+.Lctr_loop_blk:
+ sub w4, w4, #8;
+ tbnz w4, #31, .Lctr_tail8;
+
+#define inc_le128(vctr) \
+ mov vctr.d[1], x8; \
+ mov vctr.d[0], x7; \
+ adds x8, x8, #1; \
+ adc x7, x7, xzr; \
+ rev64 vctr.16b, vctr.16b;
+
+ /* construct CTRs */
+ inc_le128(v0); /* +0 */
+ inc_le128(v1); /* +1 */
+ inc_le128(v2); /* +2 */
+ inc_le128(v3); /* +3 */
+ inc_le128(v4); /* +4 */
+ inc_le128(v5); /* +5 */
+ inc_le128(v6); /* +6 */
+ inc_le128(v7); /* +7 */
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v4.16b, v4.16b, RTMP0.16b;
+ eor v5.16b, v5.16b, RTMP1.16b;
+ eor v6.16b, v6.16b, RTMP2.16b;
+ eor v7.16b, v7.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ cbz w4, .Lctr_end;
+ b .Lctr_loop_blk;
+
+.Lctr_tail8:
+ add w4, w4, #8;
+ cmp w4, #4;
+ blt .Lctr_tail4;
+
+ sub w4, w4, #4;
+
+ /* construct CTRs */
+ inc_le128(v0); /* +0 */
+ inc_le128(v1); /* +1 */
+ inc_le128(v2); /* +2 */
+ inc_le128(v3); /* +3 */
+
+ SM4_CRYPT_BLK4(v0, v1, v2, v3);
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ cbz w4, .Lctr_end;
+
+.Lctr_tail4:
+ sub w4, w4, #1;
+
+ /* construct CTRs */
+ inc_le128(v0);
+
+ SM4_CRYPT_BLK(v0);
+
+ ld1 {RTMP0.16b}, [x2], #16;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ st1 {v0.16b}, [x1], #16;
+
+ cbnz w4, .Lctr_tail4;
+
+.Lctr_end:
+ /* store new CTR */
+ rev x7, x7;
+ rev x8, x8;
+ stp x7, x8, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_ce_ctr_enc)
diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c
index 9c93cfc4841b..496d55c0d01a 100644
--- a/arch/arm64/crypto/sm4-ce-glue.c
+++ b/arch/arm64/crypto/sm4-ce-glue.c
@@ -1,82 +1,372 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, using ARMv8 Crypto Extensions
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2022, Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
#include <asm/neon.h>
#include <asm/simd.h>
-#include <crypto/sm4.h>
#include <crypto/internal/simd.h>
-#include <linux/module.h>
-#include <linux/cpufeature.h>
-#include <linux/crypto.h>
-#include <linux/types.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/sm4.h>
-MODULE_ALIAS_CRYPTO("sm4");
-MODULE_ALIAS_CRYPTO("sm4-ce");
-MODULE_DESCRIPTION("SM4 symmetric cipher using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
+#define BYTES2BLKS(nbytes) ((nbytes) >> 4)
+
+asmlinkage void sm4_ce_expand_key(const u8 *key, u32 *rkey_enc, u32 *rkey_dec,
+ const u32 *fk, const u32 *ck);
+asmlinkage void sm4_ce_crypt_block(const u32 *rkey, u8 *dst, const u8 *src);
+asmlinkage void sm4_ce_crypt(const u32 *rkey, u8 *dst, const u8 *src,
+ unsigned int nblks);
+asmlinkage void sm4_ce_cbc_enc(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+asmlinkage void sm4_ce_cbc_dec(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+asmlinkage void sm4_ce_cfb_enc(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+asmlinkage void sm4_ce_cfb_dec(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+asmlinkage void sm4_ce_ctr_enc(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+
+static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ if (key_len != SM4_KEY_SIZE)
+ return -EINVAL;
+
+ sm4_ce_expand_key(key, ctx->rkey_enc, ctx->rkey_dec,
+ crypto_sm4_fk, crypto_sm4_ck);
+ return 0;
+}
+
+static int sm4_ecb_do_crypt(struct skcipher_request *req, const u32 *rkey)
+{
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_crypt(rkey, dst, src, nblks);
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_ecb_do_crypt(req, ctx->rkey_enc);
+}
+
+static int sm4_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_ecb_do_crypt(req, ctx->rkey_dec);
+}
+
+static int sm4_cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_cbc_enc(ctx->rkey_enc, dst, src, walk.iv, nblks);
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
-asmlinkage void sm4_ce_do_crypt(const u32 *rk, void *out, const void *in);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
-static int sm4_ce_setkey(struct crypto_tfm *tfm, const u8 *key,
- unsigned int key_len)
+static int sm4_cbc_decrypt(struct skcipher_request *req)
{
- struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_cbc_dec(ctx->rkey_dec, dst, src, walk.iv, nblks);
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
- return sm4_expandkey(ctx, key, key_len);
+ return err;
}
-static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+static int sm4_cfb_encrypt(struct skcipher_request *req)
{
- const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
- if (!crypto_simd_usable()) {
- sm4_crypt_block(ctx->rkey_enc, out, in);
- } else {
kernel_neon_begin();
- sm4_ce_do_crypt(ctx->rkey_enc, out, in);
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_cfb_enc(ctx->rkey_enc, dst, src, walk.iv, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
}
+
+ return err;
}
-static void sm4_ce_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+static int sm4_cfb_decrypt(struct skcipher_request *req)
{
- const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
- if (!crypto_simd_usable()) {
- sm4_crypt_block(ctx->rkey_dec, out, in);
- } else {
kernel_neon_begin();
- sm4_ce_do_crypt(ctx->rkey_dec, out, in);
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_cfb_dec(ctx->rkey_enc, dst, src, walk.iv, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
}
+
+ return err;
}
-static struct crypto_alg sm4_ce_alg = {
- .cra_name = "sm4",
- .cra_driver_name = "sm4-ce",
- .cra_priority = 200,
- .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
- .cra_blocksize = SM4_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct sm4_ctx),
- .cra_module = THIS_MODULE,
- .cra_u.cipher = {
- .cia_min_keysize = SM4_KEY_SIZE,
- .cia_max_keysize = SM4_KEY_SIZE,
- .cia_setkey = sm4_ce_setkey,
- .cia_encrypt = sm4_ce_encrypt,
- .cia_decrypt = sm4_ce_decrypt
+static int sm4_ctr_crypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_ctr_enc(ctx->rkey_enc, dst, src, walk.iv, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_inc(walk.iv, SM4_BLOCK_SIZE);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static struct skcipher_alg sm4_algs[] = {
+ {
+ .base = {
+ .cra_name = "ecb(sm4)",
+ .cra_driver_name = "ecb-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_ecb_encrypt,
+ .decrypt = sm4_ecb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "cbc(sm4)",
+ .cra_driver_name = "cbc-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_cbc_encrypt,
+ .decrypt = sm4_cbc_decrypt,
+ }, {
+ .base = {
+ .cra_name = "cfb(sm4)",
+ .cra_driver_name = "cfb-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_cfb_encrypt,
+ .decrypt = sm4_cfb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "ctr(sm4)",
+ .cra_driver_name = "ctr-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_ctr_crypt,
+ .decrypt = sm4_ctr_crypt,
}
};
-static int __init sm4_ce_mod_init(void)
+static int __init sm4_init(void)
{
- return crypto_register_alg(&sm4_ce_alg);
+ return crypto_register_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
}
-static void __exit sm4_ce_mod_fini(void)
+static void __exit sm4_exit(void)
{
- crypto_unregister_alg(&sm4_ce_alg);
+ crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
}
-module_cpu_feature_match(SM4, sm4_ce_mod_init);
-module_exit(sm4_ce_mod_fini);
+module_cpu_feature_match(SM4, sm4_init);
+module_exit(sm4_exit);
+
+MODULE_DESCRIPTION("SM4 ECB/CBC/CFB/CTR using ARMv8 Crypto Extensions");
+MODULE_ALIAS_CRYPTO("sm4-ce");
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("ecb(sm4)");
+MODULE_ALIAS_CRYPTO("cbc(sm4)");
+MODULE_ALIAS_CRYPTO("cfb(sm4)");
+MODULE_ALIAS_CRYPTO("ctr(sm4)");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/crypto/sm4-neon-core.S b/arch/arm64/crypto/sm4-neon-core.S
new file mode 100644
index 000000000000..3d5256b354d2
--- /dev/null
+++ b/arch/arm64/crypto/sm4-neon-core.S
@@ -0,0 +1,487 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm for ARMv8 NEON
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2022, Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/* Register macros */
+
+#define RTMP0 v8
+#define RTMP1 v9
+#define RTMP2 v10
+#define RTMP3 v11
+
+#define RX0 v12
+#define RX1 v13
+#define RKEY v14
+#define RIV v15
+
+/* Helper macros. */
+
+#define PREPARE \
+ adr_l x5, crypto_sm4_sbox; \
+ ld1 {v16.16b-v19.16b}, [x5], #64; \
+ ld1 {v20.16b-v23.16b}, [x5], #64; \
+ ld1 {v24.16b-v27.16b}, [x5], #64; \
+ ld1 {v28.16b-v31.16b}, [x5];
+
+#define transpose_4x4(s0, s1, s2, s3) \
+ zip1 RTMP0.4s, s0.4s, s1.4s; \
+ zip1 RTMP1.4s, s2.4s, s3.4s; \
+ zip2 RTMP2.4s, s0.4s, s1.4s; \
+ zip2 RTMP3.4s, s2.4s, s3.4s; \
+ zip1 s0.2d, RTMP0.2d, RTMP1.2d; \
+ zip2 s1.2d, RTMP0.2d, RTMP1.2d; \
+ zip1 s2.2d, RTMP2.2d, RTMP3.2d; \
+ zip2 s3.2d, RTMP2.2d, RTMP3.2d;
+
+#define rotate_clockwise_90(s0, s1, s2, s3) \
+ zip1 RTMP0.4s, s1.4s, s0.4s; \
+ zip2 RTMP1.4s, s1.4s, s0.4s; \
+ zip1 RTMP2.4s, s3.4s, s2.4s; \
+ zip2 RTMP3.4s, s3.4s, s2.4s; \
+ zip1 s0.2d, RTMP2.2d, RTMP0.2d; \
+ zip2 s1.2d, RTMP2.2d, RTMP0.2d; \
+ zip1 s2.2d, RTMP3.2d, RTMP1.2d; \
+ zip2 s3.2d, RTMP3.2d, RTMP1.2d;
+
+#define ROUND4(round, s0, s1, s2, s3) \
+ dup RX0.4s, RKEY.s[round]; \
+ /* rk ^ s1 ^ s2 ^ s3 */ \
+ eor RTMP1.16b, s2.16b, s3.16b; \
+ eor RX0.16b, RX0.16b, s1.16b; \
+ eor RX0.16b, RX0.16b, RTMP1.16b; \
+ \
+ /* sbox, non-linear part */ \
+ movi RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \
+ tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
+ \
+ /* linear part */ \
+ shl RTMP1.4s, RTMP0.4s, #8; \
+ shl RTMP2.4s, RTMP0.4s, #16; \
+ shl RTMP3.4s, RTMP0.4s, #24; \
+ sri RTMP1.4s, RTMP0.4s, #(32-8); \
+ sri RTMP2.4s, RTMP0.4s, #(32-16); \
+ sri RTMP3.4s, RTMP0.4s, #(32-24); \
+ /* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */ \
+ eor RTMP1.16b, RTMP1.16b, RTMP0.16b; \
+ eor RTMP1.16b, RTMP1.16b, RTMP2.16b; \
+ /* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */ \
+ eor RTMP3.16b, RTMP3.16b, RTMP0.16b; \
+ shl RTMP2.4s, RTMP1.4s, 2; \
+ sri RTMP2.4s, RTMP1.4s, #(32-2); \
+ eor RTMP3.16b, RTMP3.16b, RTMP2.16b; \
+ /* s0 ^= RTMP3 */ \
+ eor s0.16b, s0.16b, RTMP3.16b;
+
+#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ \
+ transpose_4x4(b0, b1, b2, b3); \
+ \
+ mov x6, 8; \
+4: \
+ ld1 {RKEY.4s}, [x0], #16; \
+ subs x6, x6, #1; \
+ \
+ ROUND4(0, b0, b1, b2, b3); \
+ ROUND4(1, b1, b2, b3, b0); \
+ ROUND4(2, b2, b3, b0, b1); \
+ ROUND4(3, b3, b0, b1, b2); \
+ \
+ bne 4b; \
+ \
+ rotate_clockwise_90(b0, b1, b2, b3); \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ \
+ /* repoint to rkey */ \
+ sub x0, x0, #128;
+
+#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3) \
+ /* rk ^ s1 ^ s2 ^ s3 */ \
+ dup RX0.4s, RKEY.s[round]; \
+ eor RTMP0.16b, s2.16b, s3.16b; \
+ mov RX1.16b, RX0.16b; \
+ eor RTMP1.16b, t2.16b, t3.16b; \
+ eor RX0.16b, RX0.16b, s1.16b; \
+ eor RX1.16b, RX1.16b, t1.16b; \
+ eor RX0.16b, RX0.16b, RTMP0.16b; \
+ eor RX1.16b, RX1.16b, RTMP1.16b; \
+ \
+ /* sbox, non-linear part */ \
+ movi RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \
+ tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
+ tbl RTMP1.16b, {v16.16b-v19.16b}, RX1.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ sub RX1.16b, RX1.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
+ tbx RTMP1.16b, {v20.16b-v23.16b}, RX1.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ sub RX1.16b, RX1.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
+ tbx RTMP1.16b, {v24.16b-v27.16b}, RX1.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ sub RX1.16b, RX1.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
+ tbx RTMP1.16b, {v28.16b-v31.16b}, RX1.16b; \
+ \
+ /* linear part */ \
+ shl RX0.4s, RTMP0.4s, #8; \
+ shl RX1.4s, RTMP1.4s, #8; \
+ shl RTMP2.4s, RTMP0.4s, #16; \
+ shl RTMP3.4s, RTMP1.4s, #16; \
+ sri RX0.4s, RTMP0.4s, #(32 - 8); \
+ sri RX1.4s, RTMP1.4s, #(32 - 8); \
+ sri RTMP2.4s, RTMP0.4s, #(32 - 16); \
+ sri RTMP3.4s, RTMP1.4s, #(32 - 16); \
+ /* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */ \
+ eor RX0.16b, RX0.16b, RTMP0.16b; \
+ eor RX1.16b, RX1.16b, RTMP1.16b; \
+ eor RX0.16b, RX0.16b, RTMP2.16b; \
+ eor RX1.16b, RX1.16b, RTMP3.16b; \
+ /* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */ \
+ shl RTMP2.4s, RTMP0.4s, #24; \
+ shl RTMP3.4s, RTMP1.4s, #24; \
+ sri RTMP2.4s, RTMP0.4s, #(32 - 24); \
+ sri RTMP3.4s, RTMP1.4s, #(32 - 24); \
+ eor RTMP0.16b, RTMP0.16b, RTMP2.16b; \
+ eor RTMP1.16b, RTMP1.16b, RTMP3.16b; \
+ shl RTMP2.4s, RX0.4s, #2; \
+ shl RTMP3.4s, RX1.4s, #2; \
+ sri RTMP2.4s, RX0.4s, #(32 - 2); \
+ sri RTMP3.4s, RX1.4s, #(32 - 2); \
+ eor RTMP0.16b, RTMP0.16b, RTMP2.16b; \
+ eor RTMP1.16b, RTMP1.16b, RTMP3.16b; \
+ /* s0/t0 ^= RTMP0/1 */ \
+ eor s0.16b, s0.16b, RTMP0.16b; \
+ eor t0.16b, t0.16b, RTMP1.16b;
+
+#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ rev32 b4.16b, b4.16b; \
+ rev32 b5.16b, b5.16b; \
+ rev32 b6.16b, b6.16b; \
+ rev32 b7.16b, b7.16b; \
+ \
+ transpose_4x4(b0, b1, b2, b3); \
+ transpose_4x4(b4, b5, b6, b7); \
+ \
+ mov x6, 8; \
+8: \
+ ld1 {RKEY.4s}, [x0], #16; \
+ subs x6, x6, #1; \
+ \
+ ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7); \
+ ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4); \
+ ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5); \
+ ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6); \
+ \
+ bne 8b; \
+ \
+ rotate_clockwise_90(b0, b1, b2, b3); \
+ rotate_clockwise_90(b4, b5, b6, b7); \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ rev32 b4.16b, b4.16b; \
+ rev32 b5.16b, b5.16b; \
+ rev32 b6.16b, b6.16b; \
+ rev32 b7.16b, b7.16b; \
+ \
+ /* repoint to rkey */ \
+ sub x0, x0, #128;
+
+
+.align 3
+SYM_FUNC_START_LOCAL(__sm4_neon_crypt_blk1_4)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * w3: num blocks (1..4)
+ */
+ PREPARE;
+
+ ld1 {v0.16b}, [x2], #16;
+ mov v1.16b, v0.16b;
+ mov v2.16b, v0.16b;
+ mov v3.16b, v0.16b;
+ cmp w3, #2;
+ blt .Lblk4_load_input_done;
+ ld1 {v1.16b}, [x2], #16;
+ beq .Lblk4_load_input_done;
+ ld1 {v2.16b}, [x2], #16;
+ cmp w3, #3;
+ beq .Lblk4_load_input_done;
+ ld1 {v3.16b}, [x2];
+
+.Lblk4_load_input_done:
+ SM4_CRYPT_BLK4(v0, v1, v2, v3);
+
+ st1 {v0.16b}, [x1], #16;
+ cmp w3, #2;
+ blt .Lblk4_store_output_done;
+ st1 {v1.16b}, [x1], #16;
+ beq .Lblk4_store_output_done;
+ st1 {v2.16b}, [x1], #16;
+ cmp w3, #3;
+ beq .Lblk4_store_output_done;
+ st1 {v3.16b}, [x1];
+
+.Lblk4_store_output_done:
+ ret;
+SYM_FUNC_END(__sm4_neon_crypt_blk1_4)
+
+.align 3
+SYM_FUNC_START(sm4_neon_crypt_blk1_8)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * w3: num blocks (1..8)
+ */
+ cmp w3, #5;
+ blt __sm4_neon_crypt_blk1_4;
+
+ PREPARE;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ ld1 {v4.16b}, [x2], #16;
+ mov v5.16b, v4.16b;
+ mov v6.16b, v4.16b;
+ mov v7.16b, v4.16b;
+ beq .Lblk8_load_input_done;
+ ld1 {v5.16b}, [x2], #16;
+ cmp w3, #7;
+ blt .Lblk8_load_input_done;
+ ld1 {v6.16b}, [x2], #16;
+ beq .Lblk8_load_input_done;
+ ld1 {v7.16b}, [x2];
+
+.Lblk8_load_input_done:
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ cmp w3, #6;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ st1 {v4.16b}, [x1], #16;
+ blt .Lblk8_store_output_done;
+ st1 {v5.16b}, [x1], #16;
+ beq .Lblk8_store_output_done;
+ st1 {v6.16b}, [x1], #16;
+ cmp w3, #7;
+ beq .Lblk8_store_output_done;
+ st1 {v7.16b}, [x1];
+
+.Lblk8_store_output_done:
+ ret;
+SYM_FUNC_END(sm4_neon_crypt_blk1_8)
+
+.align 3
+SYM_FUNC_START(sm4_neon_crypt_blk8)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * w3: nblocks (multiples of 8)
+ */
+ PREPARE;
+
+.Lcrypt_loop_blk:
+ subs w3, w3, #8;
+ bmi .Lcrypt_end;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ ld1 {v4.16b-v7.16b}, [x2], #64;
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ b .Lcrypt_loop_blk;
+
+.Lcrypt_end:
+ ret;
+SYM_FUNC_END(sm4_neon_crypt_blk8)
+
+.align 3
+SYM_FUNC_START(sm4_neon_cbc_dec_blk8)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks (multiples of 8)
+ */
+ PREPARE;
+
+ ld1 {RIV.16b}, [x3];
+
+.Lcbc_loop_blk:
+ subs w4, w4, #8;
+ bmi .Lcbc_end;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ ld1 {v4.16b-v7.16b}, [x2];
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ sub x2, x2, #64;
+ eor v0.16b, v0.16b, RIV.16b;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v1.16b, v1.16b, RTMP0.16b;
+ eor v2.16b, v2.16b, RTMP1.16b;
+ eor v3.16b, v3.16b, RTMP2.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ eor v4.16b, v4.16b, RTMP3.16b;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v5.16b, v5.16b, RTMP0.16b;
+ eor v6.16b, v6.16b, RTMP1.16b;
+ eor v7.16b, v7.16b, RTMP2.16b;
+
+ mov RIV.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ b .Lcbc_loop_blk;
+
+.Lcbc_end:
+ /* store new IV */
+ st1 {RIV.16b}, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_neon_cbc_dec_blk8)
+
+.align 3
+SYM_FUNC_START(sm4_neon_cfb_dec_blk8)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks (multiples of 8)
+ */
+ PREPARE;
+
+ ld1 {v0.16b}, [x3];
+
+.Lcfb_loop_blk:
+ subs w4, w4, #8;
+ bmi .Lcfb_end;
+
+ ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48;
+ ld1 {v4.16b-v7.16b}, [x2];
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ sub x2, x2, #48;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v4.16b, v4.16b, RTMP0.16b;
+ eor v5.16b, v5.16b, RTMP1.16b;
+ eor v6.16b, v6.16b, RTMP2.16b;
+ eor v7.16b, v7.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ mov v0.16b, RTMP3.16b;
+
+ b .Lcfb_loop_blk;
+
+.Lcfb_end:
+ /* store new IV */
+ st1 {v0.16b}, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_neon_cfb_dec_blk8)
+
+.align 3
+SYM_FUNC_START(sm4_neon_ctr_enc_blk8)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: ctr (big endian, 128 bit)
+ * w4: nblocks (multiples of 8)
+ */
+ PREPARE;
+
+ ldp x7, x8, [x3];
+ rev x7, x7;
+ rev x8, x8;
+
+.Lctr_loop_blk:
+ subs w4, w4, #8;
+ bmi .Lctr_end;
+
+#define inc_le128(vctr) \
+ mov vctr.d[1], x8; \
+ mov vctr.d[0], x7; \
+ adds x8, x8, #1; \
+ adc x7, x7, xzr; \
+ rev64 vctr.16b, vctr.16b;
+
+ /* construct CTRs */
+ inc_le128(v0); /* +0 */
+ inc_le128(v1); /* +1 */
+ inc_le128(v2); /* +2 */
+ inc_le128(v3); /* +3 */
+ inc_le128(v4); /* +4 */
+ inc_le128(v5); /* +5 */
+ inc_le128(v6); /* +6 */
+ inc_le128(v7); /* +7 */
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v4.16b, v4.16b, RTMP0.16b;
+ eor v5.16b, v5.16b, RTMP1.16b;
+ eor v6.16b, v6.16b, RTMP2.16b;
+ eor v7.16b, v7.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ b .Lctr_loop_blk;
+
+.Lctr_end:
+ /* store new CTR */
+ rev x7, x7;
+ rev x8, x8;
+ stp x7, x8, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_neon_ctr_enc_blk8)
diff --git a/arch/arm64/crypto/sm4-neon-glue.c b/arch/arm64/crypto/sm4-neon-glue.c
new file mode 100644
index 000000000000..03a6a6866a31
--- /dev/null
+++ b/arch/arm64/crypto/sm4-neon-glue.c
@@ -0,0 +1,442 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, using ARMv8 NEON
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2022, Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/sm4.h>
+
+#define BYTES2BLKS(nbytes) ((nbytes) >> 4)
+#define BYTES2BLK8(nbytes) (((nbytes) >> 4) & ~(8 - 1))
+
+asmlinkage void sm4_neon_crypt_blk1_8(const u32 *rkey, u8 *dst, const u8 *src,
+ unsigned int nblks);
+asmlinkage void sm4_neon_crypt_blk8(const u32 *rkey, u8 *dst, const u8 *src,
+ unsigned int nblks);
+asmlinkage void sm4_neon_cbc_dec_blk8(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+asmlinkage void sm4_neon_cfb_dec_blk8(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+asmlinkage void sm4_neon_ctr_enc_blk8(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+
+static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_expandkey(ctx, key, key_len);
+}
+
+static int sm4_ecb_do_crypt(struct skcipher_request *req, const u32 *rkey)
+{
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLK8(nbytes);
+ if (nblks) {
+ sm4_neon_crypt_blk8(rkey, dst, src, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_neon_crypt_blk1_8(rkey, dst, src, nblks);
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_ecb_do_crypt(req, ctx->rkey_enc);
+}
+
+static int sm4_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_ecb_do_crypt(req, ctx->rkey_dec);
+}
+
+static int sm4_cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *iv = walk.iv;
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ crypto_xor_cpy(dst, src, iv, SM4_BLOCK_SIZE);
+ sm4_crypt_block(ctx->rkey_enc, dst, dst);
+ iv = dst;
+ src += SM4_BLOCK_SIZE;
+ dst += SM4_BLOCK_SIZE;
+ nbytes -= SM4_BLOCK_SIZE;
+ }
+ if (iv != walk.iv)
+ memcpy(walk.iv, iv, SM4_BLOCK_SIZE);
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLK8(nbytes);
+ if (nblks) {
+ sm4_neon_cbc_dec_blk8(ctx->rkey_dec, dst, src,
+ walk.iv, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ u8 keystream[SM4_BLOCK_SIZE * 8];
+ u8 iv[SM4_BLOCK_SIZE];
+ int i;
+
+ sm4_neon_crypt_blk1_8(ctx->rkey_dec, keystream,
+ src, nblks);
+
+ src += ((int)nblks - 2) * SM4_BLOCK_SIZE;
+ dst += (nblks - 1) * SM4_BLOCK_SIZE;
+ memcpy(iv, src + SM4_BLOCK_SIZE, SM4_BLOCK_SIZE);
+
+ for (i = nblks - 1; i > 0; i--) {
+ crypto_xor_cpy(dst, src,
+ &keystream[i * SM4_BLOCK_SIZE],
+ SM4_BLOCK_SIZE);
+ src -= SM4_BLOCK_SIZE;
+ dst -= SM4_BLOCK_SIZE;
+ }
+ crypto_xor_cpy(dst, walk.iv,
+ keystream, SM4_BLOCK_SIZE);
+ memcpy(walk.iv, iv, SM4_BLOCK_SIZE);
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_cfb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+ const u8 *iv = walk.iv;
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ sm4_crypt_block(ctx->rkey_enc, keystream, iv);
+ crypto_xor_cpy(dst, src, keystream, SM4_BLOCK_SIZE);
+ iv = dst;
+ src += SM4_BLOCK_SIZE;
+ dst += SM4_BLOCK_SIZE;
+ nbytes -= SM4_BLOCK_SIZE;
+ }
+ if (iv != walk.iv)
+ memcpy(walk.iv, iv, SM4_BLOCK_SIZE);
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_cfb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLK8(nbytes);
+ if (nblks) {
+ sm4_neon_cfb_dec_blk8(ctx->rkey_enc, dst, src,
+ walk.iv, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ u8 keystream[SM4_BLOCK_SIZE * 8];
+
+ memcpy(keystream, walk.iv, SM4_BLOCK_SIZE);
+ if (nblks > 1)
+ memcpy(&keystream[SM4_BLOCK_SIZE], src,
+ (nblks - 1) * SM4_BLOCK_SIZE);
+ memcpy(walk.iv, src + (nblks - 1) * SM4_BLOCK_SIZE,
+ SM4_BLOCK_SIZE);
+
+ sm4_neon_crypt_blk1_8(ctx->rkey_enc, keystream,
+ keystream, nblks);
+
+ crypto_xor_cpy(dst, src, keystream,
+ nblks * SM4_BLOCK_SIZE);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_ctr_crypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLK8(nbytes);
+ if (nblks) {
+ sm4_neon_ctr_enc_blk8(ctx->rkey_enc, dst, src,
+ walk.iv, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ u8 keystream[SM4_BLOCK_SIZE * 8];
+ int i;
+
+ for (i = 0; i < nblks; i++) {
+ memcpy(&keystream[i * SM4_BLOCK_SIZE],
+ walk.iv, SM4_BLOCK_SIZE);
+ crypto_inc(walk.iv, SM4_BLOCK_SIZE);
+ }
+ sm4_neon_crypt_blk1_8(ctx->rkey_enc, keystream,
+ keystream, nblks);
+
+ crypto_xor_cpy(dst, src, keystream,
+ nblks * SM4_BLOCK_SIZE);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_inc(walk.iv, SM4_BLOCK_SIZE);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static struct skcipher_alg sm4_algs[] = {
+ {
+ .base = {
+ .cra_name = "ecb(sm4)",
+ .cra_driver_name = "ecb-sm4-neon",
+ .cra_priority = 200,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_ecb_encrypt,
+ .decrypt = sm4_ecb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "cbc(sm4)",
+ .cra_driver_name = "cbc-sm4-neon",
+ .cra_priority = 200,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_cbc_encrypt,
+ .decrypt = sm4_cbc_decrypt,
+ }, {
+ .base = {
+ .cra_name = "cfb(sm4)",
+ .cra_driver_name = "cfb-sm4-neon",
+ .cra_priority = 200,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_cfb_encrypt,
+ .decrypt = sm4_cfb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "ctr(sm4)",
+ .cra_driver_name = "ctr-sm4-neon",
+ .cra_priority = 200,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_ctr_crypt,
+ .decrypt = sm4_ctr_crypt,
+ }
+};
+
+static int __init sm4_init(void)
+{
+ return crypto_register_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
+}
+
+static void __exit sm4_exit(void)
+{
+ crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
+}
+
+module_init(sm4_init);
+module_exit(sm4_exit);
+
+MODULE_DESCRIPTION("SM4 ECB/CBC/CFB/CTR using ARMv8 NEON");
+MODULE_ALIAS_CRYPTO("sm4-neon");
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("ecb(sm4)");
+MODULE_ALIAS_CRYPTO("cbc(sm4)");
+MODULE_ALIAS_CRYPTO("cfb(sm4)");
+MODULE_ALIAS_CRYPTO("ctr(sm4)");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_LICENSE("GPL v2");