Diffstat (limited to 'arch/powerpc/crypto')
-rw-r--r--  arch/powerpc/crypto/.gitignore                |    2
-rw-r--r--  arch/powerpc/crypto/Kconfig                   |   84
-rw-r--r--  arch/powerpc/crypto/Makefile                  |   14
-rw-r--r--  arch/powerpc/crypto/aes-gcm-p10-glue.c        |  156
-rw-r--r--  arch/powerpc/crypto/aes-gcm-p10.S             | 2421
-rw-r--r--  arch/powerpc/crypto/aes.c                     |    8
-rw-r--r--  arch/powerpc/crypto/aes_cbc.c                 |    4
-rw-r--r--  arch/powerpc/crypto/aes_ctr.c                 |    6
-rw-r--r--  arch/powerpc/crypto/aes_xts.c                 |    4
-rw-r--r--  arch/powerpc/crypto/chacha-p10-glue.c         |  221
-rw-r--r--  arch/powerpc/crypto/chacha-p10le-8x.S         |  842
-rw-r--r--  arch/powerpc/crypto/crc-vpmsum_test.c         |  133
-rw-r--r--  arch/powerpc/crypto/crc32-vpmsum_core.S       |  746
-rw-r--r--  arch/powerpc/crypto/crc32c-vpmsum_asm.S       |  842
-rw-r--r--  arch/powerpc/crypto/crc32c-vpmsum_glue.c      |  173
-rw-r--r--  arch/powerpc/crypto/crct10dif-vpmsum_asm.S    |  845
-rw-r--r--  arch/powerpc/crypto/crct10dif-vpmsum_glue.c   |  126
-rw-r--r--  arch/powerpc/crypto/curve25519-ppc64le-core.c |  300
-rw-r--r--  arch/powerpc/crypto/curve25519-ppc64le_asm.S  |  671
-rw-r--r--  arch/powerpc/crypto/ghash.c                   |   91
-rw-r--r--  arch/powerpc/crypto/md5-glue.c                |   99
-rw-r--r--  arch/powerpc/crypto/poly1305-p10-glue.c       |  186
-rw-r--r--  arch/powerpc/crypto/poly1305-p10le_64.S       | 1075
-rw-r--r--  arch/powerpc/crypto/sha1-spe-glue.c           |  130
-rw-r--r--  arch/powerpc/crypto/sha1.c                    |  101
-rw-r--r--  arch/powerpc/crypto/sha256-spe-asm.S          |  318
-rw-r--r--  arch/powerpc/crypto/sha256-spe-glue.c         |  235
-rw-r--r--  arch/powerpc/crypto/vmx.c                     |    2
28 files changed, 2294 insertions(+), 7541 deletions(-)
diff --git a/arch/powerpc/crypto/.gitignore b/arch/powerpc/crypto/.gitignore
index e1094f08f713..e9fe73aac8b6 100644
--- a/arch/powerpc/crypto/.gitignore
+++ b/arch/powerpc/crypto/.gitignore
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
aesp10-ppc.S
+aesp8-ppc.S
ghashp10-ppc.S
+ghashp8-ppc.S
diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 1e201b7ae2fc..caaa359f4742 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -2,42 +2,21 @@
menu "Accelerated Cryptographic Algorithms for CPU (powerpc)"
-config CRYPTO_CRC32C_VPMSUM
- tristate "CRC32c"
- depends on PPC64 && ALTIVEC
- select CRYPTO_HASH
- select CRC32
- help
- CRC32c CRC algorithm with the iSCSI polynomial (RFC 3385 and RFC 3720)
-
- Architecture: powerpc64 using
- - AltiVec extensions
-
- Enable on POWER8 and newer processors for improved performance.
-
-config CRYPTO_CRCT10DIF_VPMSUM
- tristate "CRC32T10DIF"
- depends on PPC64 && ALTIVEC && CRC_T10DIF
- select CRYPTO_HASH
+config CRYPTO_CURVE25519_PPC64
+ tristate
+ depends on PPC64 && CPU_LITTLE_ENDIAN
+ select CRYPTO_KPP
+ select CRYPTO_LIB_CURVE25519_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+ default CRYPTO_LIB_CURVE25519_INTERNAL
help
- CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF)
-
- Architecture: powerpc64 using
- - AltiVec extensions
-
- Enable on POWER8 and newer processors for improved performance.
+ Curve25519 algorithm
-config CRYPTO_VPMSUM_TESTER
- tristate "CRC32c and CRC32T10DIF hardware acceleration tester"
- depends on CRYPTO_CRCT10DIF_VPMSUM && CRYPTO_CRC32C_VPMSUM
- help
- Stress test for CRC32c and CRCT10DIF algorithms implemented with
- powerpc64 AltiVec extensions (POWER8 vpmsum instructions).
- Unless you are testing these algorithms, you don't need this.
+ Architecture: PowerPC64
+ - Little-endian
config CRYPTO_MD5_PPC
tristate "Digests: MD5"
- depends on PPC
select CRYPTO_HASH
help
MD5 message digest algorithm (RFC1321)
@@ -46,7 +25,6 @@ config CRYPTO_MD5_PPC
config CRYPTO_SHA1_PPC
tristate "Hash functions: SHA-1"
- depends on PPC
help
SHA-1 secure hash algorithm (FIPS 180)
@@ -54,27 +32,16 @@ config CRYPTO_SHA1_PPC
config CRYPTO_SHA1_PPC_SPE
tristate "Hash functions: SHA-1 (SPE)"
- depends on PPC && SPE
+ depends on SPE
help
SHA-1 secure hash algorithm (FIPS 180)
Architecture: powerpc using
- SPE (Signal Processing Engine) extensions
-config CRYPTO_SHA256_PPC_SPE
- tristate "Hash functions: SHA-224 and SHA-256 (SPE)"
- depends on PPC && SPE
- select CRYPTO_SHA256
- select CRYPTO_HASH
- help
- SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
- Architecture: powerpc using
- - SPE (Signal Processing Engine) extensions
-
config CRYPTO_AES_PPC_SPE
tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (SPE)"
- depends on PPC && SPE
+ depends on SPE
select CRYPTO_SKCIPHER
help
Block ciphers: AES cipher algorithms (FIPS-197)
@@ -101,6 +68,7 @@ config CRYPTO_AES_GCM_P10
select CRYPTO_ALGAPI
select CRYPTO_AEAD
select CRYPTO_SKCIPHER
+ select CRYPTO_SIMD
help
AEAD cipher: AES cipher algorithms (FIPS-197)
GCM (Galois/Counter Mode) authenticated encryption mode (NIST SP800-38D)
@@ -111,32 +79,6 @@ config CRYPTO_AES_GCM_P10
Support for cryptographic acceleration instructions on Power10 or
later CPU. This module supports stitched acceleration for AES/GCM.
-config CRYPTO_CHACHA20_P10
- tristate "Ciphers: ChaCha20, XChacha20, XChacha12 (P10 or later)"
- depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
- select CRYPTO_SKCIPHER
- select CRYPTO_LIB_CHACHA_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
- help
- Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
- stream cipher algorithms
-
- Architecture: PowerPC64
- - Power10 or later
- - Little-endian
-
-config CRYPTO_POLY1305_P10
- tristate "Hash functions: Poly1305 (P10 or later)"
- depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
- select CRYPTO_HASH
- select CRYPTO_LIB_POLY1305_GENERIC
- help
- Poly1305 authenticator algorithm (RFC7539)
-
- Architecture: PowerPC64
- - Power10 or later
- - Little-endian
-
config CRYPTO_DEV_VMX
bool "Support for VMX cryptographic acceleration instructions"
depends on PPC64 && VSX
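
CRYPTO_CURVE25519_PPC64 is an invisible tristate: nothing enables it by name; it defaults on with CRYPTO_LIB_CURVE25519_INTERNAL, and selecting CRYPTO_ARCH_HAVE_LIB_CURVE25519 is what routes the Curve25519 library entry point to this implementation. A minimal sketch of that dispatch, modelled on include/crypto/curve25519.h (the exact inline in the tree may differ):

	/*
	 * Sketch of the lib/crypto dispatch this Kconfig wiring enables
	 * (modelled on include/crypto/curve25519.h; details may differ).
	 */
	#include <crypto/curve25519.h>

	static void curve25519_dispatch_sketch(u8 out[CURVE25519_KEY_SIZE],
					       const u8 secret[CURVE25519_KEY_SIZE],
					       const u8 basepoint[CURVE25519_KEY_SIZE])
	{
		if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519))
			curve25519_arch(out, secret, basepoint);	/* ppc64le asm path */
		else
			curve25519_generic(out, secret, basepoint);	/* portable C */
	}
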
diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index fca0e9739869..8c2936ae466f 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -9,26 +9,17 @@ obj-$(CONFIG_CRYPTO_AES_PPC_SPE) += aes-ppc-spe.o
obj-$(CONFIG_CRYPTO_MD5_PPC) += md5-ppc.o
obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o
obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o
-obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o
-obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o
-obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o
-obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o
obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
-obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o
-obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o
+obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o
aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
md5-ppc-y := md5-asm.o md5-glue.o
sha1-powerpc-y := sha1-powerpc-asm.o sha1.o
sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o
-sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o
-crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o
-crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o
aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o
-chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
-poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o
+curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o
ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
override flavour := linux-ppc64le
@@ -54,3 +45,4 @@ $(obj)/aesp8-ppc.S $(obj)/ghashp8-ppc.S: $(obj)/%.S: $(src)/%.pl FORCE
OBJECT_FILES_NON_STANDARD_aesp10-ppc.o := y
OBJECT_FILES_NON_STANDARD_ghashp10-ppc.o := y
OBJECT_FILES_NON_STANDARD_aesp8-ppc.o := y
+OBJECT_FILES_NON_STANDARD_ghashp8-ppc.o := y
diff --git a/arch/powerpc/crypto/aes-gcm-p10-glue.c b/arch/powerpc/crypto/aes-gcm-p10-glue.c
index f62ee54076c0..85f4fd4b1bdc 100644
--- a/arch/powerpc/crypto/aes-gcm-p10-glue.c
+++ b/arch/powerpc/crypto/aes-gcm-p10-glue.c
@@ -5,9 +5,10 @@
* Copyright 2022- IBM Inc. All rights reserved
*/
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <asm/simd.h>
#include <asm/switch_to.h>
+#include <crypto/gcm.h>
#include <crypto/aes.h>
#include <crypto/algapi.h>
#include <crypto/b128ops.h>
@@ -24,6 +25,7 @@
#define PPC_ALIGN 16
#define GCM_IV_SIZE 12
+#define RFC4106_NONCE_SIZE 4
MODULE_DESCRIPTION("PPC64le AES-GCM with Stitched implementation");
MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com");
@@ -31,15 +33,16 @@ MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("aes");
asmlinkage int aes_p10_set_encrypt_key(const u8 *userKey, const int bits,
- void *key);
+ void *key);
asmlinkage void aes_p10_encrypt(const u8 *in, u8 *out, const void *key);
-asmlinkage void aes_p10_gcm_encrypt(u8 *in, u8 *out, size_t len,
+asmlinkage void aes_p10_gcm_encrypt(const u8 *in, u8 *out, size_t len,
void *rkey, u8 *iv, void *Xi);
-asmlinkage void aes_p10_gcm_decrypt(u8 *in, u8 *out, size_t len,
+asmlinkage void aes_p10_gcm_decrypt(const u8 *in, u8 *out, size_t len,
void *rkey, u8 *iv, void *Xi);
asmlinkage void gcm_init_htable(unsigned char htable[], unsigned char Xi[]);
asmlinkage void gcm_ghash_p10(unsigned char *Xi, unsigned char *Htable,
- unsigned char *aad, unsigned int alen);
+ unsigned char *aad, unsigned int alen);
+asmlinkage void gcm_update(u8 *iv, void *Xi);
struct aes_key {
u8 key[AES_MAX_KEYLENGTH];
@@ -52,6 +55,7 @@ struct gcm_ctx {
u8 aad_hash[16];
u64 aadLen;
u64 Plen; /* offset 56 - used in aes_p10_gcm_{en/de}crypt */
+ u8 pblock[16];
};
struct Hash_ctx {
u8 H[16]; /* subkey */
@@ -60,17 +64,20 @@ struct Hash_ctx {
struct p10_aes_gcm_ctx {
struct aes_key enc_key;
+ u8 nonce[RFC4106_NONCE_SIZE];
};
static void vsx_begin(void)
{
preempt_disable();
+ pagefault_disable();
enable_kernel_vsx();
}
static void vsx_end(void)
{
disable_kernel_vsx();
+ pagefault_enable();
preempt_enable();
}
@@ -185,7 +192,7 @@ static int set_authsize(struct crypto_aead *tfm, unsigned int authsize)
}
static int p10_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key,
- unsigned int keylen)
+ unsigned int keylen)
{
struct crypto_tfm *tfm = crypto_aead_tfm(aead);
struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm);
@@ -198,7 +205,8 @@ static int p10_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key,
return ret ? -EINVAL : 0;
}
-static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
+static int p10_aes_gcm_crypt(struct aead_request *req, u8 *riv,
+ int assoclen, int enc)
{
struct crypto_tfm *tfm = req->base.tfm;
struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm);
@@ -206,11 +214,9 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
struct gcm_ctx *gctx = PTR_ALIGN((void *)databuf, PPC_ALIGN);
u8 hashbuf[sizeof(struct Hash_ctx) + PPC_ALIGN];
struct Hash_ctx *hash = PTR_ALIGN((void *)hashbuf, PPC_ALIGN);
- struct scatter_walk assoc_sg_walk;
struct skcipher_walk walk;
u8 *assocmem = NULL;
u8 *assoc;
- unsigned int assoclen = req->assoclen;
unsigned int cryptlen = req->cryptlen;
unsigned char ivbuf[AES_BLOCK_SIZE+PPC_ALIGN];
unsigned char *iv = PTR_ALIGN((void *)ivbuf, PPC_ALIGN);
@@ -218,16 +224,16 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
unsigned long auth_tag_len = crypto_aead_authsize(__crypto_aead_cast(tfm));
u8 otag[16];
int total_processed = 0;
+ int nbytes;
memset(databuf, 0, sizeof(databuf));
memset(hashbuf, 0, sizeof(hashbuf));
memset(ivbuf, 0, sizeof(ivbuf));
- memcpy(iv, req->iv, GCM_IV_SIZE);
+ memcpy(iv, riv, GCM_IV_SIZE);
/* Linearize assoc, if not already linear */
if (req->src->length >= assoclen && req->src->length) {
- scatterwalk_start(&assoc_sg_walk, req->src);
- assoc = scatterwalk_map(&assoc_sg_walk);
+ assoc = sg_virt(req->src); /* ppc64 is !HIGHMEM */
} else {
gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?
GFP_KERNEL : GFP_ATOMIC;
@@ -245,10 +251,7 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
gcmp10_init(gctx, iv, (unsigned char *) &ctx->enc_key, hash, assoc, assoclen);
vsx_end();
- if (!assocmem)
- scatterwalk_unmap(assoc);
- else
- kfree(assocmem);
+ kfree(assocmem);
if (enc)
ret = skcipher_walk_aead_encrypt(&walk, req, false);
@@ -257,19 +260,25 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
if (ret)
return ret;
- while (walk.nbytes > 0 && ret == 0) {
+ while ((nbytes = walk.nbytes) > 0 && ret == 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ u8 buf[AES_BLOCK_SIZE];
+
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
+ src = dst = memcpy(buf, src, nbytes);
vsx_begin();
if (enc)
- aes_p10_gcm_encrypt(walk.src.virt.addr,
- walk.dst.virt.addr,
- walk.nbytes,
+ aes_p10_gcm_encrypt(src, dst, nbytes,
&ctx->enc_key, gctx->iv, hash->Htable);
else
- aes_p10_gcm_decrypt(walk.src.virt.addr,
- walk.dst.virt.addr,
- walk.nbytes,
+ aes_p10_gcm_decrypt(src, dst, nbytes,
&ctx->enc_key, gctx->iv, hash->Htable);
+
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr, buf, nbytes);
+
vsx_end();
total_processed += walk.nbytes;
@@ -281,6 +290,7 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
/* Finalize hash */
vsx_begin();
+ gcm_update(gctx->iv, hash->Htable);
finish_tag(gctx, hash, total_processed);
vsx_end();
@@ -302,17 +312,63 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
return 0;
}
+static int rfc4106_setkey(struct crypto_aead *tfm, const u8 *inkey,
+ unsigned int keylen)
+{
+ struct p10_aes_gcm_ctx *ctx = crypto_aead_ctx(tfm);
+ int err;
+
+ keylen -= RFC4106_NONCE_SIZE;
+ err = p10_aes_gcm_setkey(tfm, inkey, keylen);
+ if (err)
+ return err;
+
+ memcpy(ctx->nonce, inkey + keylen, RFC4106_NONCE_SIZE);
+ return 0;
+}
+
+static int rfc4106_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
+{
+ return crypto_rfc4106_check_authsize(authsize);
+}
+
+static int rfc4106_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct p10_aes_gcm_ctx *ctx = crypto_aead_ctx(aead);
+ u8 iv[AES_BLOCK_SIZE];
+
+ memcpy(iv, ctx->nonce, RFC4106_NONCE_SIZE);
+ memcpy(iv + RFC4106_NONCE_SIZE, req->iv, GCM_RFC4106_IV_SIZE);
+
+ return crypto_ipsec_check_assoclen(req->assoclen) ?:
+ p10_aes_gcm_crypt(req, iv, req->assoclen - GCM_RFC4106_IV_SIZE, 1);
+}
+
+static int rfc4106_decrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct p10_aes_gcm_ctx *ctx = crypto_aead_ctx(aead);
+ u8 iv[AES_BLOCK_SIZE];
+
+ memcpy(iv, ctx->nonce, RFC4106_NONCE_SIZE);
+ memcpy(iv + RFC4106_NONCE_SIZE, req->iv, GCM_RFC4106_IV_SIZE);
+
+ return crypto_ipsec_check_assoclen(req->assoclen) ?:
+ p10_aes_gcm_crypt(req, iv, req->assoclen - GCM_RFC4106_IV_SIZE, 0);
+}
+
static int p10_aes_gcm_encrypt(struct aead_request *req)
{
- return p10_aes_gcm_crypt(req, 1);
+ return p10_aes_gcm_crypt(req, req->iv, req->assoclen, 1);
}
static int p10_aes_gcm_decrypt(struct aead_request *req)
{
- return p10_aes_gcm_crypt(req, 0);
+ return p10_aes_gcm_crypt(req, req->iv, req->assoclen, 0);
}
-static struct aead_alg gcm_aes_alg = {
+static struct aead_alg gcm_aes_algs[] = {{
.ivsize = GCM_IV_SIZE,
.maxauthsize = 16,
@@ -321,23 +377,57 @@ static struct aead_alg gcm_aes_alg = {
.encrypt = p10_aes_gcm_encrypt,
.decrypt = p10_aes_gcm_decrypt,
- .base.cra_name = "gcm(aes)",
- .base.cra_driver_name = "aes_gcm_p10",
+ .base.cra_name = "__gcm(aes)",
+ .base.cra_driver_name = "__aes_gcm_p10",
.base.cra_priority = 2100,
.base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct p10_aes_gcm_ctx),
+ .base.cra_ctxsize = sizeof(struct p10_aes_gcm_ctx)+
+ 4 * sizeof(u64[2]),
.base.cra_module = THIS_MODULE,
-};
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+}, {
+ .ivsize = GCM_RFC4106_IV_SIZE,
+ .maxauthsize = 16,
+ .setkey = rfc4106_setkey,
+ .setauthsize = rfc4106_setauthsize,
+ .encrypt = rfc4106_encrypt,
+ .decrypt = rfc4106_decrypt,
+
+ .base.cra_name = "__rfc4106(gcm(aes))",
+ .base.cra_driver_name = "__rfc4106_aes_gcm_p10",
+ .base.cra_priority = 2100,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct p10_aes_gcm_ctx) +
+ 4 * sizeof(u64[2]),
+ .base.cra_module = THIS_MODULE,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+}};
+
+static struct simd_aead_alg *p10_simd_aeads[ARRAY_SIZE(gcm_aes_algs)];
static int __init p10_init(void)
{
- return crypto_register_aead(&gcm_aes_alg);
+ int ret;
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_31))
+ return 0;
+
+ ret = simd_register_aeads_compat(gcm_aes_algs,
+ ARRAY_SIZE(gcm_aes_algs),
+ p10_simd_aeads);
+ if (ret) {
+ simd_unregister_aeads(gcm_aes_algs, ARRAY_SIZE(gcm_aes_algs),
+ p10_simd_aeads);
+ return ret;
+ }
+ return 0;
}
static void __exit p10_exit(void)
{
- crypto_unregister_aead(&gcm_aes_alg);
+ simd_unregister_aeads(gcm_aes_algs, ARRAY_SIZE(gcm_aes_algs),
+ p10_simd_aeads);
}
-module_cpu_feature_match(PPC_MODULE_FEATURE_P10, p10_init);
+module_init(p10_init);
module_exit(p10_exit);
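
With registration moved to the crypto SIMD wrappers, Power10 systems expose "gcm(aes)" and "rfc4106(gcm(aes))" at priority 2100, and rfc4106_setkey() above expects the 4-byte nonce salt appended to the AES key, as RFC 4106 specifies. A minimal consumer-side sketch of that key layout (illustrative only; error checks on setkey/setauthsize elided):

	#include <crypto/aead.h>
	#include <linux/err.h>
	#include <linux/string.h>

	static int rfc4106_usage_sketch(const u8 *aes_key /* 16 bytes here */,
					const u8 salt[4])
	{
		u8 key[32 + 4];
		struct crypto_aead *tfm;

		tfm = crypto_alloc_aead("rfc4106(gcm(aes))", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		/* RFC 4106 keys carry the 4-byte nonce salt after the AES key,
		 * which is why rfc4106_setkey() strips RFC4106_NONCE_SIZE. */
		memcpy(key, aes_key, 16);
		memcpy(key + 16, salt, 4);
		crypto_aead_setkey(tfm, key, 16 + 4);
		crypto_aead_setauthsize(tfm, 16);

		crypto_free_aead(tfm);
		return 0;
	}
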
diff --git a/arch/powerpc/crypto/aes-gcm-p10.S b/arch/powerpc/crypto/aes-gcm-p10.S
index a51f4b265308..89f50eef3512 100644
--- a/arch/powerpc/crypto/aes-gcm-p10.S
+++ b/arch/powerpc/crypto/aes-gcm-p10.S
@@ -1,42 +1,42 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
- #
- # Accelerated AES-GCM stitched implementation for ppc64le.
- #
- # Copyright 2022- IBM Inc. All rights reserved
- #
- #===================================================================================
- # Written by Danny Tsen <dtsen@linux.ibm.com>
- #
- # GHASH is based on the Karatsuba multiplication method.
- #
- # Xi xor X1
- #
- # X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H =
- # (X1.h * H4.h + xX.l * H4.l + X1 * H4) +
- # (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
- # (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
- # (X4.h * H.h + X4.l * H.l + X4 * H)
- #
- # Xi = v0
- # H Poly = v2
- # Hash keys = v3 - v14
- # ( H.l, H, H.h)
- # ( H^2.l, H^2, H^2.h)
- # ( H^3.l, H^3, H^3.h)
- # ( H^4.l, H^4, H^4.h)
- #
- # v30 is IV
- # v31 - counter 1
- #
- # AES used,
- # vs0 - vs14 for round keys
- # v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
- #
- # This implementation uses stitched AES-GCM approach to improve overall performance.
- # AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
- #
- # ===================================================================================
- #
+#
+# Accelerated AES-GCM stitched implementation for ppc64le.
+#
+# Copyright 2024- IBM Inc.
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# GHASH is based on the Karatsuba multiplication method.
+#
+# Xi xor X1
+#
+# X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H =
+# (X1.h * H4.h + X1.l * H4.l + X1 * H4) +
+# (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
+# (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
+# (X4.h * H.h + X4.l * H.l + X4 * H)
+#
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+# ( H.l, H, H.h)
+# ( H^2.l, H^2, H^2.h)
+# ( H^3.l, H^3, H^3.h)
+# ( H^4.l, H^4, H^4.h)
+#
+# v30 is IV
+# v31 - counter 1
+#
+# AES used,
+# vs0 - round key 0
+# v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
+#
+# This implementation uses stitched AES-GCM approach to improve overall performance.
+# AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
+#
+# ===================================================================================
+#
#include <asm/ppc_asm.h>
#include <linux/linkage.h>
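
The expression in the header comment is the standard aggregated-reduction identity for GHASH, stated here for reference (textbook GF(2^128) arithmetic, not taken from the patch). With running digest Y_0 and four fresh blocks X_1..X_4:

	Y_i = (Y_{i-1} \oplus X_i) \cdot H
	\implies Y_4 = (Y_0 \oplus X_1) H^4 \oplus X_2 H^3 \oplus X_3 H^2 \oplus X_4 H

Folding four chained multiplications into independent ones is what lets the 4x/8x paths below hash a whole batch per reduction; each product is then assembled Karatsuba-style from the low, middle, and high doubleword carry-less multiplies, matching the L, M, and H terms in PPC_GHASH4x.
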
@@ -44,483 +44,224 @@
.machine "any"
.text
- # 4x loops
- # v15 - v18 - input states
- # vs1 - vs9 - round keys
- #
-.macro Loop_aes_middle4x
- xxlor 19+32, 1, 1
- xxlor 20+32, 2, 2
- xxlor 21+32, 3, 3
- xxlor 22+32, 4, 4
-
- vcipher 15, 15, 19
- vcipher 16, 16, 19
- vcipher 17, 17, 19
- vcipher 18, 18, 19
-
- vcipher 15, 15, 20
- vcipher 16, 16, 20
- vcipher 17, 17, 20
- vcipher 18, 18, 20
-
- vcipher 15, 15, 21
- vcipher 16, 16, 21
- vcipher 17, 17, 21
- vcipher 18, 18, 21
-
- vcipher 15, 15, 22
- vcipher 16, 16, 22
- vcipher 17, 17, 22
- vcipher 18, 18, 22
-
- xxlor 19+32, 5, 5
- xxlor 20+32, 6, 6
- xxlor 21+32, 7, 7
- xxlor 22+32, 8, 8
-
- vcipher 15, 15, 19
- vcipher 16, 16, 19
- vcipher 17, 17, 19
- vcipher 18, 18, 19
-
- vcipher 15, 15, 20
- vcipher 16, 16, 20
- vcipher 17, 17, 20
- vcipher 18, 18, 20
-
- vcipher 15, 15, 21
- vcipher 16, 16, 21
- vcipher 17, 17, 21
- vcipher 18, 18, 21
-
- vcipher 15, 15, 22
- vcipher 16, 16, 22
- vcipher 17, 17, 22
- vcipher 18, 18, 22
-
- xxlor 23+32, 9, 9
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
+.macro SAVE_GPR GPR OFFSET FRAME
+ std \GPR,\OFFSET(\FRAME)
.endm
- # 8x loops
- # v15 - v22 - input states
- # vs1 - vs9 - round keys
- #
-.macro Loop_aes_middle8x
- xxlor 23+32, 1, 1
- xxlor 24+32, 2, 2
- xxlor 25+32, 3, 3
- xxlor 26+32, 4, 4
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- vcipher 15, 15, 25
- vcipher 16, 16, 25
- vcipher 17, 17, 25
- vcipher 18, 18, 25
- vcipher 19, 19, 25
- vcipher 20, 20, 25
- vcipher 21, 21, 25
- vcipher 22, 22, 25
-
- vcipher 15, 15, 26
- vcipher 16, 16, 26
- vcipher 17, 17, 26
- vcipher 18, 18, 26
- vcipher 19, 19, 26
- vcipher 20, 20, 26
- vcipher 21, 21, 26
- vcipher 22, 22, 26
-
- xxlor 23+32, 5, 5
- xxlor 24+32, 6, 6
- xxlor 25+32, 7, 7
- xxlor 26+32, 8, 8
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- vcipher 15, 15, 25
- vcipher 16, 16, 25
- vcipher 17, 17, 25
- vcipher 18, 18, 25
- vcipher 19, 19, 25
- vcipher 20, 20, 25
- vcipher 21, 21, 25
- vcipher 22, 22, 25
-
- vcipher 15, 15, 26
- vcipher 16, 16, 26
- vcipher 17, 17, 26
- vcipher 18, 18, 26
- vcipher 19, 19, 26
- vcipher 20, 20, 26
- vcipher 21, 21, 26
- vcipher 22, 22, 26
-
- xxlor 23+32, 9, 9
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
+.macro SAVE_VRS VRS OFFSET FRAME
+ stxv \VRS+32, \OFFSET(\FRAME)
.endm
-.macro Loop_aes_middle_1x
- xxlor 19+32, 1, 1
- xxlor 20+32, 2, 2
- xxlor 21+32, 3, 3
- xxlor 22+32, 4, 4
-
- vcipher 15, 15, 19
- vcipher 15, 15, 20
- vcipher 15, 15, 21
- vcipher 15, 15, 22
-
- xxlor 19+32, 5, 5
- xxlor 20+32, 6, 6
- xxlor 21+32, 7, 7
- xxlor 22+32, 8, 8
-
- vcipher 15, 15, 19
- vcipher 15, 15, 20
- vcipher 15, 15, 21
- vcipher 15, 15, 22
-
- xxlor 19+32, 9, 9
- vcipher 15, 15, 19
+.macro RESTORE_GPR GPR OFFSET FRAME
+ ld \GPR,\OFFSET(\FRAME)
.endm
- #
- # Compute 4x hash values based on Karatsuba method.
- #
-.macro ppc_aes_gcm_ghash
- vxor 15, 15, 0
-
- vpmsumd 23, 12, 15 # H4.L * X.L
- vpmsumd 24, 9, 16
- vpmsumd 25, 6, 17
- vpmsumd 26, 3, 18
-
- vxor 23, 23, 24
- vxor 23, 23, 25
- vxor 23, 23, 26 # L
-
- vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
- vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
- vpmsumd 26, 7, 17
- vpmsumd 27, 4, 18
-
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27 # M
-
- # sum hash and reduction with H Poly
- vpmsumd 28, 23, 2 # reduction
-
- vxor 29, 29, 29
- vsldoi 26, 24, 29, 8 # mL
- vsldoi 29, 29, 24, 8 # mH
- vxor 23, 23, 26 # mL + L
-
- vsldoi 23, 23, 23, 8 # swap
- vxor 23, 23, 28
-
- vpmsumd 24, 14, 15 # H4.H * X.H
- vpmsumd 25, 11, 16
- vpmsumd 26, 8, 17
- vpmsumd 27, 5, 18
-
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27
-
- vxor 24, 24, 29
-
- # sum hash and reduction with H Poly
- vsldoi 27, 23, 23, 8 # swap
- vpmsumd 23, 23, 2
- vxor 27, 27, 24
- vxor 23, 23, 27
-
- xxlor 32, 23+32, 23+32 # update hash
-
+.macro RESTORE_VRS VRS OFFSET FRAME
+ lxv \VRS+32, \OFFSET(\FRAME)
.endm
- #
- # Combine two 4x ghash
- # v15 - v22 - input blocks
- #
-.macro ppc_aes_gcm_ghash2_4x
- # first 4x hash
- vxor 15, 15, 0 # Xi + X
-
- vpmsumd 23, 12, 15 # H4.L * X.L
- vpmsumd 24, 9, 16
- vpmsumd 25, 6, 17
- vpmsumd 26, 3, 18
-
- vxor 23, 23, 24
- vxor 23, 23, 25
- vxor 23, 23, 26 # L
-
- vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
- vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
- vpmsumd 26, 7, 17
- vpmsumd 27, 4, 18
-
- vxor 24, 24, 25
- vxor 24, 24, 26
-
- # sum hash and reduction with H Poly
- vpmsumd 28, 23, 2 # reduction
-
- vxor 29, 29, 29
-
- vxor 24, 24, 27 # M
- vsldoi 26, 24, 29, 8 # mL
- vsldoi 29, 29, 24, 8 # mH
- vxor 23, 23, 26 # mL + L
-
- vsldoi 23, 23, 23, 8 # swap
- vxor 23, 23, 28
+.macro SAVE_REGS
+ mflr 0
+ std 0, 16(1)
+ stdu 1,-512(1)
+
+ SAVE_GPR 14, 112, 1
+ SAVE_GPR 15, 120, 1
+ SAVE_GPR 16, 128, 1
+ SAVE_GPR 17, 136, 1
+ SAVE_GPR 18, 144, 1
+ SAVE_GPR 19, 152, 1
+ SAVE_GPR 20, 160, 1
+ SAVE_GPR 21, 168, 1
+ SAVE_GPR 22, 176, 1
+ SAVE_GPR 23, 184, 1
+ SAVE_GPR 24, 192, 1
+
+ addi 9, 1, 256
+ SAVE_VRS 20, 0, 9
+ SAVE_VRS 21, 16, 9
+ SAVE_VRS 22, 32, 9
+ SAVE_VRS 23, 48, 9
+ SAVE_VRS 24, 64, 9
+ SAVE_VRS 25, 80, 9
+ SAVE_VRS 26, 96, 9
+ SAVE_VRS 27, 112, 9
+ SAVE_VRS 28, 128, 9
+ SAVE_VRS 29, 144, 9
+ SAVE_VRS 30, 160, 9
+ SAVE_VRS 31, 176, 9
+.endm # SAVE_REGS
- vpmsumd 24, 14, 15 # H4.H * X.H
- vpmsumd 25, 11, 16
- vpmsumd 26, 8, 17
- vpmsumd 27, 5, 18
+.macro RESTORE_REGS
+ addi 9, 1, 256
+ RESTORE_VRS 20, 0, 9
+ RESTORE_VRS 21, 16, 9
+ RESTORE_VRS 22, 32, 9
+ RESTORE_VRS 23, 48, 9
+ RESTORE_VRS 24, 64, 9
+ RESTORE_VRS 25, 80, 9
+ RESTORE_VRS 26, 96, 9
+ RESTORE_VRS 27, 112, 9
+ RESTORE_VRS 28, 128, 9
+ RESTORE_VRS 29, 144, 9
+ RESTORE_VRS 30, 160, 9
+ RESTORE_VRS 31, 176, 9
+
+ RESTORE_GPR 14, 112, 1
+ RESTORE_GPR 15, 120, 1
+ RESTORE_GPR 16, 128, 1
+ RESTORE_GPR 17, 136, 1
+ RESTORE_GPR 18, 144, 1
+ RESTORE_GPR 19, 152, 1
+ RESTORE_GPR 20, 160, 1
+ RESTORE_GPR 21, 168, 1
+ RESTORE_GPR 22, 176, 1
+ RESTORE_GPR 23, 184, 1
+ RESTORE_GPR 24, 192, 1
+
+ addi 1, 1, 512
+ ld 0, 16(1)
+ mtlr 0
+.endm # RESTORE_REGS
+
+# 4x loops
+.macro AES_CIPHER_4x _VCIPHER ST r
+ \_VCIPHER \ST, \ST, \r
+ \_VCIPHER \ST+1, \ST+1, \r
+ \_VCIPHER \ST+2, \ST+2, \r
+ \_VCIPHER \ST+3, \ST+3, \r
+.endm
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27 # H
+# 8x loops
+.macro AES_CIPHER_8x _VCIPHER ST r
+ \_VCIPHER \ST, \ST, \r
+ \_VCIPHER \ST+1, \ST+1, \r
+ \_VCIPHER \ST+2, \ST+2, \r
+ \_VCIPHER \ST+3, \ST+3, \r
+ \_VCIPHER \ST+4, \ST+4, \r
+ \_VCIPHER \ST+5, \ST+5, \r
+ \_VCIPHER \ST+6, \ST+6, \r
+ \_VCIPHER \ST+7, \ST+7, \r
+.endm
- vxor 24, 24, 29 # H + mH
+.macro LOOP_8AES_STATE
+ xxlor 32+23, 1, 1
+ xxlor 32+24, 2, 2
+ xxlor 32+25, 3, 3
+ xxlor 32+26, 4, 4
+ AES_CIPHER_8x vcipher, 15, 23
+ AES_CIPHER_8x vcipher, 15, 24
+ AES_CIPHER_8x vcipher, 15, 25
+ AES_CIPHER_8x vcipher, 15, 26
+ xxlor 32+23, 5, 5
+ xxlor 32+24, 6, 6
+ xxlor 32+25, 7, 7
+ xxlor 32+26, 8, 8
+ AES_CIPHER_8x vcipher, 15, 23
+ AES_CIPHER_8x vcipher, 15, 24
+ AES_CIPHER_8x vcipher, 15, 25
+ AES_CIPHER_8x vcipher, 15, 26
+.endm
- # sum hash and reduction with H Poly
- vsldoi 27, 23, 23, 8 # swap
- vpmsumd 23, 23, 2
- vxor 27, 27, 24
- vxor 27, 23, 27 # 1st Xi
-
- # 2nd 4x hash
- vpmsumd 24, 9, 20
- vpmsumd 25, 6, 21
- vpmsumd 26, 3, 22
- vxor 19, 19, 27 # Xi + X
- vpmsumd 23, 12, 19 # H4.L * X.L
-
- vxor 23, 23, 24
- vxor 23, 23, 25
- vxor 23, 23, 26 # L
-
- vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L
- vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L
- vpmsumd 26, 7, 21
- vpmsumd 27, 4, 22
-
- vxor 24, 24, 25
- vxor 24, 24, 26
+#
+# PPC_GHASH4x(H, S1, S2, S3, S4): Compute 4x hash values based on Karatsuba method.
+# H: returning digest
+# S#: states
+#
+# S1 should xor with the previous digest
+#
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+# Scratch: v23 - v29
+#
+.macro PPC_GHASH4x H S1 S2 S3 S4
+
+ vpmsumd 23, 12, \S1 # H4.L * X.L
+ vpmsumd 24, 9, \S2
+ vpmsumd 25, 6, \S3
+ vpmsumd 26, 3, \S4
+
+ vpmsumd 27, 13, \S1 # H4.L * X.H + H4.H * X.L
+ vpmsumd 28, 10, \S2 # H3.L * X1.H + H3.H * X1.L
+
+ vxor 23, 23, 24
+ vxor 23, 23, 25
+ vxor 23, 23, 26 # L
+
+ vxor 24, 27, 28
+ vpmsumd 25, 7, \S3
+ vpmsumd 26, 4, \S4
+
+ vxor 24, 24, 25
+ vxor 24, 24, 26 # M
# sum hash and reduction with H Poly
- vpmsumd 28, 23, 2 # reduction
-
- vxor 29, 29, 29
+ vpmsumd 28, 23, 2 # reduction
- vxor 24, 24, 27 # M
- vsldoi 26, 24, 29, 8 # mL
- vsldoi 29, 29, 24, 8 # mH
- vxor 23, 23, 26 # mL + L
+ vxor 1, 1, 1
+ vsldoi 25, 24, 1, 8 # mL
+ vsldoi 1, 1, 24, 8 # mH
+ vxor 23, 23, 25 # mL + L
- vsldoi 23, 23, 23, 8 # swap
- vxor 23, 23, 28
+ # This performs swap and xor like,
+ # vsldoi 23, 23, 23, 8 # swap
+ # vxor 23, 23, 28
+ xxlor 32+25, 10, 10
+ vpermxor 23, 23, 28, 25
- vpmsumd 24, 14, 19 # H4.H * X.H
- vpmsumd 25, 11, 20
- vpmsumd 26, 8, 21
- vpmsumd 27, 5, 22
+ vpmsumd 26, 14, \S1 # H4.H * X.H
+ vpmsumd 27, 11, \S2
+ vpmsumd 28, 8, \S3
+ vpmsumd 29, 5, \S4
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27 # H
+ vxor 24, 26, 27
+ vxor 24, 24, 28
+ vxor 24, 24, 29
- vxor 24, 24, 29 # H + mH
+ vxor 24, 24, 1
# sum hash and reduction with H Poly
- vsldoi 27, 23, 23, 8 # swap
- vpmsumd 23, 23, 2
- vxor 27, 27, 24
- vxor 23, 23, 27
-
- xxlor 32, 23+32, 23+32 # update hash
-
+ vsldoi 25, 23, 23, 8 # swap
+ vpmsumd 23, 23, 2
+ vxor 27, 25, 24
+ vxor \H, 23, 27
.endm
- #
- # Compute update single hash
- #
-.macro ppc_update_hash_1x
- vxor 28, 28, 0
-
- vxor 19, 19, 19
+#
+# Compute update single ghash
+# scratch: v1, v22..v27
+#
+.macro PPC_GHASH1x H S1
- vpmsumd 22, 3, 28 # L
- vpmsumd 23, 4, 28 # M
- vpmsumd 24, 5, 28 # H
+ vxor 1, 1, 1
- vpmsumd 27, 22, 2 # reduction
+ vpmsumd 22, 3, \S1 # L
+ vpmsumd 23, 4, \S1 # M
+ vpmsumd 24, 5, \S1 # H
- vsldoi 25, 23, 19, 8 # mL
- vsldoi 26, 19, 23, 8 # mH
- vxor 22, 22, 25 # LL + LL
- vxor 24, 24, 26 # HH + HH
+ vpmsumd 27, 22, 2 # reduction
- vsldoi 22, 22, 22, 8 # swap
- vxor 22, 22, 27
+ vsldoi 25, 23, 1, 8 # mL
+ vsldoi 26, 1, 23, 8 # mH
+ vxor 22, 22, 25 # LL + LL
+ vxor 24, 24, 26 # HH + HH
- vsldoi 20, 22, 22, 8 # swap
- vpmsumd 22, 22, 2 # reduction
- vxor 20, 20, 24
- vxor 22, 22, 20
+ xxlor 32+25, 10, 10
+ vpermxor 22, 22, 27, 25
- vmr 0, 22 # update hash
-
-.endm
-
-.macro SAVE_REGS
- stdu 1,-640(1)
- mflr 0
-
- std 14,112(1)
- std 15,120(1)
- std 16,128(1)
- std 17,136(1)
- std 18,144(1)
- std 19,152(1)
- std 20,160(1)
- std 21,168(1)
- li 9, 256
- stvx 20, 9, 1
- addi 9, 9, 16
- stvx 21, 9, 1
- addi 9, 9, 16
- stvx 22, 9, 1
- addi 9, 9, 16
- stvx 23, 9, 1
- addi 9, 9, 16
- stvx 24, 9, 1
- addi 9, 9, 16
- stvx 25, 9, 1
- addi 9, 9, 16
- stvx 26, 9, 1
- addi 9, 9, 16
- stvx 27, 9, 1
- addi 9, 9, 16
- stvx 28, 9, 1
- addi 9, 9, 16
- stvx 29, 9, 1
- addi 9, 9, 16
- stvx 30, 9, 1
- addi 9, 9, 16
- stvx 31, 9, 1
- stxv 14, 464(1)
- stxv 15, 480(1)
- stxv 16, 496(1)
- stxv 17, 512(1)
- stxv 18, 528(1)
- stxv 19, 544(1)
- stxv 20, 560(1)
- stxv 21, 576(1)
- stxv 22, 592(1)
- std 0, 656(1)
-.endm
-
-.macro RESTORE_REGS
- lxv 14, 464(1)
- lxv 15, 480(1)
- lxv 16, 496(1)
- lxv 17, 512(1)
- lxv 18, 528(1)
- lxv 19, 544(1)
- lxv 20, 560(1)
- lxv 21, 576(1)
- lxv 22, 592(1)
- li 9, 256
- lvx 20, 9, 1
- addi 9, 9, 16
- lvx 21, 9, 1
- addi 9, 9, 16
- lvx 22, 9, 1
- addi 9, 9, 16
- lvx 23, 9, 1
- addi 9, 9, 16
- lvx 24, 9, 1
- addi 9, 9, 16
- lvx 25, 9, 1
- addi 9, 9, 16
- lvx 26, 9, 1
- addi 9, 9, 16
- lvx 27, 9, 1
- addi 9, 9, 16
- lvx 28, 9, 1
- addi 9, 9, 16
- lvx 29, 9, 1
- addi 9, 9, 16
- lvx 30, 9, 1
- addi 9, 9, 16
- lvx 31, 9, 1
-
- ld 0, 656(1)
- ld 14,112(1)
- ld 15,120(1)
- ld 16,128(1)
- ld 17,136(1)
- ld 18,144(1)
- ld 19,152(1)
- ld 20,160(1)
- ld 21,168(1)
-
- mtlr 0
- addi 1, 1, 640
+ vsldoi 23, 22, 22, 8 # swap
+ vpmsumd 22, 22, 2 # reduction
+ vxor 23, 23, 24
+ vxor \H, 22, 23
.endm
+#
+# LOAD_HASH_TABLE
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+#
.macro LOAD_HASH_TABLE
# Load Xi
lxvb16x 32, 0, 8 # load Xi
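
One notable change inside PPC_GHASH4x and PPC_GHASH1x: the old vsldoi-plus-vxor pair is collapsed into a single vpermxor against the constant kept in vs10 (loaded from permx in the entry points further down). A scalar model of that fold, assuming the permute constant encodes a doubleword swap (a sketch of the intent, not the exact vector semantics):

	#include <stdint.h>

	struct u128 { uint64_t hi, lo; };

	/* Scalar model of "vsldoi x,x,x,8; vxor x,x,r" -- the doubleword
	 * swap plus xor that the single vpermxor now performs. */
	static struct u128 swap_xor(struct u128 x, struct u128 r)
	{
		struct u128 out = { .hi = x.lo ^ r.hi, .lo = x.hi ^ r.lo };
		return out;
	}
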
@@ -557,657 +298,434 @@
lxvd2x 14+32, 10, 8 # H^4h
.endm
- #
- # aes_p10_gcm_encrypt (const void *inp, void *out, size_t len,
- # const char *rk, unsigned char iv[16], void *Xip);
- #
- # r3 - inp
- # r4 - out
- # r5 - len
- # r6 - AES round keys
- # r7 - iv and other data
- # r8 - Xi, HPoli, hash keys
- #
- # rounds is at offset 240 in rk
- # Xi is at 0 in gcm_table (Xip).
- #
-_GLOBAL(aes_p10_gcm_encrypt)
-.align 5
-
- SAVE_REGS
-
- LOAD_HASH_TABLE
-
- # initialize ICB: GHASH( IV ), IV - r7
- lxvb16x 30+32, 0, 7 # load IV - v30
-
- mr 12, 5 # length
- li 11, 0 # block index
-
- # counter 1
- vxor 31, 31, 31
- vspltisb 22, 1
- vsldoi 31, 31, 22,1 # counter 1
-
- # load round key to VSR
- lxv 0, 0(6)
- lxv 1, 0x10(6)
- lxv 2, 0x20(6)
- lxv 3, 0x30(6)
- lxv 4, 0x40(6)
- lxv 5, 0x50(6)
- lxv 6, 0x60(6)
- lxv 7, 0x70(6)
- lxv 8, 0x80(6)
- lxv 9, 0x90(6)
- lxv 10, 0xa0(6)
-
- # load rounds - 10 (128), 12 (192), 14 (256)
- lwz 9,240(6)
-
- #
- # vxor state, state, w # addroundkey
- xxlor 32+29, 0, 0
- vxor 15, 30, 29 # IV + round key - add round key 0
-
- cmpdi 9, 10
- beq Loop_aes_gcm_8x
-
- # load 2 more round keys (v11, v12)
- lxv 11, 0xb0(6)
- lxv 12, 0xc0(6)
-
- cmpdi 9, 12
- beq Loop_aes_gcm_8x
-
- # load 2 more round keys (v11, v12, v13, v14)
- lxv 13, 0xd0(6)
- lxv 14, 0xe0(6)
- cmpdi 9, 14
- beq Loop_aes_gcm_8x
-
- b aes_gcm_out
-
-.align 5
-Loop_aes_gcm_8x:
- mr 14, 3
- mr 9, 4
-
- #
- # check partial block
- #
-Continue_partial_check:
- ld 15, 56(7)
- cmpdi 15, 0
- beq Continue
- bgt Final_block
- cmpdi 15, 16
- blt Final_block
-
-Continue:
- # n blcoks
- li 10, 128
- divdu 10, 12, 10 # n 128 bytes-blocks
- cmpdi 10, 0
- beq Loop_last_block
-
- vaddudm 30, 30, 31 # IV + counter
- vxor 16, 30, 29
- vaddudm 30, 30, 31
- vxor 17, 30, 29
- vaddudm 30, 30, 31
- vxor 18, 30, 29
- vaddudm 30, 30, 31
- vxor 19, 30, 29
- vaddudm 30, 30, 31
- vxor 20, 30, 29
- vaddudm 30, 30, 31
- vxor 21, 30, 29
- vaddudm 30, 30, 31
- vxor 22, 30, 29
-
- mtctr 10
-
- li 15, 16
- li 16, 32
- li 17, 48
- li 18, 64
- li 19, 80
- li 20, 96
- li 21, 112
-
- lwz 10, 240(6)
-
-Loop_8x_block:
-
- lxvb16x 15, 0, 14 # load block
- lxvb16x 16, 15, 14 # load block
- lxvb16x 17, 16, 14 # load block
- lxvb16x 18, 17, 14 # load block
- lxvb16x 19, 18, 14 # load block
- lxvb16x 20, 19, 14 # load block
- lxvb16x 21, 20, 14 # load block
- lxvb16x 22, 21, 14 # load block
- addi 14, 14, 128
-
- Loop_aes_middle8x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_next_ghash
-
- # 192 bits
- xxlor 24+32, 11, 11
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_next_ghash
-
- # 256 bits
- xxlor 24+32, 13, 13
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_ghash
- b aes_gcm_out
-
-Do_next_ghash:
-
- #
- # last round
- vcipherlast 15, 15, 23
- vcipherlast 16, 16, 23
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9 # store output
- xxlxor 48, 48, 16
- stxvb16x 48, 15, 9 # store output
-
- vcipherlast 17, 17, 23
- vcipherlast 18, 18, 23
-
- xxlxor 49, 49, 17
- stxvb16x 49, 16, 9 # store output
- xxlxor 50, 50, 18
- stxvb16x 50, 17, 9 # store output
-
- vcipherlast 19, 19, 23
- vcipherlast 20, 20, 23
-
- xxlxor 51, 51, 19
- stxvb16x 51, 18, 9 # store output
- xxlxor 52, 52, 20
- stxvb16x 52, 19, 9 # store output
-
- vcipherlast 21, 21, 23
- vcipherlast 22, 22, 23
-
- xxlxor 53, 53, 21
- stxvb16x 53, 20, 9 # store output
- xxlxor 54, 54, 22
- stxvb16x 54, 21, 9 # store output
-
- addi 9, 9, 128
-
- # ghash here
- ppc_aes_gcm_ghash2_4x
-
- xxlor 27+32, 0, 0
- vaddudm 30, 30, 31 # IV + counter
- vmr 29, 30
- vxor 15, 30, 27 # add round key
- vaddudm 30, 30, 31
- vxor 16, 30, 27
- vaddudm 30, 30, 31
- vxor 17, 30, 27
- vaddudm 30, 30, 31
- vxor 18, 30, 27
- vaddudm 30, 30, 31
- vxor 19, 30, 27
- vaddudm 30, 30, 31
- vxor 20, 30, 27
- vaddudm 30, 30, 31
- vxor 21, 30, 27
- vaddudm 30, 30, 31
- vxor 22, 30, 27
-
- addi 12, 12, -128
- addi 11, 11, 128
-
- bdnz Loop_8x_block
-
- vmr 30, 29
- stxvb16x 30+32, 0, 7 # update IV
-
-Loop_last_block:
- cmpdi 12, 0
- beq aes_gcm_out
-
- # loop last few blocks
+################################################################################
+# Compute AES and ghash one block at a time.
+# r23: AES rounds
+# v30: current IV
+# vs0: roundkey 0
+#
+################################################################################
+SYM_FUNC_START_LOCAL(aes_gcm_crypt_1x)
+
+ cmpdi 5, 16
+ bge __More_1x
+ blr
+__More_1x:
li 10, 16
- divdu 10, 12, 10
-
- mtctr 10
-
- lwz 10, 240(6)
-
- cmpdi 12, 16
- blt Final_block
-
-Next_rem_block:
- lxvb16x 15, 0, 14 # load block
-
- Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_next_1x
-
- # 192 bits
- xxlor 24+32, 11, 11
-
- vcipher 15, 15, 23
- vcipher 15, 15, 24
-
- xxlor 23+32, 12, 12
+ divdu 12, 5, 10
+
+ xxlxor 32+15, 32+30, 0
+
+ # Pre-load 8 AES rounds to scratch vectors.
+ xxlor 32+16, 1, 1
+ xxlor 32+17, 2, 2
+ xxlor 32+18, 3, 3
+ xxlor 32+19, 4, 4
+ xxlor 32+20, 5, 5
+ xxlor 32+21, 6, 6
+ xxlor 32+28, 7, 7
+ xxlor 32+29, 8, 8
+ lwz 23, 240(6) # n rounds
+ addi 22, 23, -9 # remaining AES rounds
- cmpdi 10, 12
- beq Do_next_1x
-
- # 256 bits
- xxlor 24+32, 13, 13
-
- vcipher 15, 15, 23
- vcipher 15, 15, 24
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_1x
-
-Do_next_1x:
- vcipherlast 15, 15, 23
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9 # store output
- addi 14, 14, 16
- addi 9, 9, 16
-
- vmr 28, 15
- ppc_update_hash_1x
-
- addi 12, 12, -16
- addi 11, 11, 16
- xxlor 19+32, 0, 0
- vaddudm 30, 30, 31 # IV + counter
- vxor 15, 30, 19 # add round key
-
- bdnz Next_rem_block
-
- li 15, 0
- std 15, 56(7) # clear partial?
- stxvb16x 30+32, 0, 7 # update IV
cmpdi 12, 0
- beq aes_gcm_out
-
-Final_block:
- lwz 10, 240(6)
- Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_final_1x
-
- # 192 bits
- xxlor 24+32, 11, 11
-
- vcipher 15, 15, 23
- vcipher 15, 15, 24
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_final_1x
-
- # 256 bits
- xxlor 24+32, 13, 13
-
- vcipher 15, 15, 23
- vcipher 15, 15, 24
+ bgt __Loop_1x
+ blr
- xxlor 23+32, 14, 14
+__Loop_1x:
+ mtctr 22
+ addi 10, 6, 144
+ vcipher 15, 15, 16
+ vcipher 15, 15, 17
+ vcipher 15, 15, 18
+ vcipher 15, 15, 19
+ vcipher 15, 15, 20
+ vcipher 15, 15, 21
+ vcipher 15, 15, 28
+ vcipher 15, 15, 29
- cmpdi 10, 14
- beq Do_final_1x
+__Loop_aes_1state:
+ lxv 32+1, 0(10)
+ vcipher 15, 15, 1
+ addi 10, 10, 16
+ bdnz __Loop_aes_1state
+ lxv 32+1, 0(10) # last round key
+ lxvb16x 11, 0, 14 # load input block
+ vcipherlast 15, 15, 1
+
+ xxlxor 32+15, 32+15, 11
+ stxvb16x 32+15, 0, 9 # store output
+ addi 14, 14, 16
+ addi 9, 9, 16
-Do_final_1x:
- vcipherlast 15, 15, 23
+ cmpdi 24, 0 # decrypt?
+ bne __Encrypt_1x
+ xxlor 15+32, 11, 11
+__Encrypt_1x:
+ vxor 15, 15, 0
+ PPC_GHASH1x 0, 15
- # check partial block
- li 21, 0 # encrypt
- ld 15, 56(7) # partial?
- cmpdi 15, 0
- beq Normal_block
- bl Do_partial_block
+ addi 5, 5, -16
+ addi 11, 11, 16
+ vadduwm 30, 30, 31 # IV + counter
+ xxlxor 32+15, 32+30, 0
+ addi 12, 12, -1
cmpdi 12, 0
- ble aes_gcm_out
+ bgt __Loop_1x
- b Continue_partial_check
-
-Normal_block:
- lxvb16x 15, 0, 14 # load last block
- xxlxor 47, 47, 15
-
- # create partial block mask
- li 15, 16
- sub 15, 15, 12 # index to the mask
-
- vspltisb 16, -1 # first 16 bytes - 0xffff...ff
- vspltisb 17, 0 # second 16 bytes - 0x0000...00
- li 10, 192
- stvx 16, 10, 1
+ stxvb16x 32+30, 0, 7 # update IV
+ stxvb16x 32+0, 0, 8 # update Xi
+ blr
+SYM_FUNC_END(aes_gcm_crypt_1x)
+
+################################################################################
+# Process a normal partial block when we come here.
+# Compute partial mask, Load and store partial block to stack.
+# Update partial_len and pblock.
+# pblock is (encrypted ^ AES state) for encrypt
+# and (input ^ AES state) for decrypt.
+#
+################################################################################
+SYM_FUNC_START_LOCAL(__Process_partial)
+
+ # create partial mask
+ vspltisb 16, -1
+ li 12, 16
+ sub 12, 12, 5
+ sldi 12, 12, 3
+ mtvsrdd 32+17, 0, 12
+ vslo 16, 16, 17 # partial block mask
+
+ lxvb16x 11, 0, 14 # load partial block
+ xxland 11, 11, 32+16
+
+ # AES crypt partial
+ xxlxor 32+15, 32+30, 0
+ lwz 23, 240(6) # n rounds
+ addi 22, 23, -1 # loop - 1
+ mtctr 22
+ addi 10, 6, 16
+
+__Loop_aes_pstate:
+ lxv 32+1, 0(10)
+ vcipher 15, 15, 1
addi 10, 10, 16
- stvx 17, 10, 1
-
- addi 10, 1, 192
- lxvb16x 16, 15, 10 # load partial block mask
- xxland 47, 47, 16
-
- vmr 28, 15
- ppc_update_hash_1x
+ bdnz __Loop_aes_pstate
+ lxv 32+1, 0(10) # last round key
+ vcipherlast 15, 15, 1
- # * should store only the remaining bytes.
- bl Write_partial_block
-
- stxvb16x 30+32, 0, 7 # update IV
- std 12, 56(7) # update partial?
- li 16, 16
+ xxlxor 32+15, 32+15, 11
+ vand 15, 15, 16
- stxvb16x 32, 0, 8 # write out Xi
- stxvb16x 32, 16, 8 # write out Xi
- b aes_gcm_out
-
- #
- # Compute data mask
- #
-.macro GEN_MASK _mask _start _end
- vspltisb 16, -1 # first 16 bytes - 0xffff...ff
- vspltisb 17, 0 # second 16 bytes - 0x0000...00
- li 10, 192
- stxvb16x 17+32, 10, 1
- add 10, 10, \_start
- stxvb16x 16+32, 10, 1
- add 10, 10, \_end
- stxvb16x 17+32, 10, 1
-
- addi 10, 1, 192
- lxvb16x \_mask, 0, 10 # load partial block mask
-.endm
+ # AES crypt output v15
+ # Write partial
+ li 10, 224
+ stxvb16x 15+32, 10, 1 # write v15 to stack
+ addi 10, 1, 223
+ addi 12, 9, -1
+ mtctr 5 # partial block len
+__Write_partial:
+ lbzu 22, 1(10)
+ stbu 22, 1(12)
+ bdnz __Write_partial
+
+ cmpdi 24, 0 # decrypt?
+ bne __Encrypt_partial
+ xxlor 32+15, 11, 11 # decrypt using the input block
+__Encrypt_partial:
+ #vxor 15, 15, 0 # ^ previous hash
+ #PPC_GHASH1x 0, 15
+
+ add 14, 14, 5
+ add 9, 9, 5
+ std 5, 56(7) # update partial
+ sub 11, 11, 5
+ li 5, 0 # done last byte
- #
- # Handle multiple partial blocks for encrypt and decrypt
- # operations.
- #
-SYM_FUNC_START_LOCAL(Do_partial_block)
- add 17, 15, 5
- cmpdi 17, 16
- bgt Big_block
- GEN_MASK 18, 15, 5
- b _Partial
-SYM_FUNC_END(Do_partial_block)
-Big_block:
+ #
+ # Don't increase IV since this is the last partial.
+ # It should get updated in gcm_update if no more data blocks.
+ #vadduwm 30, 30, 31 # increase IV
+ stxvb16x 32+30, 0, 7 # update IV
+ li 10, 64
+ stxvb16x 32+0, 0, 8 # update Xi
+ stxvb16x 32+15, 10, 7 # Update pblock
+ blr
+SYM_FUNC_END(__Process_partial)
+
+################################################################################
+# Combine partial blocks and ghash when we come here.
+#
+# The partial block has to be shifted to the right location to encrypt/decrypt
+# and compute ghash if combining with the previous partial block is needed.
+# - Compute ghash for a full block. Clear Partial_len and pblock. Update IV.
+# Write Xi.
+# - Don't compute ghash if not a full block. gcm_update will take care of it
+#   if it is the last block. Update Partial_len and pblock.
+#
+################################################################################
+SYM_FUNC_START_LOCAL(__Combine_partial)
+
+ ld 12, 56(7)
+ mr 21, 5 # these bytes to be processed
+
+ li 17, 0
li 16, 16
- GEN_MASK 18, 15, 16
-
-_Partial:
- lxvb16x 17+32, 0, 14 # load last block
- sldi 16, 15, 3
- mtvsrdd 32+16, 0, 16
- vsro 17, 17, 16
- xxlxor 47, 47, 17+32
- xxland 47, 47, 18
-
- vxor 0, 0, 0 # clear Xi
- vmr 28, 15
-
- cmpdi 21, 0 # encrypt/decrypt ops?
- beq Skip_decrypt
- xxland 32+28, 32+17, 18
-
-Skip_decrypt:
-
- ppc_update_hash_1x
+ sub 22, 16, 12 # bytes to complete a block
+ sub 17, 22, 5 # remaining bytes in a block
+ cmpdi 5, 16
+ ble __Inp_msg_less16
+ li 17, 0
+ mr 21, 22
+ b __Combine_continue
+__Inp_msg_less16:
+ cmpd 22, 5
+ bgt __Combine_continue
+ li 17, 0
+ mr 21, 22 # these bytes to be processed
+
+__Combine_continue:
+ # load msg and shift to the proper location and mask
+ vspltisb 16, -1
+ sldi 15, 12, 3
+ mtvsrdd 32+17, 0, 15
+ vslo 16, 16, 17
+ vsro 16, 16, 17
+ sldi 15, 17, 3
+ mtvsrdd 32+17, 0, 15
+ vsro 16, 16, 17
+ vslo 16, 16, 17 # mask
+
+ lxvb16x 32+19, 0, 14 # load partial block
+ sldi 15, 12, 3
+ mtvsrdd 32+17, 0, 15
+ vsro 19, 19, 17 # 0x00..xxxx??..??
+ sldi 15, 17, 3
+ mtvsrdd 32+17, 0, 15
+ vsro 19, 19, 17 # 0x00..xxxx
+ vslo 19, 19, 17 # shift back to form 0x00..xxxx00..00
+
+ # AES crypt partial
+ xxlxor 32+15, 32+30, 0
+ lwz 23, 240(6) # n rounds
+ addi 22, 23, -1 # loop - 1
+ mtctr 22
+ addi 10, 6, 16
+
+__Loop_aes_cpstate:
+ lxv 32+1, 0(10)
+ vcipher 15, 15, 1
+ addi 10, 10, 16
+ bdnz __Loop_aes_cpstate
+ lxv 32+1, 0(10) # last round key
+ vcipherlast 15, 15, 1
- li 16, 16
- lxvb16x 32+29, 16, 8
- vxor 0, 0, 29
- stxvb16x 32, 0, 8 # save Xi
- stxvb16x 32, 16, 8 # save Xi
-
- # store partial block
- # loop the rest of the stream if any
- sldi 16, 15, 3
- mtvsrdd 32+16, 0, 16
- vslo 15, 15, 16
- #stxvb16x 15+32, 0, 9 # last block
+ vxor 15, 15, 19
+ vand 15, 15, 16
- li 16, 16
- sub 17, 16, 15 # 16 - partial
-
- add 16, 15, 5
- cmpdi 16, 16
- bgt Larger_16
- mr 17, 5
-Larger_16:
-
- # write partial
- li 10, 192
- stxvb16x 15+32, 10, 1 # save current block
-
- addi 10, 9, -1
- addi 16, 1, 191
- mtctr 17 # move partial byte count
-
-Write_last_partial:
- lbzu 18, 1(16)
- stbu 18, 1(10)
- bdnz Write_last_partial
- # Complete loop partial
-
- add 14, 14, 17
- add 9, 9, 17
- sub 12, 12, 17
- add 11, 11, 17
-
- add 15, 15, 5
- cmpdi 15, 16
- blt Save_partial
-
- vaddudm 30, 30, 31
- stxvb16x 30+32, 0, 7 # update IV
- xxlor 32+29, 0, 0
- vxor 15, 30, 29 # IV + round key - add round key 0
- li 15, 0
- std 15, 56(7) # partial done - clear
- b Partial_done
-Save_partial:
- std 15, 56(7) # partial
-
-Partial_done:
+ # AES crypt output v15
+ # Write partial
+ li 10, 224
+ stxvb16x 15+32, 10, 1 # write v15 to stack
+ addi 10, 1, 223
+ add 10, 10, 12 # add offset
+ addi 15, 9, -1
+ mtctr 21 # partial block len
+__Write_combine_partial:
+ lbzu 22, 1(10)
+ stbu 22, 1(15)
+ bdnz __Write_combine_partial
+
+ add 14, 14, 21
+ add 11, 11, 21
+ add 9, 9, 21
+ sub 5, 5, 21
+
+ # Encrypt/Decrypt?
+ cmpdi 24, 0 # decrypt?
+ bne __Encrypt_combine_partial
+ vmr 15, 19 # decrypt using the input block
+
+__Encrypt_combine_partial:
+ #
+ # Update partial flag and combine ghash.
+__Update_partial_ghash:
+ li 10, 64
+ lxvb16x 32+17, 10, 7 # load previous pblock
+ add 12, 12, 21 # combined bytes processed
+ vxor 15, 15, 17 # combined pblock
+
+ cmpdi 12, 16
+ beq __Clear_partial_flag
+ std 12, 56(7) # update partial len
+ stxvb16x 32+15, 10, 7 # Update current pblock
blr
- #
- # Write partial block
- # r9 - output
- # r12 - remaining bytes
- # v15 - partial input data
- #
-SYM_FUNC_START_LOCAL(Write_partial_block)
- li 10, 192
- stxvb16x 15+32, 10, 1 # last block
-
- addi 10, 9, -1
- addi 16, 1, 191
-
- mtctr 12 # remaining bytes
- li 15, 0
-
-Write_last_byte:
- lbzu 14, 1(16)
- stbu 14, 1(10)
- bdnz Write_last_byte
+__Clear_partial_flag:
+ li 12, 0
+ std 12, 56(7)
+ # Update IV and ghash here
+ vadduwm 30, 30, 31 # increase IV
+ stxvb16x 32+30, 0, 7 # update IV
+
+ # v15 is either (input block or encrypted block) ^ (AES state)
+ vxor 15, 15, 0
+ PPC_GHASH1x 0, 15
+ stxvb16x 32+0, 10, 7 # update pblock for debug?
+ stxvb16x 32+0, 0, 8 # update Xi
blr
-SYM_FUNC_END(Write_partial_block)
+SYM_FUNC_END(__Combine_partial)
-aes_gcm_out:
- # out = state
- stxvb16x 32, 0, 8 # write out Xi
- add 3, 11, 12 # return count
+################################################################################
+# gcm_update(iv, Xi) - compute last hash
+#
+################################################################################
+SYM_FUNC_START(gcm_update)
- RESTORE_REGS
- blr
+ ld 10, 56(3)
+ cmpdi 10, 0
+ beq __no_update
- #
- # 8x Decrypt
- #
-_GLOBAL(aes_p10_gcm_decrypt)
-.align 5
+ lxvb16x 32, 0, 4 # load Xi
+ # load Hash - h^4, h^3, h^2, h
+ li 10, 32
+ lxvd2x 2+32, 10, 4 # H Poli
+ li 10, 48
+ lxvd2x 3+32, 10, 4 # Hl
+ li 10, 64
+ lxvd2x 4+32, 10, 4 # H
+ li 10, 80
+ lxvd2x 5+32, 10, 4 # Hh
+
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxv 10, 0(11) # vs10: vpermxor vector
+
+ li 9, 64
+ lxvb16x 32+6, 9, 3 # load pblock
+ vxor 6, 6, 0
+
+ vxor 1, 1, 1
+ vpmsumd 12, 3, 6 # L
+ vpmsumd 13, 4, 6 # M
+ vpmsumd 14, 5, 6 # H
+ vpmsumd 17, 12, 2 # reduction
+ vsldoi 15, 13, 1, 8 # mL
+ vsldoi 16, 1, 13, 8 # mH
+ vxor 12, 12, 15 # LL + LL
+ vxor 14, 14, 16 # HH + HH
+ xxlor 32+15, 10, 10
+ vpermxor 12, 12, 17, 15
+ vsldoi 13, 12, 12, 8 # swap
+ vpmsumd 12, 12, 2 # reduction
+ vxor 13, 13, 14
+ vxor 7, 12, 13
+
+ #vxor 0, 0, 0
+ #stxvb16x 32+0, 9, 3
+ li 10, 0
+ std 10, 56(3)
+ stxvb16x 32+7, 0, 4
+
+__no_update:
+ blr
+SYM_FUNC_END(gcm_update)
+
+################################################################################
+# aes_p10_gcm_encrypt (const void *inp, void *out, size_t len,
+# const char *rk, unsigned char iv[16], void *Xip);
+#
+# r3 - inp
+# r4 - out
+# r5 - len
+# r6 - AES round keys
+# r7 - iv and other data
+# r8 - Xi, HPoli, hash keys
+#
+# rounds is at offset 240 in rk
+# Xi is at 0 in gcm_table (Xip).
+#
+################################################################################
+SYM_FUNC_START(aes_p10_gcm_encrypt)
+
+ cmpdi 5, 0
+ ble __Invalid_msg_len
SAVE_REGS
-
LOAD_HASH_TABLE
# initialize ICB: GHASH( IV ), IV - r7
lxvb16x 30+32, 0, 7 # load IV - v30
- mr 12, 5 # length
- li 11, 0 # block index
+ mr 14, 3
+ mr 9, 4
# counter 1
vxor 31, 31, 31
vspltisb 22, 1
vsldoi 31, 31, 22,1 # counter 1
- # load round key to VSR
- lxv 0, 0(6)
- lxv 1, 0x10(6)
- lxv 2, 0x20(6)
- lxv 3, 0x30(6)
- lxv 4, 0x40(6)
- lxv 5, 0x50(6)
- lxv 6, 0x60(6)
- lxv 7, 0x70(6)
- lxv 8, 0x80(6)
- lxv 9, 0x90(6)
- lxv 10, 0xa0(6)
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxv 10, 0(11) # vs10: vpermxor vector
+ li 11, 0
+
+ # load 9 round keys to VSR
+ lxv 0, 0(6) # round key 0
+ lxv 1, 16(6) # round key 1
+ lxv 2, 32(6) # round key 2
+ lxv 3, 48(6) # round key 3
+ lxv 4, 64(6) # round key 4
+ lxv 5, 80(6) # round key 5
+ lxv 6, 96(6) # round key 6
+ lxv 7, 112(6) # round key 7
+ lxv 8, 128(6) # round key 8
# load rounds - 10 (128), 12 (192), 14 (256)
- lwz 9,240(6)
+ lwz 23, 240(6) # n rounds
+ li 24, 1 # encrypt
+__Process_encrypt:
#
- # vxor state, state, w # addroundkey
- xxlor 32+29, 0, 0
- vxor 15, 30, 29 # IV + round key - add round key 0
-
- cmpdi 9, 10
- beq Loop_aes_gcm_8x_dec
-
- # load 2 more round keys (v11, v12)
- lxv 11, 0xb0(6)
- lxv 12, 0xc0(6)
-
- cmpdi 9, 12
- beq Loop_aes_gcm_8x_dec
-
- # load 2 more round keys (v11, v12, v13, v14)
- lxv 13, 0xd0(6)
- lxv 14, 0xe0(6)
- cmpdi 9, 14
- beq Loop_aes_gcm_8x_dec
+ # Process different blocks
+ #
+ ld 12, 56(7)
+ cmpdi 12, 0
+ bgt __Do_combine_enc
+ cmpdi 5, 128
+ blt __Process_more_enc
+
+#
+# Process 8x AES/GCM blocks
+#
+__Process_8x_enc:
+ # 8x blocks
+ li 10, 128
+ divdu 12, 5, 10 # n 128 bytes-blocks
- b aes_gcm_out
+ addi 12, 12, -1 # loop - 1
-.align 5
-Loop_aes_gcm_8x_dec:
- mr 14, 3
- mr 9, 4
+ vmr 15, 30 # first state: IV
+ vadduwm 16, 15, 31 # state + counter
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
- #
- # check partial block
- #
-Continue_partial_check_dec:
- ld 15, 56(7)
- cmpdi 15, 0
- beq Continue_dec
- bgt Final_block_dec
- cmpdi 15, 16
- blt Final_block_dec
-
-Continue_dec:
- # n blcoks
- li 10, 128
- divdu 10, 12, 10 # n 128 bytes-blocks
- cmpdi 10, 0
- beq Loop_last_block_dec
-
- vaddudm 30, 30, 31 # IV + counter
- vxor 16, 30, 29
- vaddudm 30, 30, 31
- vxor 17, 30, 29
- vaddudm 30, 30, 31
- vxor 18, 30, 29
- vaddudm 30, 30, 31
- vxor 19, 30, 29
- vaddudm 30, 30, 31
- vxor 20, 30, 29
- vaddudm 30, 30, 31
- vxor 21, 30, 29
- vaddudm 30, 30, 31
- vxor 22, 30, 29
-
- mtctr 10
+ # vxor state, state, w # addroundkey
+ xxlor 32+29, 0, 0
+ vxor 15, 15, 29 # IV + round key - add round key 0
+ vxor 16, 16, 29
+ vxor 17, 17, 29
+ vxor 18, 18, 29
+ vxor 19, 19, 29
+ vxor 20, 20, 29
+ vxor 21, 21, 29
+ vxor 22, 22, 29
li 15, 16
li 16, 32
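
Also worth noting in the rewritten entry points: counter stepping now uses vadduwm (vector add word) where the old code used vaddudm (add doubleword), which matches GCM's inc32 behaviour of incrementing only the low 32 bits of the counter block. A scalar model of inc32 as defined by NIST SP 800-38D, shown for reference:

	#include <stdint.h>

	/* inc32 per NIST SP 800-38D: only the last 32 bits of the 16-byte
	 * counter block increment (and wrap); the rest stays fixed. */
	static void inc32(uint8_t ctr[16])
	{
		uint32_t c = ((uint32_t)ctr[12] << 24) | ((uint32_t)ctr[13] << 16) |
			     ((uint32_t)ctr[14] << 8)  |  (uint32_t)ctr[15];

		c += 1;	/* wraps modulo 2^32 */
		ctr[12] = c >> 24;
		ctr[13] = c >> 16;
		ctr[14] = c >> 8;
		ctr[15] = c;
	}
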
@@ -1217,305 +735,502 @@ Continue_dec:
li 20, 96
li 21, 112
- lwz 10, 240(6)
-
-Loop_8x_block_dec:
-
- lxvb16x 15, 0, 14 # load block
- lxvb16x 16, 15, 14 # load block
- lxvb16x 17, 16, 14 # load block
- lxvb16x 18, 17, 14 # load block
- lxvb16x 19, 18, 14 # load block
- lxvb16x 20, 19, 14 # load block
- lxvb16x 21, 20, 14 # load block
- lxvb16x 22, 21, 14 # load block
- addi 14, 14, 128
-
- Loop_aes_middle8x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_next_ghash_dec
-
- # 192 bits
- xxlor 24+32, 11, 11
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_next_ghash_dec
-
- # 256 bits
- xxlor 24+32, 13, 13
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_ghash_dec
- b aes_gcm_out
+ #
+ # Run the first 8 AES rounds on all 8 states and leave 1/3/5 more
+ # rounds for the loop.
+ #
+ addi 22, 23, -9 # process 8 keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
-Do_next_ghash_dec:
+ LOOP_8AES_STATE # process 8 AES keys
- #
- # last round
- vcipherlast 15, 15, 23
- vcipherlast 16, 16, 23
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9 # store output
- xxlxor 48, 48, 16
- stxvb16x 48, 15, 9 # store output
-
- vcipherlast 17, 17, 23
- vcipherlast 18, 18, 23
-
- xxlxor 49, 49, 17
- stxvb16x 49, 16, 9 # store output
- xxlxor 50, 50, 18
- stxvb16x 50, 17, 9 # store output
-
- vcipherlast 19, 19, 23
- vcipherlast 20, 20, 23
-
- xxlxor 51, 51, 19
- stxvb16x 51, 18, 9 # store output
- xxlxor 52, 52, 20
- stxvb16x 52, 19, 9 # store output
-
- vcipherlast 21, 21, 23
- vcipherlast 22, 22, 23
-
- xxlxor 53, 53, 21
- stxvb16x 53, 20, 9 # store output
- xxlxor 54, 54, 22
- stxvb16x 54, 21, 9 # store output
-
- addi 9, 9, 128
-
- xxlor 15+32, 15, 15
- xxlor 16+32, 16, 16
- xxlor 17+32, 17, 17
- xxlor 18+32, 18, 18
- xxlor 19+32, 19, 19
- xxlor 20+32, 20, 20
- xxlor 21+32, 21, 21
- xxlor 22+32, 22, 22
+__PreLoop_aes_state:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x vcipher 15 1
+ addi 10, 10, 16
+ bdnz __PreLoop_aes_state
+ lxv 32+1, 0(10) # last round key (v1)
+
+ cmpdi 12, 0 # Only one loop (8 block)
+ beq __Finish_ghash
+
+#
+# Loop 8x blocks and compute ghash
+#
+__Loop_8x_block_enc:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
+ addi 14, 14, 128
+
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+ addi 9, 9, 128
# ghash here
- ppc_aes_gcm_ghash2_4x
-
- xxlor 27+32, 0, 0
- vaddudm 30, 30, 31 # IV + counter
- vmr 29, 30
- vxor 15, 30, 27 # add round key
- vaddudm 30, 30, 31
- vxor 16, 30, 27
- vaddudm 30, 30, 31
- vxor 17, 30, 27
- vaddudm 30, 30, 31
- vxor 18, 30, 27
- vaddudm 30, 30, 31
- vxor 19, 30, 27
- vaddudm 30, 30, 31
- vxor 20, 30, 27
- vaddudm 30, 30, 31
- vxor 21, 30, 27
- vaddudm 30, 30, 31
- vxor 22, 30, 27
-
- addi 12, 12, -128
+ vxor 15, 15, 0
+ PPC_GHASH4x 0, 15, 16, 17, 18
+
+ vxor 19, 19, 0
+ PPC_GHASH4x 0, 19, 20, 21, 22
+
+ xxlor 32+15, 9, 9 # last state
+ vadduwm 15, 15, 31 # state + counter
+ vadduwm 16, 15, 31
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ xxlor 32+27, 0, 0 # restore roundkey 0
+ vxor 15, 15, 27 # IV + round key - add round key 0
+ vxor 16, 16, 27
+ vxor 17, 17, 27
+ vxor 18, 18, 27
+ vxor 19, 19, 27
+ vxor 20, 20, 27
+ vxor 21, 21, 27
+ vxor 22, 22, 27
+
+ addi 5, 5, -128
addi 11, 11, 128
- bdnz Loop_8x_block_dec
-
- vmr 30, 29
- stxvb16x 30+32, 0, 7 # update IV
-
-Loop_last_block_dec:
- cmpdi 12, 0
- beq aes_gcm_out
-
- # loop last few blocks
- li 10, 16
- divdu 10, 12, 10
-
- mtctr 10
-
- lwz 10, 240(6)
-
- cmpdi 12, 16
- blt Final_block_dec
-
-Next_rem_block_dec:
- lxvb16x 15, 0, 14 # load block
-
- Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
+ LOOP_8AES_STATE # process 8 AES keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
+__LastLoop_aes_state:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x vcipher 15 1
+ addi 10, 10, 16
+ bdnz __LastLoop_aes_state
+ lxv 32+1, 0(10) # last round key (v1)
- cmpdi 10, 10
- beq Do_next_1x_dec
+ addi 12, 12, -1
+ cmpdi 12, 0
+ bne __Loop_8x_block_enc
+
+__Finish_ghash:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
+ addi 14, 14, 128
+
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+ addi 9, 9, 128
+
+ vxor 15, 15, 0
+ PPC_GHASH4x 0, 15, 16, 17, 18
+
+ vxor 19, 19, 0
+ PPC_GHASH4x 0, 19, 20, 21, 22
+
+ xxlor 30+32, 9, 9 # last ctr
+ vadduwm 30, 30, 31 # increase ctr
+ stxvb16x 32+30, 0, 7 # update IV
+ stxvb16x 32+0, 0, 8 # update Xi
+
+ addi 5, 5, -128
+ addi 11, 11, 128
- # 192 bits
- xxlor 24+32, 11, 11
+ #
+ # Done 8x blocks
+ #
- vcipher 15, 15, 23
- vcipher 15, 15, 24
+ cmpdi 5, 0
+ beq aes_gcm_out
- xxlor 23+32, 12, 12
+__Process_more_enc:
+ li 24, 1 # encrypt
+ bl aes_gcm_crypt_1x
+ cmpdi 5, 0
+ beq aes_gcm_out
- cmpdi 10, 12
- beq Do_next_1x_dec
+ bl __Process_partial
+ cmpdi 5, 0
+ beq aes_gcm_out
+__Do_combine_enc:
+ bl __Combine_partial
+ cmpdi 5, 0
+ bgt __Process_encrypt
+ b aes_gcm_out
- # 256 bits
- xxlor 24+32, 13, 13
+SYM_FUNC_END(aes_p10_gcm_encrypt)
- vcipher 15, 15, 23
- vcipher 15, 15, 24
+################################################################################
+# aes_p10_gcm_decrypt (const void *inp, void *out, size_t len,
+# const char *rk, unsigned char iv[16], void *Xip);
+# 8x Decrypt
+#
+################################################################################
+SYM_FUNC_START(aes_p10_gcm_decrypt)
- xxlor 23+32, 14, 14
+ cmpdi 5, 0
+ ble __Invalid_msg_len
- cmpdi 10, 14
- beq Do_next_1x_dec
+ SAVE_REGS
+ LOAD_HASH_TABLE
-Do_next_1x_dec:
- vcipherlast 15, 15, 23
+ # initialize ICB: GHASH( IV ), IV - r7
+ lxvb16x 30+32, 0, 7 # load IV - v30
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9 # store output
- addi 14, 14, 16
- addi 9, 9, 16
+ mr 14, 3
+ mr 9, 4
- xxlor 28+32, 15, 15
- #vmr 28, 15
- ppc_update_hash_1x
+ # counter 1
+ vxor 31, 31, 31
+ vspltisb 22, 1
+ vsldoi 31, 31, 22, 1 # counter 1
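The vsldoi sequence builds a vector that is all zeroes except for a 1 in the last byte; adding it with vadduwm (32-bit lanes) appears to implement GCM's inc32(): with blocks loaded big-endian via lxvb16x, only the rightmost 32 bits of the counter block are incremented. A scalar sketch of the same operation, assuming nothing beyond the GCM definition:

    #include <stdint.h>

    /* inc32(): bump the last 32 bits of the counter block as a
     * big-endian integer; the leading 96 bits are untouched. */
    static void inc32(uint8_t ctr[16])
    {
        uint32_t c = ((uint32_t)ctr[12] << 24) | ((uint32_t)ctr[13] << 16) |
                     ((uint32_t)ctr[14] << 8) | (uint32_t)ctr[15];

        c++;
        ctr[12] = (uint8_t)(c >> 24);
        ctr[13] = (uint8_t)(c >> 16);
        ctr[14] = (uint8_t)(c >> 8);
        ctr[15] = (uint8_t)c;
    }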
- addi 12, 12, -16
- addi 11, 11, 16
- xxlor 19+32, 0, 0
- vaddudm 30, 30, 31 # IV + counter
- vxor 15, 30, 19 # add round key
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxv 10, 0(11) # vs10: vpermxor vector
+ li 11, 0
+
+ # load 9 round keys to VSR
+ lxv 0, 0(6) # round key 0
+ lxv 1, 16(6) # round key 1
+ lxv 2, 32(6) # round key 2
+ lxv 3, 48(6) # round key 3
+ lxv 4, 64(6) # round key 4
+ lxv 5, 80(6) # round key 5
+ lxv 6, 96(6) # round key 6
+ lxv 7, 112(6) # round key 7
+ lxv 8, 128(6) # round key 8
- bdnz Next_rem_block_dec
+ # load rounds - 10 (128), 12 (192), 14 (256)
+ lwz 23, 240(6) # n rounds
+ li 24, 0 # decrypt
- li 15, 0
- std 15, 56(7) # clear partial?
- stxvb16x 30+32, 0, 7 # update IV
+__Process_decrypt:
+ #
+ # Process different blocks
+ #
+ ld 12, 56(7)
cmpdi 12, 0
- beq aes_gcm_out
-
-Final_block_dec:
- lwz 10, 240(6)
- Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_final_1x_dec
+ bgt __Do_combine_dec
+ cmpdi 5, 128
+ blt __Process_more_dec
+
+#
+# Process 8x AES/GCM blocks
+#
+__Process_8x_dec:
+ # 8x blocks
+ li 10, 128
+ divdu 12, 5, 10 # n 128-byte blocks
- # 192 bits
- xxlor 24+32, 11, 11
+ addi 12, 12, -1 # loop - 1
- vcipher 15, 15, 23
- vcipher 15, 15, 24
+ vmr 15, 30 # first state: IV
+ vadduwm 16, 15, 31 # state + counter
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
- xxlor 23+32, 12, 12
+ # vxor state, state, w # addroundkey
+ xxlor 32+29, 0, 0
+ vxor 15, 15, 29 # IV + round key - add round key 0
+ vxor 16, 16, 29
+ vxor 17, 17, 29
+ vxor 18, 18, 29
+ vxor 19, 19, 29
+ vxor 20, 20, 29
+ vxor 21, 21, 29
+ vxor 22, 22, 29
- cmpdi 10, 12
- beq Do_final_1x_dec
+ li 15, 16
+ li 16, 32
+ li 17, 48
+ li 18, 64
+ li 19, 80
+ li 20, 96
+ li 21, 112
- # 256 bits
- xxlor 24+32, 13, 13
+ #
+ # Pre-compute the first 8 AES rounds and leave 1/3/5 more
+ # rounds (AES-128/192/256) for the loop.
+ #
+ addi 22, 23, -9 # process 8 keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
- vcipher 15, 15, 23
- vcipher 15, 15, 24
+ LOOP_8AES_STATE # process 8 AES keys
- xxlor 23+32, 14, 14
+__PreLoop_aes_state_dec:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x vcipher 15 1
+ addi 10, 10, 16
+ bdnz __PreLoop_aes_state_dec
+ lxv 32+1, 0(10) # last round key (v1)
+
+ cmpdi 12, 0 # Only one loop (8 blocks)
+ beq __Finish_ghash_dec
+
+#
+# Loop 8x blocks and compute ghash
+#
+__Loop_8x_block_dec:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
+ addi 14, 14, 128
+
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+
+ addi 9, 9, 128
+
+ vmr 15, 23
+ vmr 16, 24
+ vmr 17, 25
+ vmr 18, 26
+ vmr 19, 27
+ vmr 20, 28
+ vmr 21, 29
+ vmr 22, 30
- cmpdi 10, 14
- beq Do_final_1x_dec
+ # ghash here
+ vxor 15, 15, 0
+ PPC_GHASH4x 0, 15, 16, 17, 18
+
+ vxor 19, 19, 0
+ PPC_GHASH4x 0, 19, 20, 21, 22
+
+ xxlor 32+15, 9, 9 # last state
+ vadduwm 15, 15, 31 # state + counter
+ vadduwm 16, 15, 31
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ xxlor 32+27, 0, 0 # restore roundkey 0
+ vxor 15, 15, 27 # IV + round key - add round key 0
+ vxor 16, 16, 27
+ vxor 17, 17, 27
+ vxor 18, 18, 27
+ vxor 19, 19, 27
+ vxor 20, 20, 27
+ vxor 21, 21, 27
+ vxor 22, 22, 27
+
+ addi 5, 5, -128
+ addi 11, 11, 128
-Do_final_1x_dec:
- vcipherlast 15, 15, 23
+ LOOP_8AES_STATE # process 8 AES keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
+__LastLoop_aes_state_dec:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x vcipher 15 1
+ addi 10, 10, 16
+ bdnz __LastLoop_aes_state_dec
+ lxv 32+1, 0(10) # last round key (v1)
- # check partial block
- li 21, 1 # decrypt
- ld 15, 56(7) # partial?
- cmpdi 15, 0
- beq Normal_block_dec
- bl Do_partial_block
+ addi 12, 12, -1
cmpdi 12, 0
- ble aes_gcm_out
-
- b Continue_partial_check_dec
+ bne __Loop_8x_block_dec
+
+__Finish_ghash_dec:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
+ addi 14, 14, 128
+
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+ addi 9, 9, 128
+
+ #vmr 15, 23
+ vxor 15, 23, 0
+ vmr 16, 24
+ vmr 17, 25
+ vmr 18, 26
+ vmr 19, 27
+ vmr 20, 28
+ vmr 21, 29
+ vmr 22, 30
+
+ #vxor 15, 15, 0
+ PPC_GHASH4x 0, 15, 16, 17, 18
+
+ vxor 19, 19, 0
+ PPC_GHASH4x 0, 19, 20, 21, 22
+
+ xxlor 30+32, 9, 9 # last ctr
+ vadduwm 30, 30, 31 # increase ctr
+ stxvb16x 32+30, 0, 7 # update IV
+ stxvb16x 32+0, 0, 8 # update Xi
+
+ addi 5, 5, -128
+ addi 11, 11, 128
-Normal_block_dec:
- lxvb16x 15, 0, 14 # load last block
- xxlxor 47, 47, 15
+ #
+ # Done 8x blocks
+ #
- # create partial block mask
- li 15, 16
- sub 15, 15, 12 # index to the mask
+ cmpdi 5, 0
+ beq aes_gcm_out
- vspltisb 16, -1 # first 16 bytes - 0xffff...ff
- vspltisb 17, 0 # second 16 bytes - 0x0000...00
- li 10, 192
- stvx 16, 10, 1
- addi 10, 10, 16
- stvx 17, 10, 1
+__Process_more_dec:
+ li 24, 0 # decrypt
+ bl aes_gcm_crypt_1x
+ cmpdi 5, 0
+ beq aes_gcm_out
- addi 10, 1, 192
- lxvb16x 16, 15, 10 # load partial block mask
- xxland 47, 47, 16
+ bl __Process_partial
+ cmpdi 5, 0
+ beq aes_gcm_out
+__Do_combine_dec:
+ bl __Combine_partial
+ cmpdi 5, 0
+ bgt __Process_decrypt
+ b aes_gcm_out
+SYM_FUNC_END(aes_p10_gcm_decrypt)
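One structural difference from the encrypt path is worth noting: after vcipherlast, the decrypt loop keeps copies of the blocks it loaded (the vmr 15, 23 ... sequence) and hashes those, because GHASH always runs over the ciphertext - the input when decrypting, the output when encrypting. A toy model of that data flow (ghash_fold is a stand-in that merely XOR-folds a block; real GHASH multiplies by H in GF(2^128)):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* stand-in for GHASH, only to make the data flow visible */
    static void ghash_fold(uint8_t Xi[16], const uint8_t blk[16])
    {
        for (int i = 0; i < 16; i++)
            Xi[i] ^= blk[i];
    }

    int main(void)
    {
        uint8_t ks[16] = { 0x5a, 0xc3, 0x99 };   /* pretend keystream */
        uint8_t pt[16] = { 'p', 'l', 'a', 'i', 'n' };
        uint8_t ct[16], out[16];
        uint8_t Xi_enc[16] = { 0 }, Xi_dec[16] = { 0 };

        for (int i = 0; i < 16; i++)             /* encrypt */
            ct[i] = pt[i] ^ ks[i];
        ghash_fold(Xi_enc, ct);                  /* hash own output */

        for (int i = 0; i < 16; i++)             /* decrypt */
            out[i] = ct[i] ^ ks[i];
        ghash_fold(Xi_dec, ct);                  /* hash the input */

        assert(memcmp(out, pt, 16) == 0);
        assert(memcmp(Xi_enc, Xi_dec, 16) == 0); /* same tag input */
        return 0;
    }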
- xxland 32+28, 15, 16
- #vmr 28, 15
- ppc_update_hash_1x
+SYM_FUNC_START_LOCAL(aes_gcm_out)
- # * should store only the remaining bytes.
- bl Write_partial_block
+ mr 3, 11 # return count
- stxvb16x 30+32, 0, 7 # update IV
- std 12, 56(7) # update partial?
- li 16, 16
+ RESTORE_REGS
+ blr
- stxvb16x 32, 0, 8 # write out Xi
- stxvb16x 32, 16, 8 # write out Xi
- b aes_gcm_out
+__Invalid_msg_len:
+ li 3, 0
+ blr
+SYM_FUNC_END(aes_gcm_out)
+
+SYM_DATA_START_LOCAL(PERMX)
+.align 4
+# for vector permute and xor
+permx:
+.long 0x4c5d6e7f, 0x08192a3b, 0xc4d5e6f7, 0x8091a2b3
+SYM_DATA_END(permx)
diff --git a/arch/powerpc/crypto/aes.c b/arch/powerpc/crypto/aes.c
index ec06189fbf99..3f1e5e894902 100644
--- a/arch/powerpc/crypto/aes.c
+++ b/arch/powerpc/crypto/aes.c
@@ -7,15 +7,15 @@
* Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com>
*/
-#include <linux/types.h>
-#include <linux/err.h>
-#include <linux/crypto.h>
-#include <linux/delay.h>
#include <asm/simd.h>
#include <asm/switch_to.h>
#include <crypto/aes.h>
#include <crypto/internal/cipher.h>
#include <crypto/internal/simd.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
#include "aesp8-ppc.h"
diff --git a/arch/powerpc/crypto/aes_cbc.c b/arch/powerpc/crypto/aes_cbc.c
index ed0debc7acb5..5f2a4f375eef 100644
--- a/arch/powerpc/crypto/aes_cbc.c
+++ b/arch/powerpc/crypto/aes_cbc.c
@@ -12,6 +12,10 @@
#include <crypto/aes.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
#include "aesp8-ppc.h"
diff --git a/arch/powerpc/crypto/aes_ctr.c b/arch/powerpc/crypto/aes_ctr.c
index 9a3da8cd62f3..e27c4036e711 100644
--- a/arch/powerpc/crypto/aes_ctr.c
+++ b/arch/powerpc/crypto/aes_ctr.c
@@ -12,6 +12,10 @@
#include <crypto/aes.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
#include "aesp8-ppc.h"
@@ -69,9 +73,9 @@ static int p8_aes_ctr_setkey(struct crypto_skcipher *tfm, const u8 *key,
static void p8_aes_ctr_final(const struct p8_aes_ctr_ctx *ctx,
struct skcipher_walk *walk)
{
+ const u8 *src = walk->src.virt.addr;
u8 *ctrblk = walk->iv;
u8 keystream[AES_BLOCK_SIZE];
- u8 *src = walk->src.virt.addr;
u8 *dst = walk->dst.virt.addr;
unsigned int nbytes = walk->nbytes;
diff --git a/arch/powerpc/crypto/aes_xts.c b/arch/powerpc/crypto/aes_xts.c
index dabbccb41550..9440e771cede 100644
--- a/arch/powerpc/crypto/aes_xts.c
+++ b/arch/powerpc/crypto/aes_xts.c
@@ -13,6 +13,10 @@
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/xts.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
#include "aesp8-ppc.h"
diff --git a/arch/powerpc/crypto/chacha-p10-glue.c b/arch/powerpc/crypto/chacha-p10-glue.c
deleted file mode 100644
index 74fb86b0d209..000000000000
--- a/arch/powerpc/crypto/chacha-p10-glue.c
+++ /dev/null
@@ -1,221 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * PowerPC P10 (ppc64le) accelerated ChaCha and XChaCha stream ciphers,
- * including ChaCha20 (RFC7539)
- *
- * Copyright 2023- IBM Corp. All rights reserved.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/internal/chacha.h>
-#include <crypto/internal/simd.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/cpufeature.h>
-#include <linux/sizes.h>
-#include <asm/simd.h>
-#include <asm/switch_to.h>
-
-asmlinkage void chacha_p10le_8x(u32 *state, u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
-
-static void vsx_begin(void)
-{
- preempt_disable();
- enable_kernel_vsx();
-}
-
-static void vsx_end(void)
-{
- disable_kernel_vsx();
- preempt_enable();
-}
-
-static void chacha_p10_do_8x(u32 *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- unsigned int l = bytes & ~0x0FF;
-
- if (l > 0) {
- chacha_p10le_8x(state, dst, src, l, nrounds);
- bytes -= l;
- src += l;
- dst += l;
- state[12] += l / CHACHA_BLOCK_SIZE;
- }
-
- if (bytes > 0)
- chacha_crypt_generic(state, dst, src, bytes, nrounds);
-}
-
-void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
-{
- hchacha_block_generic(state, stream, nrounds);
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
-{
- chacha_init_generic(state, key, iv);
-}
-EXPORT_SYMBOL(chacha_init_arch);
-
-void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
- int nrounds)
-{
- if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE ||
- !crypto_simd_usable())
- return chacha_crypt_generic(state, dst, src, bytes, nrounds);
-
- do {
- unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
- vsx_begin();
- chacha_p10_do_8x(state, dst, src, todo, nrounds);
- vsx_end();
-
- bytes -= todo;
- src += todo;
- dst += todo;
- } while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-static int chacha_p10_stream_xor(struct skcipher_request *req,
- const struct chacha_ctx *ctx, const u8 *iv)
-{
- struct skcipher_walk walk;
- u32 state[16];
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
- if (err)
- return err;
-
- chacha_init_generic(state, ctx->key, iv);
-
- while (walk.nbytes > 0) {
- unsigned int nbytes = walk.nbytes;
-
- if (nbytes < walk.total)
- nbytes = rounddown(nbytes, walk.stride);
-
- if (!crypto_simd_usable()) {
- chacha_crypt_generic(state, walk.dst.virt.addr,
- walk.src.virt.addr, nbytes,
- ctx->nrounds);
- } else {
- vsx_begin();
- chacha_p10_do_8x(state, walk.dst.virt.addr,
- walk.src.virt.addr, nbytes, ctx->nrounds);
- vsx_end();
- }
- err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
- if (err)
- break;
- }
-
- return err;
-}
-
-static int chacha_p10(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return chacha_p10_stream_xor(req, ctx, req->iv);
-}
-
-static int xchacha_p10(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct chacha_ctx subctx;
- u32 state[16];
- u8 real_iv[16];
-
- chacha_init_generic(state, ctx->key, req->iv);
- hchacha_block_arch(state, subctx.key, ctx->nrounds);
- subctx.nrounds = ctx->nrounds;
-
- memcpy(&real_iv[0], req->iv + 24, 8);
- memcpy(&real_iv[8], req->iv + 16, 8);
- return chacha_p10_stream_xor(req, &subctx, real_iv);
-}
-
-static struct skcipher_alg algs[] = {
- {
- .base.cra_name = "chacha20",
- .base.cra_driver_name = "chacha20-p10",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA_KEY_SIZE,
- .max_keysize = CHACHA_KEY_SIZE,
- .ivsize = CHACHA_IV_SIZE,
- .chunksize = CHACHA_BLOCK_SIZE,
- .setkey = chacha20_setkey,
- .encrypt = chacha_p10,
- .decrypt = chacha_p10,
- }, {
- .base.cra_name = "xchacha20",
- .base.cra_driver_name = "xchacha20-p10",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA_KEY_SIZE,
- .max_keysize = CHACHA_KEY_SIZE,
- .ivsize = XCHACHA_IV_SIZE,
- .chunksize = CHACHA_BLOCK_SIZE,
- .setkey = chacha20_setkey,
- .encrypt = xchacha_p10,
- .decrypt = xchacha_p10,
- }, {
- .base.cra_name = "xchacha12",
- .base.cra_driver_name = "xchacha12-p10",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA_KEY_SIZE,
- .max_keysize = CHACHA_KEY_SIZE,
- .ivsize = XCHACHA_IV_SIZE,
- .chunksize = CHACHA_BLOCK_SIZE,
- .setkey = chacha12_setkey,
- .encrypt = xchacha_p10,
- .decrypt = xchacha_p10,
- }
-};
-
-static int __init chacha_p10_init(void)
-{
- static_branch_enable(&have_p10);
-
- return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit chacha_p10_exit(void)
-{
- crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-module_cpu_feature_match(PPC_MODULE_FEATURE_P10, chacha_p10_init);
-module_exit(chacha_p10_exit);
-
-MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (P10 accelerated)");
-MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-p10");
-MODULE_ALIAS_CRYPTO("xchacha20");
-MODULE_ALIAS_CRYPTO("xchacha20-p10");
-MODULE_ALIAS_CRYPTO("xchacha12");
-MODULE_ALIAS_CRYPTO("xchacha12-p10");
diff --git a/arch/powerpc/crypto/chacha-p10le-8x.S b/arch/powerpc/crypto/chacha-p10le-8x.S
deleted file mode 100644
index 17bedb66b822..000000000000
--- a/arch/powerpc/crypto/chacha-p10le-8x.S
+++ /dev/null
@@ -1,842 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#
-# Accelerated chacha20 implementation for ppc64le.
-#
-# Copyright 2023- IBM Corp. All rights reserved
-#
-#===================================================================================
-# Written by Danny Tsen <dtsen@us.ibm.com>
-#
-# chacha_p10le_8x(u32 *state, byte *dst, const byte *src,
-# size_t len, int nrounds);
-#
-# do rounds, 8 quarter rounds
-# 1. a += b; d ^= a; d <<<= 16;
-# 2. c += d; b ^= c; b <<<= 12;
-# 3. a += b; d ^= a; d <<<= 8;
-# 4. c += d; b ^= c; b <<<= 7
-#
-# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16
-# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12
-# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8
-# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7
-#
-# 4 blocks (a b c d)
-#
-# a0 b0 c0 d0
-# a1 b1 c1 d1
-# ...
-# a4 b4 c4 d4
-# ...
-# a8 b8 c8 d8
-# ...
-# a12 b12 c12 d12
-# a13 ...
-# a14 ...
-# a15 b15 c15 d15
-#
-# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
-# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
-#
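The four steps above are the standard ChaCha quarter round. In plain C, independent of the 8-way vector scheduling in this file:

    #include <stdint.h>

    #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

    /* one ChaCha quarter round over four 32-bit state words */
    static void chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
        *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
        *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
        *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
        *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
    }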
-
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/asm-compat.h>
-#include <linux/linkage.h>
-
-.machine "any"
-.text
-
-.macro SAVE_GPR GPR OFFSET FRAME
- std \GPR,\OFFSET(\FRAME)
-.endm
-
-.macro SAVE_VRS VRS OFFSET FRAME
- li 16, \OFFSET
- stvx \VRS, 16, \FRAME
-.endm
-
-.macro SAVE_VSX VSX OFFSET FRAME
- li 16, \OFFSET
- stxvx \VSX, 16, \FRAME
-.endm
-
-.macro RESTORE_GPR GPR OFFSET FRAME
- ld \GPR,\OFFSET(\FRAME)
-.endm
-
-.macro RESTORE_VRS VRS OFFSET FRAME
- li 16, \OFFSET
- lvx \VRS, 16, \FRAME
-.endm
-
-.macro RESTORE_VSX VSX OFFSET FRAME
- li 16, \OFFSET
- lxvx \VSX, 16, \FRAME
-.endm
-
-.macro SAVE_REGS
- mflr 0
- std 0, 16(1)
- stdu 1,-752(1)
-
- SAVE_GPR 14, 112, 1
- SAVE_GPR 15, 120, 1
- SAVE_GPR 16, 128, 1
- SAVE_GPR 17, 136, 1
- SAVE_GPR 18, 144, 1
- SAVE_GPR 19, 152, 1
- SAVE_GPR 20, 160, 1
- SAVE_GPR 21, 168, 1
- SAVE_GPR 22, 176, 1
- SAVE_GPR 23, 184, 1
- SAVE_GPR 24, 192, 1
- SAVE_GPR 25, 200, 1
- SAVE_GPR 26, 208, 1
- SAVE_GPR 27, 216, 1
- SAVE_GPR 28, 224, 1
- SAVE_GPR 29, 232, 1
- SAVE_GPR 30, 240, 1
- SAVE_GPR 31, 248, 1
-
- addi 9, 1, 256
- SAVE_VRS 20, 0, 9
- SAVE_VRS 21, 16, 9
- SAVE_VRS 22, 32, 9
- SAVE_VRS 23, 48, 9
- SAVE_VRS 24, 64, 9
- SAVE_VRS 25, 80, 9
- SAVE_VRS 26, 96, 9
- SAVE_VRS 27, 112, 9
- SAVE_VRS 28, 128, 9
- SAVE_VRS 29, 144, 9
- SAVE_VRS 30, 160, 9
- SAVE_VRS 31, 176, 9
-
- SAVE_VSX 14, 192, 9
- SAVE_VSX 15, 208, 9
- SAVE_VSX 16, 224, 9
- SAVE_VSX 17, 240, 9
- SAVE_VSX 18, 256, 9
- SAVE_VSX 19, 272, 9
- SAVE_VSX 20, 288, 9
- SAVE_VSX 21, 304, 9
- SAVE_VSX 22, 320, 9
- SAVE_VSX 23, 336, 9
- SAVE_VSX 24, 352, 9
- SAVE_VSX 25, 368, 9
- SAVE_VSX 26, 384, 9
- SAVE_VSX 27, 400, 9
- SAVE_VSX 28, 416, 9
- SAVE_VSX 29, 432, 9
- SAVE_VSX 30, 448, 9
- SAVE_VSX 31, 464, 9
-.endm # SAVE_REGS
-
-.macro RESTORE_REGS
- addi 9, 1, 256
- RESTORE_VRS 20, 0, 9
- RESTORE_VRS 21, 16, 9
- RESTORE_VRS 22, 32, 9
- RESTORE_VRS 23, 48, 9
- RESTORE_VRS 24, 64, 9
- RESTORE_VRS 25, 80, 9
- RESTORE_VRS 26, 96, 9
- RESTORE_VRS 27, 112, 9
- RESTORE_VRS 28, 128, 9
- RESTORE_VRS 29, 144, 9
- RESTORE_VRS 30, 160, 9
- RESTORE_VRS 31, 176, 9
-
- RESTORE_VSX 14, 192, 9
- RESTORE_VSX 15, 208, 9
- RESTORE_VSX 16, 224, 9
- RESTORE_VSX 17, 240, 9
- RESTORE_VSX 18, 256, 9
- RESTORE_VSX 19, 272, 9
- RESTORE_VSX 20, 288, 9
- RESTORE_VSX 21, 304, 9
- RESTORE_VSX 22, 320, 9
- RESTORE_VSX 23, 336, 9
- RESTORE_VSX 24, 352, 9
- RESTORE_VSX 25, 368, 9
- RESTORE_VSX 26, 384, 9
- RESTORE_VSX 27, 400, 9
- RESTORE_VSX 28, 416, 9
- RESTORE_VSX 29, 432, 9
- RESTORE_VSX 30, 448, 9
- RESTORE_VSX 31, 464, 9
-
- RESTORE_GPR 14, 112, 1
- RESTORE_GPR 15, 120, 1
- RESTORE_GPR 16, 128, 1
- RESTORE_GPR 17, 136, 1
- RESTORE_GPR 18, 144, 1
- RESTORE_GPR 19, 152, 1
- RESTORE_GPR 20, 160, 1
- RESTORE_GPR 21, 168, 1
- RESTORE_GPR 22, 176, 1
- RESTORE_GPR 23, 184, 1
- RESTORE_GPR 24, 192, 1
- RESTORE_GPR 25, 200, 1
- RESTORE_GPR 26, 208, 1
- RESTORE_GPR 27, 216, 1
- RESTORE_GPR 28, 224, 1
- RESTORE_GPR 29, 232, 1
- RESTORE_GPR 30, 240, 1
- RESTORE_GPR 31, 248, 1
-
- addi 1, 1, 752
- ld 0, 16(1)
- mtlr 0
-.endm # RESTORE_REGS
-
-.macro QT_loop_8x
- # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 20, 20
- vadduwm 0, 0, 4
- vadduwm 1, 1, 5
- vadduwm 2, 2, 6
- vadduwm 3, 3, 7
- vadduwm 16, 16, 20
- vadduwm 17, 17, 21
- vadduwm 18, 18, 22
- vadduwm 19, 19, 23
-
- vpermxor 12, 12, 0, 25
- vpermxor 13, 13, 1, 25
- vpermxor 14, 14, 2, 25
- vpermxor 15, 15, 3, 25
- vpermxor 28, 28, 16, 25
- vpermxor 29, 29, 17, 25
- vpermxor 30, 30, 18, 25
- vpermxor 31, 31, 19, 25
- xxlor 32+25, 0, 0
- vadduwm 8, 8, 12
- vadduwm 9, 9, 13
- vadduwm 10, 10, 14
- vadduwm 11, 11, 15
- vadduwm 24, 24, 28
- vadduwm 25, 25, 29
- vadduwm 26, 26, 30
- vadduwm 27, 27, 31
- vxor 4, 4, 8
- vxor 5, 5, 9
- vxor 6, 6, 10
- vxor 7, 7, 11
- vxor 20, 20, 24
- vxor 21, 21, 25
- vxor 22, 22, 26
- vxor 23, 23, 27
-
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 21, 21
- vrlw 4, 4, 25 #
- vrlw 5, 5, 25
- vrlw 6, 6, 25
- vrlw 7, 7, 25
- vrlw 20, 20, 25 #
- vrlw 21, 21, 25
- vrlw 22, 22, 25
- vrlw 23, 23, 25
- xxlor 32+25, 0, 0
- vadduwm 0, 0, 4
- vadduwm 1, 1, 5
- vadduwm 2, 2, 6
- vadduwm 3, 3, 7
- vadduwm 16, 16, 20
- vadduwm 17, 17, 21
- vadduwm 18, 18, 22
- vadduwm 19, 19, 23
-
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 22, 22
- vpermxor 12, 12, 0, 25
- vpermxor 13, 13, 1, 25
- vpermxor 14, 14, 2, 25
- vpermxor 15, 15, 3, 25
- vpermxor 28, 28, 16, 25
- vpermxor 29, 29, 17, 25
- vpermxor 30, 30, 18, 25
- vpermxor 31, 31, 19, 25
- xxlor 32+25, 0, 0
- vadduwm 8, 8, 12
- vadduwm 9, 9, 13
- vadduwm 10, 10, 14
- vadduwm 11, 11, 15
- vadduwm 24, 24, 28
- vadduwm 25, 25, 29
- vadduwm 26, 26, 30
- vadduwm 27, 27, 31
- xxlor 0, 32+28, 32+28
- xxlor 32+28, 23, 23
- vxor 4, 4, 8
- vxor 5, 5, 9
- vxor 6, 6, 10
- vxor 7, 7, 11
- vxor 20, 20, 24
- vxor 21, 21, 25
- vxor 22, 22, 26
- vxor 23, 23, 27
- vrlw 4, 4, 28 #
- vrlw 5, 5, 28
- vrlw 6, 6, 28
- vrlw 7, 7, 28
- vrlw 20, 20, 28 #
- vrlw 21, 21, 28
- vrlw 22, 22, 28
- vrlw 23, 23, 28
- xxlor 32+28, 0, 0
-
- # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 20, 20
- vadduwm 0, 0, 5
- vadduwm 1, 1, 6
- vadduwm 2, 2, 7
- vadduwm 3, 3, 4
- vadduwm 16, 16, 21
- vadduwm 17, 17, 22
- vadduwm 18, 18, 23
- vadduwm 19, 19, 20
-
- vpermxor 15, 15, 0, 25
- vpermxor 12, 12, 1, 25
- vpermxor 13, 13, 2, 25
- vpermxor 14, 14, 3, 25
- vpermxor 31, 31, 16, 25
- vpermxor 28, 28, 17, 25
- vpermxor 29, 29, 18, 25
- vpermxor 30, 30, 19, 25
-
- xxlor 32+25, 0, 0
- vadduwm 10, 10, 15
- vadduwm 11, 11, 12
- vadduwm 8, 8, 13
- vadduwm 9, 9, 14
- vadduwm 26, 26, 31
- vadduwm 27, 27, 28
- vadduwm 24, 24, 29
- vadduwm 25, 25, 30
- vxor 5, 5, 10
- vxor 6, 6, 11
- vxor 7, 7, 8
- vxor 4, 4, 9
- vxor 21, 21, 26
- vxor 22, 22, 27
- vxor 23, 23, 24
- vxor 20, 20, 25
-
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 21, 21
- vrlw 5, 5, 25
- vrlw 6, 6, 25
- vrlw 7, 7, 25
- vrlw 4, 4, 25
- vrlw 21, 21, 25
- vrlw 22, 22, 25
- vrlw 23, 23, 25
- vrlw 20, 20, 25
- xxlor 32+25, 0, 0
-
- vadduwm 0, 0, 5
- vadduwm 1, 1, 6
- vadduwm 2, 2, 7
- vadduwm 3, 3, 4
- vadduwm 16, 16, 21
- vadduwm 17, 17, 22
- vadduwm 18, 18, 23
- vadduwm 19, 19, 20
-
- xxlor 0, 32+25, 32+25
- xxlor 32+25, 22, 22
- vpermxor 15, 15, 0, 25
- vpermxor 12, 12, 1, 25
- vpermxor 13, 13, 2, 25
- vpermxor 14, 14, 3, 25
- vpermxor 31, 31, 16, 25
- vpermxor 28, 28, 17, 25
- vpermxor 29, 29, 18, 25
- vpermxor 30, 30, 19, 25
- xxlor 32+25, 0, 0
-
- vadduwm 10, 10, 15
- vadduwm 11, 11, 12
- vadduwm 8, 8, 13
- vadduwm 9, 9, 14
- vadduwm 26, 26, 31
- vadduwm 27, 27, 28
- vadduwm 24, 24, 29
- vadduwm 25, 25, 30
-
- xxlor 0, 32+28, 32+28
- xxlor 32+28, 23, 23
- vxor 5, 5, 10
- vxor 6, 6, 11
- vxor 7, 7, 8
- vxor 4, 4, 9
- vxor 21, 21, 26
- vxor 22, 22, 27
- vxor 23, 23, 24
- vxor 20, 20, 25
- vrlw 5, 5, 28
- vrlw 6, 6, 28
- vrlw 7, 7, 28
- vrlw 4, 4, 28
- vrlw 21, 21, 28
- vrlw 22, 22, 28
- vrlw 23, 23, 28
- vrlw 20, 20, 28
- xxlor 32+28, 0, 0
-.endm
-
-.macro QT_loop_4x
- # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
- vadduwm 0, 0, 4
- vadduwm 1, 1, 5
- vadduwm 2, 2, 6
- vadduwm 3, 3, 7
- vpermxor 12, 12, 0, 20
- vpermxor 13, 13, 1, 20
- vpermxor 14, 14, 2, 20
- vpermxor 15, 15, 3, 20
- vadduwm 8, 8, 12
- vadduwm 9, 9, 13
- vadduwm 10, 10, 14
- vadduwm 11, 11, 15
- vxor 4, 4, 8
- vxor 5, 5, 9
- vxor 6, 6, 10
- vxor 7, 7, 11
- vrlw 4, 4, 21
- vrlw 5, 5, 21
- vrlw 6, 6, 21
- vrlw 7, 7, 21
- vadduwm 0, 0, 4
- vadduwm 1, 1, 5
- vadduwm 2, 2, 6
- vadduwm 3, 3, 7
- vpermxor 12, 12, 0, 22
- vpermxor 13, 13, 1, 22
- vpermxor 14, 14, 2, 22
- vpermxor 15, 15, 3, 22
- vadduwm 8, 8, 12
- vadduwm 9, 9, 13
- vadduwm 10, 10, 14
- vadduwm 11, 11, 15
- vxor 4, 4, 8
- vxor 5, 5, 9
- vxor 6, 6, 10
- vxor 7, 7, 11
- vrlw 4, 4, 23
- vrlw 5, 5, 23
- vrlw 6, 6, 23
- vrlw 7, 7, 23
-
- # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
- vadduwm 0, 0, 5
- vadduwm 1, 1, 6
- vadduwm 2, 2, 7
- vadduwm 3, 3, 4
- vpermxor 15, 15, 0, 20
- vpermxor 12, 12, 1, 20
- vpermxor 13, 13, 2, 20
- vpermxor 14, 14, 3, 20
- vadduwm 10, 10, 15
- vadduwm 11, 11, 12
- vadduwm 8, 8, 13
- vadduwm 9, 9, 14
- vxor 5, 5, 10
- vxor 6, 6, 11
- vxor 7, 7, 8
- vxor 4, 4, 9
- vrlw 5, 5, 21
- vrlw 6, 6, 21
- vrlw 7, 7, 21
- vrlw 4, 4, 21
- vadduwm 0, 0, 5
- vadduwm 1, 1, 6
- vadduwm 2, 2, 7
- vadduwm 3, 3, 4
- vpermxor 15, 15, 0, 22
- vpermxor 12, 12, 1, 22
- vpermxor 13, 13, 2, 22
- vpermxor 14, 14, 3, 22
- vadduwm 10, 10, 15
- vadduwm 11, 11, 12
- vadduwm 8, 8, 13
- vadduwm 9, 9, 14
- vxor 5, 5, 10
- vxor 6, 6, 11
- vxor 7, 7, 8
- vxor 4, 4, 9
- vrlw 5, 5, 23
- vrlw 6, 6, 23
- vrlw 7, 7, 23
- vrlw 4, 4, 23
-.endm
-
-# Transpose
-.macro TP_4x a0 a1 a2 a3
- xxmrghw 10, 32+\a0, 32+\a1 # a0, a1, b0, b1
- xxmrghw 11, 32+\a2, 32+\a3 # a2, a3, b2, b3
- xxmrglw 12, 32+\a0, 32+\a1 # c0, c1, d0, d1
- xxmrglw 13, 32+\a2, 32+\a3 # c2, c3, d2, d3
- xxpermdi 32+\a0, 10, 11, 0 # a0, a1, a2, a3
- xxpermdi 32+\a1, 10, 11, 3 # b0, b1, b2, b3
- xxpermdi 32+\a2, 12, 13, 0 # c0, c1, c2, c3
- xxpermdi 32+\a3, 12, 13, 3 # d0, d1, d2, d3
-.endm
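TP_4x is a 4x4 transpose of 32-bit lanes: during the rounds each vector holds one state word across four blocks (one block per lane), and the transpose regroups them so each block's words are contiguous for the key-stream XOR. Roughly the scalar equivalent, as a sketch:

    #include <stdint.h>

    /* in-place 4x4 transpose of 32-bit words */
    static void transpose4x4(uint32_t m[4][4])
    {
        for (int i = 0; i < 4; i++)
            for (int j = i + 1; j < 4; j++) {
                uint32_t t = m[i][j];

                m[i][j] = m[j][i];
                m[j][i] = t;
            }
    }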
-
-# key stream = working state + state
-.macro Add_state S
- vadduwm \S+0, \S+0, 16-\S
- vadduwm \S+4, \S+4, 17-\S
- vadduwm \S+8, \S+8, 18-\S
- vadduwm \S+12, \S+12, 19-\S
-
- vadduwm \S+1, \S+1, 16-\S
- vadduwm \S+5, \S+5, 17-\S
- vadduwm \S+9, \S+9, 18-\S
- vadduwm \S+13, \S+13, 19-\S
-
- vadduwm \S+2, \S+2, 16-\S
- vadduwm \S+6, \S+6, 17-\S
- vadduwm \S+10, \S+10, 18-\S
- vadduwm \S+14, \S+14, 19-\S
-
- vadduwm \S+3, \S+3, 16-\S
- vadduwm \S+7, \S+7, 17-\S
- vadduwm \S+11, \S+11, 18-\S
- vadduwm \S+15, \S+15, 19-\S
-.endm
-
-#
-# write 256 bytes
-#
-.macro Write_256 S
- add 9, 14, 5
- add 16, 14, 4
- lxvw4x 0, 0, 9
- lxvw4x 1, 17, 9
- lxvw4x 2, 18, 9
- lxvw4x 3, 19, 9
- lxvw4x 4, 20, 9
- lxvw4x 5, 21, 9
- lxvw4x 6, 22, 9
- lxvw4x 7, 23, 9
- lxvw4x 8, 24, 9
- lxvw4x 9, 25, 9
- lxvw4x 10, 26, 9
- lxvw4x 11, 27, 9
- lxvw4x 12, 28, 9
- lxvw4x 13, 29, 9
- lxvw4x 14, 30, 9
- lxvw4x 15, 31, 9
-
- xxlxor \S+32, \S+32, 0
- xxlxor \S+36, \S+36, 1
- xxlxor \S+40, \S+40, 2
- xxlxor \S+44, \S+44, 3
- xxlxor \S+33, \S+33, 4
- xxlxor \S+37, \S+37, 5
- xxlxor \S+41, \S+41, 6
- xxlxor \S+45, \S+45, 7
- xxlxor \S+34, \S+34, 8
- xxlxor \S+38, \S+38, 9
- xxlxor \S+42, \S+42, 10
- xxlxor \S+46, \S+46, 11
- xxlxor \S+35, \S+35, 12
- xxlxor \S+39, \S+39, 13
- xxlxor \S+43, \S+43, 14
- xxlxor \S+47, \S+47, 15
-
- stxvw4x \S+32, 0, 16
- stxvw4x \S+36, 17, 16
- stxvw4x \S+40, 18, 16
- stxvw4x \S+44, 19, 16
-
- stxvw4x \S+33, 20, 16
- stxvw4x \S+37, 21, 16
- stxvw4x \S+41, 22, 16
- stxvw4x \S+45, 23, 16
-
- stxvw4x \S+34, 24, 16
- stxvw4x \S+38, 25, 16
- stxvw4x \S+42, 26, 16
- stxvw4x \S+46, 27, 16
-
- stxvw4x \S+35, 28, 16
- stxvw4x \S+39, 29, 16
- stxvw4x \S+43, 30, 16
- stxvw4x \S+47, 31, 16
-
-.endm
-
-#
-# chacha20_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len, int nrounds);
-#
-SYM_FUNC_START(chacha_p10le_8x)
-.align 5
- cmpdi 6, 0
- ble Out_no_chacha
-
- SAVE_REGS
-
- # r17 - r31 mainly for Write_256 macro.
- li 17, 16
- li 18, 32
- li 19, 48
- li 20, 64
- li 21, 80
- li 22, 96
- li 23, 112
- li 24, 128
- li 25, 144
- li 26, 160
- li 27, 176
- li 28, 192
- li 29, 208
- li 30, 224
- li 31, 240
-
- mr 15, 6 # len
- li 14, 0 # offset to inp and outp
-
- lxvw4x 48, 0, 3 # vr16, constants
- lxvw4x 49, 17, 3 # vr17, key 1
- lxvw4x 50, 18, 3 # vr18, key 2
- lxvw4x 51, 19, 3 # vr19, counter, nonce
-
- # create (0, 1, 2, 3) counters
- vspltisw 0, 0
- vspltisw 1, 1
- vspltisw 2, 2
- vspltisw 3, 3
- vmrghw 4, 0, 1
- vmrglw 5, 2, 3
- vsldoi 30, 4, 5, 8 # vr30 counter, 4 (0, 1, 2, 3)
-
- vspltisw 21, 12
- vspltisw 23, 7
-
- addis 11, 2, permx@toc@ha
- addi 11, 11, permx@toc@l
- lxvw4x 32+20, 0, 11
- lxvw4x 32+22, 17, 11
-
- sradi 8, 7, 1
-
- mtctr 8
-
- # save constants to vsx
- xxlor 16, 48, 48
- xxlor 17, 49, 49
- xxlor 18, 50, 50
- xxlor 19, 51, 51
-
- vspltisw 25, 4
- vspltisw 26, 8
-
- xxlor 25, 32+26, 32+26
- xxlor 24, 32+25, 32+25
-
- vadduwm 31, 30, 25 # counter = (0, 1, 2, 3) + (4, 4, 4, 4)
- xxlor 30, 32+30, 32+30
- xxlor 31, 32+31, 32+31
-
- xxlor 20, 32+20, 32+20
- xxlor 21, 32+21, 32+21
- xxlor 22, 32+22, 32+22
- xxlor 23, 32+23, 32+23
-
- cmpdi 6, 512
- blt Loop_last
-
-Loop_8x:
- xxspltw 32+0, 16, 0
- xxspltw 32+1, 16, 1
- xxspltw 32+2, 16, 2
- xxspltw 32+3, 16, 3
-
- xxspltw 32+4, 17, 0
- xxspltw 32+5, 17, 1
- xxspltw 32+6, 17, 2
- xxspltw 32+7, 17, 3
- xxspltw 32+8, 18, 0
- xxspltw 32+9, 18, 1
- xxspltw 32+10, 18, 2
- xxspltw 32+11, 18, 3
- xxspltw 32+12, 19, 0
- xxspltw 32+13, 19, 1
- xxspltw 32+14, 19, 2
- xxspltw 32+15, 19, 3
- vadduwm 12, 12, 30 # increase counter
-
- xxspltw 32+16, 16, 0
- xxspltw 32+17, 16, 1
- xxspltw 32+18, 16, 2
- xxspltw 32+19, 16, 3
-
- xxspltw 32+20, 17, 0
- xxspltw 32+21, 17, 1
- xxspltw 32+22, 17, 2
- xxspltw 32+23, 17, 3
- xxspltw 32+24, 18, 0
- xxspltw 32+25, 18, 1
- xxspltw 32+26, 18, 2
- xxspltw 32+27, 18, 3
- xxspltw 32+28, 19, 0
- xxspltw 32+29, 19, 1
- vadduwm 28, 28, 31 # increase counter
- xxspltw 32+30, 19, 2
- xxspltw 32+31, 19, 3
-
-.align 5
-quarter_loop_8x:
- QT_loop_8x
-
- bdnz quarter_loop_8x
-
- xxlor 0, 32+30, 32+30
- xxlor 32+30, 30, 30
- vadduwm 12, 12, 30
- xxlor 32+30, 0, 0
- TP_4x 0, 1, 2, 3
- TP_4x 4, 5, 6, 7
- TP_4x 8, 9, 10, 11
- TP_4x 12, 13, 14, 15
-
- xxlor 0, 48, 48
- xxlor 1, 49, 49
- xxlor 2, 50, 50
- xxlor 3, 51, 51
- xxlor 48, 16, 16
- xxlor 49, 17, 17
- xxlor 50, 18, 18
- xxlor 51, 19, 19
- Add_state 0
- xxlor 48, 0, 0
- xxlor 49, 1, 1
- xxlor 50, 2, 2
- xxlor 51, 3, 3
- Write_256 0
- addi 14, 14, 256 # offset +=256
- addi 15, 15, -256 # len -=256
-
- xxlor 5, 32+31, 32+31
- xxlor 32+31, 31, 31
- vadduwm 28, 28, 31
- xxlor 32+31, 5, 5
- TP_4x 16+0, 16+1, 16+2, 16+3
- TP_4x 16+4, 16+5, 16+6, 16+7
- TP_4x 16+8, 16+9, 16+10, 16+11
- TP_4x 16+12, 16+13, 16+14, 16+15
-
- xxlor 32, 16, 16
- xxlor 33, 17, 17
- xxlor 34, 18, 18
- xxlor 35, 19, 19
- Add_state 16
- Write_256 16
- addi 14, 14, 256 # offset +=256
- addi 15, 15, -256 # len -=256
-
- xxlor 32+24, 24, 24
- xxlor 32+25, 25, 25
- xxlor 32+30, 30, 30
- vadduwm 30, 30, 25
- vadduwm 31, 30, 24
- xxlor 30, 32+30, 32+30
- xxlor 31, 32+31, 32+31
-
- cmpdi 15, 0
- beq Out_loop
-
- cmpdi 15, 512
- blt Loop_last
-
- mtctr 8
- b Loop_8x
-
-Loop_last:
- lxvw4x 48, 0, 3 # vr16, constants
- lxvw4x 49, 17, 3 # vr17, key 1
- lxvw4x 50, 18, 3 # vr18, key 2
- lxvw4x 51, 19, 3 # vr19, counter, nonce
-
- vspltisw 21, 12
- vspltisw 23, 7
- addis 11, 2, permx@toc@ha
- addi 11, 11, permx@toc@l
- lxvw4x 32+20, 0, 11
- lxvw4x 32+22, 17, 11
-
- sradi 8, 7, 1
- mtctr 8
-
-Loop_4x:
- vspltw 0, 16, 0
- vspltw 1, 16, 1
- vspltw 2, 16, 2
- vspltw 3, 16, 3
-
- vspltw 4, 17, 0
- vspltw 5, 17, 1
- vspltw 6, 17, 2
- vspltw 7, 17, 3
- vspltw 8, 18, 0
- vspltw 9, 18, 1
- vspltw 10, 18, 2
- vspltw 11, 18, 3
- vspltw 12, 19, 0
- vadduwm 12, 12, 30 # increase counter
- vspltw 13, 19, 1
- vspltw 14, 19, 2
- vspltw 15, 19, 3
-
-.align 5
-quarter_loop:
- QT_loop_4x
-
- bdnz quarter_loop
-
- vadduwm 12, 12, 30
- TP_4x 0, 1, 2, 3
- TP_4x 4, 5, 6, 7
- TP_4x 8, 9, 10, 11
- TP_4x 12, 13, 14, 15
-
- Add_state 0
- Write_256 0
- addi 14, 14, 256 # offset += 256
- addi 15, 15, -256 # len -= 256
-
- # Update state counter
- vspltisw 25, 4
- vadduwm 30, 30, 25
-
- cmpdi 15, 0
- beq Out_loop
- cmpdi 15, 256
- blt Out_loop
-
- mtctr 8
- b Loop_4x
-
-Out_loop:
- RESTORE_REGS
- blr
-
-Out_no_chacha:
- li 3, 0
- blr
-SYM_FUNC_END(chacha_p10le_8x)
-
-SYM_DATA_START_LOCAL(PERMX)
-.align 5
-permx:
-.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
-.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
-SYM_DATA_END(PERMX)
diff --git a/arch/powerpc/crypto/crc-vpmsum_test.c b/arch/powerpc/crypto/crc-vpmsum_test.c
deleted file mode 100644
index c61a874a3a5c..000000000000
--- a/arch/powerpc/crypto/crc-vpmsum_test.c
+++ /dev/null
@@ -1,133 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * CRC vpmsum tester
- * Copyright 2017 Daniel Axtens, IBM Corporation.
- */
-
-#include <linux/crc-t10dif.h>
-#include <linux/crc32.h>
-#include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/cpufeature.h>
-#include <asm/switch_to.h>
-
-static unsigned long iterations = 10000;
-
-#define MAX_CRC_LENGTH 65535
-
-
-static int __init crc_test_init(void)
-{
- u16 crc16 = 0, verify16 = 0;
- __le32 verify32le = 0;
- unsigned char *data;
- u32 verify32 = 0;
- unsigned long i;
- __le32 crc32;
- int ret;
-
- struct crypto_shash *crct10dif_tfm;
- struct crypto_shash *crc32c_tfm;
-
- if (!cpu_has_feature(CPU_FTR_ARCH_207S))
- return -ENODEV;
-
- data = kmalloc(MAX_CRC_LENGTH, GFP_KERNEL);
- if (!data)
- return -ENOMEM;
-
- crct10dif_tfm = crypto_alloc_shash("crct10dif", 0, 0);
-
- if (IS_ERR(crct10dif_tfm)) {
- pr_err("Error allocating crc-t10dif\n");
- goto free_buf;
- }
-
- crc32c_tfm = crypto_alloc_shash("crc32c", 0, 0);
-
- if (IS_ERR(crc32c_tfm)) {
- pr_err("Error allocating crc32c\n");
- goto free_16;
- }
-
- do {
- SHASH_DESC_ON_STACK(crct10dif_shash, crct10dif_tfm);
- SHASH_DESC_ON_STACK(crc32c_shash, crc32c_tfm);
-
- crct10dif_shash->tfm = crct10dif_tfm;
- ret = crypto_shash_init(crct10dif_shash);
-
- if (ret) {
- pr_err("Error initing crc-t10dif\n");
- goto free_32;
- }
-
-
- crc32c_shash->tfm = crc32c_tfm;
- ret = crypto_shash_init(crc32c_shash);
-
- if (ret) {
- pr_err("Error initing crc32c\n");
- goto free_32;
- }
-
- pr_info("crc-vpmsum_test begins, %lu iterations\n", iterations);
- for (i=0; i<iterations; i++) {
- size_t offset = get_random_u32_below(16);
- size_t len = get_random_u32_below(MAX_CRC_LENGTH);
-
- if (len <= offset)
- continue;
- get_random_bytes(data, len);
- len -= offset;
-
- crypto_shash_update(crct10dif_shash, data+offset, len);
- crypto_shash_final(crct10dif_shash, (u8 *)(&crc16));
- verify16 = crc_t10dif_generic(verify16, data+offset, len);
-
-
- if (crc16 != verify16) {
- pr_err("FAILURE in CRC16: got 0x%04x expected 0x%04x (len %lu)\n",
- crc16, verify16, len);
- break;
- }
-
- crypto_shash_update(crc32c_shash, data+offset, len);
- crypto_shash_final(crc32c_shash, (u8 *)(&crc32));
- verify32 = le32_to_cpu(verify32le);
- verify32le = ~cpu_to_le32(__crc32c_le(~verify32, data+offset, len));
- if (crc32 != verify32le) {
- pr_err("FAILURE in CRC32: got 0x%08x expected 0x%08x (len %lu)\n",
- crc32, verify32, len);
- break;
- }
- cond_resched();
- }
- pr_info("crc-vpmsum_test done, completed %lu iterations\n", i);
- } while (0);
-
-free_32:
- crypto_free_shash(crc32c_tfm);
-
-free_16:
- crypto_free_shash(crct10dif_tfm);
-
-free_buf:
- kfree(data);
-
- return 0;
-}
-
-static void __exit crc_test_exit(void) {}
-
-module_init(crc_test_init);
-module_exit(crc_test_exit);
-module_param(iterations, long, 0400);
-
-MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>");
-MODULE_DESCRIPTION("Vector polynomial multiply-sum CRC tester");
-MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/crypto/crc32-vpmsum_core.S b/arch/powerpc/crypto/crc32-vpmsum_core.S
deleted file mode 100644
index b0f87f595b26..000000000000
--- a/arch/powerpc/crypto/crc32-vpmsum_core.S
+++ /dev/null
@@ -1,746 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Core of the accelerated CRC algorithm.
- * In your file, define the constants and CRC_FUNCTION_NAME
- * Then include this file.
- *
- * Calculate the checksum of data that is 16 byte aligned and a multiple of
- * 16 bytes.
- *
- * The first step is to reduce it to 1024 bits. We do this in 8 parallel
- * chunks in order to mask the latency of the vpmsum instructions. If we
- * have more than 32 kB of data to checksum we repeat this step multiple
- * times, passing in the previous 1024 bits.
- *
- * The next step is to reduce the 1024 bits to 64 bits. This step adds
- * 32 bits of 0s to the end - this matches what a CRC does. We just
- * calculate constants that land the data in these 32 bits.
- *
- * We then use fixed point Barrett reduction to compute a mod n over GF(2)
- * for n = CRC using POWER8 instructions. We use x = 32.
- *
- * https://en.wikipedia.org/wiki/Barrett_reduction
- *
- * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
-*/
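Everything here is built on vpmsum: polynomial multiply-sum over GF(2), where addition is XOR and multiplication is carry-less. A scalar model of one 64x64 carry-less multiply (the vpmsumd instruction computes two such doubleword products and XORs them together):

    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } u128;

    /* carry-less multiply of two 64-bit polynomials over GF(2) */
    static u128 clmul64(uint64_t a, uint64_t b)
    {
        u128 r = { 0, 0 };

        for (int i = 0; i < 64; i++)
            if ((b >> i) & 1) {
                r.lo ^= a << i;
                if (i)
                    r.hi ^= a >> (64 - i);
            }
        return r;
    }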
-
-#include <asm/ppc_asm.h>
-#include <asm/ppc-opcode.h>
-
-#define MAX_SIZE 32768
-
- .text
-
-#if defined(__BIG_ENDIAN__) && defined(REFLECT)
-#define BYTESWAP_DATA
-#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
-#define BYTESWAP_DATA
-#else
-#undef BYTESWAP_DATA
-#endif
-
-#define off16 r25
-#define off32 r26
-#define off48 r27
-#define off64 r28
-#define off80 r29
-#define off96 r30
-#define off112 r31
-
-#define const1 v24
-#define const2 v25
-
-#define byteswap v26
-#define mask_32bit v27
-#define mask_64bit v28
-#define zeroes v29
-
-#ifdef BYTESWAP_DATA
-#define VPERM(A, B, C, D) vperm A, B, C, D
-#else
-#define VPERM(A, B, C, D)
-#endif
-
-/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
-FUNC_START(CRC_FUNCTION_NAME)
- std r31,-8(r1)
- std r30,-16(r1)
- std r29,-24(r1)
- std r28,-32(r1)
- std r27,-40(r1)
- std r26,-48(r1)
- std r25,-56(r1)
-
- li off16,16
- li off32,32
- li off48,48
- li off64,64
- li off80,80
- li off96,96
- li off112,112
- li r0,0
-
- /* Enough room for saving 10 non volatile VMX registers */
- subi r6,r1,56+10*16
- subi r7,r1,56+2*16
-
- stvx v20,0,r6
- stvx v21,off16,r6
- stvx v22,off32,r6
- stvx v23,off48,r6
- stvx v24,off64,r6
- stvx v25,off80,r6
- stvx v26,off96,r6
- stvx v27,off112,r6
- stvx v28,0,r7
- stvx v29,off16,r7
-
- mr r10,r3
-
- vxor zeroes,zeroes,zeroes
- vspltisw v0,-1
-
- vsldoi mask_32bit,zeroes,v0,4
- vsldoi mask_64bit,zeroes,v0,8
-
- /* Get the initial value into v8 */
- vxor v8,v8,v8
- MTVRD(v8, R3)
-#ifdef REFLECT
- vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
-#else
- vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */
-#endif
-
-#ifdef BYTESWAP_DATA
- LOAD_REG_ADDR(r3, .byteswap_constant)
- lvx byteswap,0,r3
- addi r3,r3,16
-#endif
-
- cmpdi r5,256
- blt .Lshort
-
- rldicr r6,r5,0,56
-
- /* Checksum in blocks of MAX_SIZE */
-1: lis r7,MAX_SIZE@h
- ori r7,r7,MAX_SIZE@l
- mr r9,r7
- cmpd r6,r7
- bgt 2f
- mr r7,r6
-2: subf r6,r7,r6
-
- /* our main loop does 128 bytes at a time */
- srdi r7,r7,7
-
- /*
- * Work out the offset into the constants table to start at. Each
- * constant is 16 bytes, and it is used against 128 bytes of input
- * data - 128 / 16 = 8
- */
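Concretely, the table holds MAX_SIZE/128 = 256 entries of 16 bytes each (the MAX_SIZE/8 = 4096 computed into r9), ordered from the largest shift downward, so a block shorter than MAX_SIZE starts part-way in. A quick check of that indexing:

    #include <assert.h>

    #define MAX_SIZE 32768

    int main(void)
    {
        for (unsigned int len = 128; len <= MAX_SIZE; len += 128) {
            unsigned int chunks = len / 128;                  /* srdi r7,r7,7 */
            unsigned int offset = MAX_SIZE / 8 - chunks * 16; /* sldi, srdi, subf */

            /* skipped entries cover exactly the data we do not have */
            assert(offset == (MAX_SIZE - len) / 8);
        }
        return 0;
    }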
- sldi r8,r7,4
- srdi r9,r9,3
- subf r8,r8,r9
-
- /* We reduce our final 128 bytes in a separate step */
- addi r7,r7,-1
- mtctr r7
-
- LOAD_REG_ADDR(r3, .constants)
-
- /* Find the start of our constants */
- add r3,r3,r8
-
- /* zero v0-v7 which will contain our checksums */
- vxor v0,v0,v0
- vxor v1,v1,v1
- vxor v2,v2,v2
- vxor v3,v3,v3
- vxor v4,v4,v4
- vxor v5,v5,v5
- vxor v6,v6,v6
- vxor v7,v7,v7
-
- lvx const1,0,r3
-
- /*
- * If we are looping back to consume more data we use the values
- * already in v16-v23.
- */
- cmpdi r0,1
- beq 2f
-
- /* First warm up pass */
- lvx v16,0,r4
- lvx v17,off16,r4
- VPERM(v16,v16,v16,byteswap)
- VPERM(v17,v17,v17,byteswap)
- lvx v18,off32,r4
- lvx v19,off48,r4
- VPERM(v18,v18,v18,byteswap)
- VPERM(v19,v19,v19,byteswap)
- lvx v20,off64,r4
- lvx v21,off80,r4
- VPERM(v20,v20,v20,byteswap)
- VPERM(v21,v21,v21,byteswap)
- lvx v22,off96,r4
- lvx v23,off112,r4
- VPERM(v22,v22,v22,byteswap)
- VPERM(v23,v23,v23,byteswap)
- addi r4,r4,8*16
-
- /* xor in initial value */
- vxor v16,v16,v8
-
-2: bdz .Lfirst_warm_up_done
-
- addi r3,r3,16
- lvx const2,0,r3
-
- /* Second warm up pass */
- VPMSUMD(v8,v16,const1)
- lvx v16,0,r4
- VPERM(v16,v16,v16,byteswap)
- ori r2,r2,0
-
- VPMSUMD(v9,v17,const1)
- lvx v17,off16,r4
- VPERM(v17,v17,v17,byteswap)
- ori r2,r2,0
-
- VPMSUMD(v10,v18,const1)
- lvx v18,off32,r4
- VPERM(v18,v18,v18,byteswap)
- ori r2,r2,0
-
- VPMSUMD(v11,v19,const1)
- lvx v19,off48,r4
- VPERM(v19,v19,v19,byteswap)
- ori r2,r2,0
-
- VPMSUMD(v12,v20,const1)
- lvx v20,off64,r4
- VPERM(v20,v20,v20,byteswap)
- ori r2,r2,0
-
- VPMSUMD(v13,v21,const1)
- lvx v21,off80,r4
- VPERM(v21,v21,v21,byteswap)
- ori r2,r2,0
-
- VPMSUMD(v14,v22,const1)
- lvx v22,off96,r4
- VPERM(v22,v22,v22,byteswap)
- ori r2,r2,0
-
- VPMSUMD(v15,v23,const1)
- lvx v23,off112,r4
- VPERM(v23,v23,v23,byteswap)
-
- addi r4,r4,8*16
-
- bdz .Lfirst_cool_down
-
- /*
- * main loop. We modulo schedule it such that it takes three iterations
- * to complete - first iteration load, second iteration vpmsum, third
- * iteration xor.
- */
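The same software-pipelining idea in scalar form, as a sketch: the value loaded in iteration i is multiplied in iteration i+1 and accumulated in iteration i+2, so the three latencies overlap (f() stands in for vpmsum):

    #include <assert.h>
    #include <stdint.h>

    #define N 8

    static uint64_t f(uint64_t x) { return x * 3; }  /* stand-in for vpmsum */

    int main(void)
    {
        uint64_t in[N] = { 1, 2, 3, 4, 5, 6, 7, 8 };
        uint64_t acc = 0, loaded, product, check = 0;

        loaded = in[0];            /* prologue: first load ...     */
        product = f(loaded);       /* ... and first multiply       */
        loaded = in[1];

        for (int i = 2; i < N; i++) {
            acc += product;        /* xor stage (iteration i-2)    */
            product = f(loaded);   /* vpmsum stage (iteration i-1) */
            loaded = in[i];        /* load stage (iteration i)     */
        }

        acc += product;            /* epilogue: drain the pipe     */
        acc += f(loaded);

        for (int i = 0; i < N; i++)
            check += f(in[i]);
        assert(acc == check);
        return 0;
    }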
- .balign 16
-4: lvx const1,0,r3
- addi r3,r3,16
- ori r2,r2,0
-
- vxor v0,v0,v8
- VPMSUMD(v8,v16,const2)
- lvx v16,0,r4
- VPERM(v16,v16,v16,byteswap)
- ori r2,r2,0
-
- vxor v1,v1,v9
- VPMSUMD(v9,v17,const2)
- lvx v17,off16,r4
- VPERM(v17,v17,v17,byteswap)
- ori r2,r2,0
-
- vxor v2,v2,v10
- VPMSUMD(v10,v18,const2)
- lvx v18,off32,r4
- VPERM(v18,v18,v18,byteswap)
- ori r2,r2,0
-
- vxor v3,v3,v11
- VPMSUMD(v11,v19,const2)
- lvx v19,off48,r4
- VPERM(v19,v19,v19,byteswap)
- lvx const2,0,r3
- ori r2,r2,0
-
- vxor v4,v4,v12
- VPMSUMD(v12,v20,const1)
- lvx v20,off64,r4
- VPERM(v20,v20,v20,byteswap)
- ori r2,r2,0
-
- vxor v5,v5,v13
- VPMSUMD(v13,v21,const1)
- lvx v21,off80,r4
- VPERM(v21,v21,v21,byteswap)
- ori r2,r2,0
-
- vxor v6,v6,v14
- VPMSUMD(v14,v22,const1)
- lvx v22,off96,r4
- VPERM(v22,v22,v22,byteswap)
- ori r2,r2,0
-
- vxor v7,v7,v15
- VPMSUMD(v15,v23,const1)
- lvx v23,off112,r4
- VPERM(v23,v23,v23,byteswap)
-
- addi r4,r4,8*16
-
- bdnz 4b
-
-.Lfirst_cool_down:
- /* First cool down pass */
- lvx const1,0,r3
- addi r3,r3,16
-
- vxor v0,v0,v8
- VPMSUMD(v8,v16,const1)
- ori r2,r2,0
-
- vxor v1,v1,v9
- VPMSUMD(v9,v17,const1)
- ori r2,r2,0
-
- vxor v2,v2,v10
- VPMSUMD(v10,v18,const1)
- ori r2,r2,0
-
- vxor v3,v3,v11
- VPMSUMD(v11,v19,const1)
- ori r2,r2,0
-
- vxor v4,v4,v12
- VPMSUMD(v12,v20,const1)
- ori r2,r2,0
-
- vxor v5,v5,v13
- VPMSUMD(v13,v21,const1)
- ori r2,r2,0
-
- vxor v6,v6,v14
- VPMSUMD(v14,v22,const1)
- ori r2,r2,0
-
- vxor v7,v7,v15
- VPMSUMD(v15,v23,const1)
- ori r2,r2,0
-
-.Lsecond_cool_down:
- /* Second cool down pass */
- vxor v0,v0,v8
- vxor v1,v1,v9
- vxor v2,v2,v10
- vxor v3,v3,v11
- vxor v4,v4,v12
- vxor v5,v5,v13
- vxor v6,v6,v14
- vxor v7,v7,v15
-
-#ifdef REFLECT
- /*
- * vpmsumd produces a 96 bit result in the least significant bits
- * of the register. Since we are bit reflected we have to shift it
- * left 32 bits so it occupies the least significant bits in the
- * bit reflected domain.
- */
- vsldoi v0,v0,zeroes,4
- vsldoi v1,v1,zeroes,4
- vsldoi v2,v2,zeroes,4
- vsldoi v3,v3,zeroes,4
- vsldoi v4,v4,zeroes,4
- vsldoi v5,v5,zeroes,4
- vsldoi v6,v6,zeroes,4
- vsldoi v7,v7,zeroes,4
-#endif
-
- /* xor with last 1024 bits */
- lvx v8,0,r4
- lvx v9,off16,r4
- VPERM(v8,v8,v8,byteswap)
- VPERM(v9,v9,v9,byteswap)
- lvx v10,off32,r4
- lvx v11,off48,r4
- VPERM(v10,v10,v10,byteswap)
- VPERM(v11,v11,v11,byteswap)
- lvx v12,off64,r4
- lvx v13,off80,r4
- VPERM(v12,v12,v12,byteswap)
- VPERM(v13,v13,v13,byteswap)
- lvx v14,off96,r4
- lvx v15,off112,r4
- VPERM(v14,v14,v14,byteswap)
- VPERM(v15,v15,v15,byteswap)
-
- addi r4,r4,8*16
-
- vxor v16,v0,v8
- vxor v17,v1,v9
- vxor v18,v2,v10
- vxor v19,v3,v11
- vxor v20,v4,v12
- vxor v21,v5,v13
- vxor v22,v6,v14
- vxor v23,v7,v15
-
- li r0,1
- cmpdi r6,0
- addi r6,r6,128
- bne 1b
-
- /* Work out how many bytes we have left */
- andi. r5,r5,127
-
- /* Calculate where in the constant table we need to start */
- subfic r6,r5,128
- add r3,r3,r6
-
- /* How many 16 byte chunks are in the tail */
- srdi r7,r5,4
- mtctr r7
-
- /*
- * Reduce the previously calculated 1024 bits to 64 bits, shifting
- * 32 bits to include the trailing 32 bits of zeros
- */
- lvx v0,0,r3
- lvx v1,off16,r3
- lvx v2,off32,r3
- lvx v3,off48,r3
- lvx v4,off64,r3
- lvx v5,off80,r3
- lvx v6,off96,r3
- lvx v7,off112,r3
- addi r3,r3,8*16
-
- VPMSUMW(v0,v16,v0)
- VPMSUMW(v1,v17,v1)
- VPMSUMW(v2,v18,v2)
- VPMSUMW(v3,v19,v3)
- VPMSUMW(v4,v20,v4)
- VPMSUMW(v5,v21,v5)
- VPMSUMW(v6,v22,v6)
- VPMSUMW(v7,v23,v7)
-
- /* Now reduce the tail (0 - 112 bytes) */
- cmpdi r7,0
- beq 1f
-
- lvx v16,0,r4
- lvx v17,0,r3
- VPERM(v16,v16,v16,byteswap)
- VPMSUMW(v16,v16,v17)
- vxor v0,v0,v16
- bdz 1f
-
- lvx v16,off16,r4
- lvx v17,off16,r3
- VPERM(v16,v16,v16,byteswap)
- VPMSUMW(v16,v16,v17)
- vxor v0,v0,v16
- bdz 1f
-
- lvx v16,off32,r4
- lvx v17,off32,r3
- VPERM(v16,v16,v16,byteswap)
- VPMSUMW(v16,v16,v17)
- vxor v0,v0,v16
- bdz 1f
-
- lvx v16,off48,r4
- lvx v17,off48,r3
- VPERM(v16,v16,v16,byteswap)
- VPMSUMW(v16,v16,v17)
- vxor v0,v0,v16
- bdz 1f
-
- lvx v16,off64,r4
- lvx v17,off64,r3
- VPERM(v16,v16,v16,byteswap)
- VPMSUMW(v16,v16,v17)
- vxor v0,v0,v16
- bdz 1f
-
- lvx v16,off80,r4
- lvx v17,off80,r3
- VPERM(v16,v16,v16,byteswap)
- VPMSUMW(v16,v16,v17)
- vxor v0,v0,v16
- bdz 1f
-
- lvx v16,off96,r4
- lvx v17,off96,r3
- VPERM(v16,v16,v16,byteswap)
- VPMSUMW(v16,v16,v17)
- vxor v0,v0,v16
-
- /* Now xor all the parallel chunks together */
-1: vxor v0,v0,v1
- vxor v2,v2,v3
- vxor v4,v4,v5
- vxor v6,v6,v7
-
- vxor v0,v0,v2
- vxor v4,v4,v6
-
- vxor v0,v0,v4
-
-.Lbarrett_reduction:
- /* Barrett constants */
- LOAD_REG_ADDR(r3, .barrett_constants)
-
- lvx const1,0,r3
- lvx const2,off16,r3
-
- vsldoi v1,v0,v0,8
- vxor v0,v0,v1 /* xor two 64 bit results together */
-
-#ifdef REFLECT
- /* shift left one bit */
- vspltisb v1,1
- vsl v0,v0,v1
-#endif
-
- vand v0,v0,mask_64bit
-#ifndef REFLECT
- /*
- * Now for the Barrett reduction algorithm. The idea is to calculate q,
- * the multiple of our polynomial that we need to subtract. By
- * doing the computation 2x bits higher (ie 64 bits) and shifting the
- * result back down 2x bits, we round down to the nearest multiple.
- */
- VPMSUMD(v1,v0,const1) /* ma */
- vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */
- VPMSUMD(v1,v1,const2) /* qn */
- vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
-
- /*
- * Get the result into r3. We need to shift it left 8 bytes:
- * V0 [ 0 1 2 X ]
- * V0 [ 0 X 2 3 ]
- */
- vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */
-#else
- /*
- * The reflected version of Barrett reduction. Instead of bit
- * reflecting our data (which is expensive to do), we bit reflect our
- * constants and our algorithm, which means the intermediate data in
- * our vector registers goes from 0-63 instead of 63-0. We can reflect
- * the algorithm because we don't carry in mod 2 arithmetic.
- */
- vand v1,v0,mask_32bit /* bottom 32 bits of a */
- VPMSUMD(v1,v1,const1) /* ma */
- vand v1,v1,mask_32bit /* bottom 32bits of ma */
- VPMSUMD(v1,v1,const2) /* qn */
- vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
-
- /*
- * Since we are bit reflected, the result (ie the low 32 bits) is in
- * the high 32 bits. We just need to shift it left 4 bytes
- * V0 [ 0 1 X 3 ]
- * V0 [ 0 X 2 3 ]
- */
- vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */
-#endif
-
- /* Get it into r3 */
- MFVRD(R3, v0)
-
-.Lout:
- subi r6,r1,56+10*16
- subi r7,r1,56+2*16
-
- lvx v20,0,r6
- lvx v21,off16,r6
- lvx v22,off32,r6
- lvx v23,off48,r6
- lvx v24,off64,r6
- lvx v25,off80,r6
- lvx v26,off96,r6
- lvx v27,off112,r6
- lvx v28,0,r7
- lvx v29,off16,r7
-
- ld r31,-8(r1)
- ld r30,-16(r1)
- ld r29,-24(r1)
- ld r28,-32(r1)
- ld r27,-40(r1)
- ld r26,-48(r1)
- ld r25,-56(r1)
-
- blr
-
-.Lfirst_warm_up_done:
- lvx const1,0,r3
- addi r3,r3,16
-
- VPMSUMD(v8,v16,const1)
- VPMSUMD(v9,v17,const1)
- VPMSUMD(v10,v18,const1)
- VPMSUMD(v11,v19,const1)
- VPMSUMD(v12,v20,const1)
- VPMSUMD(v13,v21,const1)
- VPMSUMD(v14,v22,const1)
- VPMSUMD(v15,v23,const1)
-
- b .Lsecond_cool_down
-
-.Lshort:
- cmpdi r5,0
- beq .Lzero
-
- LOAD_REG_ADDR(r3, .short_constants)
-
- /* Calculate where in the constant table we need to start */
- subfic r6,r5,256
- add r3,r3,r6
-
- /* How many 16 byte chunks? */
- srdi r7,r5,4
- mtctr r7
-
- vxor v19,v19,v19
- vxor v20,v20,v20
-
- lvx v0,0,r4
- lvx v16,0,r3
- VPERM(v0,v0,v16,byteswap)
- vxor v0,v0,v8 /* xor in initial value */
- VPMSUMW(v0,v0,v16)
- bdz .Lv0
-
- lvx v1,off16,r4
- lvx v17,off16,r3
- VPERM(v1,v1,v17,byteswap)
- VPMSUMW(v1,v1,v17)
- bdz .Lv1
-
- lvx v2,off32,r4
- lvx v16,off32,r3
- VPERM(v2,v2,v16,byteswap)
- VPMSUMW(v2,v2,v16)
- bdz .Lv2
-
- lvx v3,off48,r4
- lvx v17,off48,r3
- VPERM(v3,v3,v17,byteswap)
- VPMSUMW(v3,v3,v17)
- bdz .Lv3
-
- lvx v4,off64,r4
- lvx v16,off64,r3
- VPERM(v4,v4,v16,byteswap)
- VPMSUMW(v4,v4,v16)
- bdz .Lv4
-
- lvx v5,off80,r4
- lvx v17,off80,r3
- VPERM(v5,v5,v17,byteswap)
- VPMSUMW(v5,v5,v17)
- bdz .Lv5
-
- lvx v6,off96,r4
- lvx v16,off96,r3
- VPERM(v6,v6,v16,byteswap)
- VPMSUMW(v6,v6,v16)
- bdz .Lv6
-
- lvx v7,off112,r4
- lvx v17,off112,r3
- VPERM(v7,v7,v17,byteswap)
- VPMSUMW(v7,v7,v17)
- bdz .Lv7
-
- addi r3,r3,128
- addi r4,r4,128
-
- lvx v8,0,r4
- lvx v16,0,r3
- VPERM(v8,v8,v16,byteswap)
- VPMSUMW(v8,v8,v16)
- bdz .Lv8
-
- lvx v9,off16,r4
- lvx v17,off16,r3
- VPERM(v9,v9,v17,byteswap)
- VPMSUMW(v9,v9,v17)
- bdz .Lv9
-
- lvx v10,off32,r4
- lvx v16,off32,r3
- VPERM(v10,v10,v16,byteswap)
- VPMSUMW(v10,v10,v16)
- bdz .Lv10
-
- lvx v11,off48,r4
- lvx v17,off48,r3
- VPERM(v11,v11,v17,byteswap)
- VPMSUMW(v11,v11,v17)
- bdz .Lv11
-
- lvx v12,off64,r4
- lvx v16,off64,r3
- VPERM(v12,v12,v16,byteswap)
- VPMSUMW(v12,v12,v16)
- bdz .Lv12
-
- lvx v13,off80,r4
- lvx v17,off80,r3
- VPERM(v13,v13,v17,byteswap)
- VPMSUMW(v13,v13,v17)
- bdz .Lv13
-
- lvx v14,off96,r4
- lvx v16,off96,r3
- VPERM(v14,v14,v16,byteswap)
- VPMSUMW(v14,v14,v16)
- bdz .Lv14
-
- lvx v15,off112,r4
- lvx v17,off112,r3
- VPERM(v15,v15,v17,byteswap)
- VPMSUMW(v15,v15,v17)
-
-.Lv15: vxor v19,v19,v15
-.Lv14: vxor v20,v20,v14
-.Lv13: vxor v19,v19,v13
-.Lv12: vxor v20,v20,v12
-.Lv11: vxor v19,v19,v11
-.Lv10: vxor v20,v20,v10
-.Lv9: vxor v19,v19,v9
-.Lv8: vxor v20,v20,v8
-.Lv7: vxor v19,v19,v7
-.Lv6: vxor v20,v20,v6
-.Lv5: vxor v19,v19,v5
-.Lv4: vxor v20,v20,v4
-.Lv3: vxor v19,v19,v3
-.Lv2: vxor v20,v20,v2
-.Lv1: vxor v19,v19,v1
-.Lv0: vxor v20,v20,v0
-
- vxor v0,v19,v20
-
- b .Lbarrett_reduction
-
-.Lzero:
- mr r3,r10
- b .Lout
-
-FUNC_END(CRC_FUNCTION_NAME)
diff --git a/arch/powerpc/crypto/crc32c-vpmsum_asm.S b/arch/powerpc/crypto/crc32c-vpmsum_asm.S
deleted file mode 100644
index bf442004ea1f..000000000000
--- a/arch/powerpc/crypto/crc32c-vpmsum_asm.S
+++ /dev/null
@@ -1,842 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Calculate a crc32c with vpmsum acceleration
- *
- * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
- */
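By way of comparison, the scalar baseline that this file accelerates - bit-serial CRC32c over the reflected Castagnoli polynomial 0x82F63B78 (a reference sketch; the kernel glue handles the pre/post inversion around the vpmsum core):

    #include <stddef.h>
    #include <stdint.h>

    /* bit-serial CRC32c (iSCSI/Castagnoli), reflected form */
    static uint32_t crc32c_ref(uint32_t crc, const uint8_t *p, size_t len)
    {
        crc = ~crc;
        while (len--) {
            crc ^= *p++;
            for (int i = 0; i < 8; i++)
                crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
        }
        return ~crc;
    }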
- .section .rodata
-.balign 16
-
-.byteswap_constant:
- /* byte reverse permute constant */
- .octa 0x0F0E0D0C0B0A09080706050403020100
-
-.constants:
-
- /* Reduce 262144 bits (32 kB) to 1024 bits */
- /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
- .octa 0x00000000b6ca9e20000000009c37c408
-
- /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
- .octa 0x00000000350249a800000001b51df26c
-
- /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
- .octa 0x00000001862dac54000000000724b9d0
-
- /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
- .octa 0x00000001d87fb48c00000001c00532fe
-
- /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
- .octa 0x00000001f39b699e00000000f05a9362
-
- /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
- .octa 0x0000000101da11b400000001e1007970
-
- /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
- .octa 0x00000001cab571e000000000a57366ee
-
- /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
- .octa 0x00000000c7020cfe0000000192011284
-
- /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
- .octa 0x00000000cdaed1ae0000000162716d9a
-
- /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
- .octa 0x00000001e804effc00000000cd97ecde
-
- /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
- .octa 0x0000000077c3ea3a0000000058812bc0
-
- /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
- .octa 0x0000000068df31b40000000088b8c12e
-
- /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
- .octa 0x00000000b059b6c200000001230b234c
-
- /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
- .octa 0x0000000145fb8ed800000001120b416e
-
- /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
- .octa 0x00000000cbc0916800000001974aecb0
-
- /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
- .octa 0x000000005ceeedc2000000008ee3f226
-
- /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
- .octa 0x0000000047d74e8600000001089aba9a
-
- /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
- .octa 0x00000001407e9e220000000065113872
-
- /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
- .octa 0x00000001da967bda000000005c07ec10
-
- /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
- .octa 0x000000006c8983680000000187590924
-
- /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
- .octa 0x00000000f2d14c9800000000e35da7c6
-
- /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
- .octa 0x00000001993c6ad4000000000415855a
-
- /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
- .octa 0x000000014683d1ac0000000073617758
-
- /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
- .octa 0x00000001a7c93e6c0000000176021d28
-
- /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
- .octa 0x000000010211e90a00000001c358fd0a
-
- /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
- .octa 0x000000001119403e00000001ff7a2c18
-
- /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
- .octa 0x000000001c3261aa00000000f2d9f7e4
-
- /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
- .octa 0x000000014e37a634000000016cf1f9c8
-
- /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
- .octa 0x0000000073786c0c000000010af9279a
-
- /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
- .octa 0x000000011dc037f80000000004f101e8
-
- /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
- .octa 0x0000000031433dfc0000000070bcf184
-
- /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
- .octa 0x000000009cde8348000000000a8de642
-
- /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
- .octa 0x0000000038d3c2a60000000062ea130c
-
- /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
- .octa 0x000000011b25f26000000001eb31cbb2
-
- /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
- .octa 0x000000001629e6f00000000170783448
-
- /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
- .octa 0x0000000160838b4c00000001a684b4c6
-
- /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
- .octa 0x000000007a44011c00000000253ca5b4
-
- /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
- .octa 0x00000000226f417a0000000057b4b1e2
-
- /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
- .octa 0x0000000045eb2eb400000000b6bd084c
-
- /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
- .octa 0x000000014459d70c0000000123c2d592
-
- /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
- .octa 0x00000001d406ed8200000000159dafce
-
- /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
- .octa 0x0000000160c8e1a80000000127e1a64e
-
- /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
- .octa 0x0000000027ba80980000000056860754
-
- /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
- .octa 0x000000006d92d01800000001e661aae8
-
- /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
- .octa 0x000000012ed7e3f200000000f82c6166
-
- /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
- .octa 0x000000002dc8778800000000c4f9c7ae
-
- /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
- .octa 0x0000000018240bb80000000074203d20
-
- /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
- .octa 0x000000001ad381580000000198173052
-
- /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
- .octa 0x00000001396b78f200000001ce8aba54
-
- /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
- .octa 0x000000011a68133400000001850d5d94
-
- /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
- .octa 0x000000012104732e00000001d609239c
-
- /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
- .octa 0x00000000a140d90c000000001595f048
-
- /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
- .octa 0x00000001b7215eda0000000042ccee08
-
- /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
- .octa 0x00000001aaf1df3c000000010a389d74
-
- /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
- .octa 0x0000000029d15b8a000000012a840da6
-
- /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
- .octa 0x00000000f1a96922000000001d181c0c
-
- /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
- .octa 0x00000001ac80d03c0000000068b7d1f6
-
- /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
- .octa 0x000000000f11d56a000000005b0f14fc
-
- /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
- .octa 0x00000001f1c022a20000000179e9e730
-
- /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
- .octa 0x0000000173d00ae200000001ce1368d6
-
- /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
- .octa 0x00000001d4ffe4ac0000000112c3a84c
-
- /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
- .octa 0x000000016edc5ae400000000de940fee
-
- /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
- .octa 0x00000001f1a0214000000000fe896b7e
-
- /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
- .octa 0x00000000ca0b28a000000001f797431c
-
- /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
- .octa 0x00000001928e30a20000000053e989ba
-
- /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
- .octa 0x0000000097b1b002000000003920cd16
-
- /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
- .octa 0x00000000b15bf90600000001e6f579b8
-
- /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
- .octa 0x00000000411c5d52000000007493cb0a
-
- /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
- .octa 0x00000001c36f330000000001bdd376d8
-
- /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
- .octa 0x00000001119227e0000000016badfee6
-
- /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
- .octa 0x00000000114d47020000000071de5c58
-
- /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
- .octa 0x00000000458b5b9800000000453f317c
-
- /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
- .octa 0x000000012e31fb8e0000000121675cce
-
- /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
- .octa 0x000000005cf619d800000001f409ee92
-
- /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
- .octa 0x0000000063f4d8b200000000f36b9c88
-
- /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
- .octa 0x000000004138dc8a0000000036b398f4
-
- /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
- .octa 0x00000001d29ee8e000000001748f9adc
-
- /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
- .octa 0x000000006a08ace800000001be94ec00
-
- /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
- .octa 0x0000000127d4201000000000b74370d6
-
- /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
- .octa 0x0000000019d76b6200000001174d0b98
-
- /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
- .octa 0x00000001b1471f6e00000000befc06a4
-
- /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
- .octa 0x00000001f64c19cc00000001ae125288
-
- /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
- .octa 0x00000000003c0ea00000000095c19b34
-
- /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
- .octa 0x000000014d73abf600000001a78496f2
-
- /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
- .octa 0x00000001620eb84400000001ac5390a0
-
- /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
- .octa 0x0000000147655048000000002a80ed6e
-
- /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
- .octa 0x0000000067b5077e00000001fa9b0128
-
- /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
- .octa 0x0000000010ffe20600000001ea94929e
-
- /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
- .octa 0x000000000fee8f1e0000000125f4305c
-
- /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
- .octa 0x00000001da26fbae00000001471e2002
-
- /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
- .octa 0x00000001b3a8bd880000000132d2253a
-
- /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
- .octa 0x00000000e8f3898e00000000f26b3592
-
- /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
- .octa 0x00000000b0d0d28c00000000bc8b67b0
-
- /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
- .octa 0x0000000030f2a798000000013a826ef2
-
- /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
- .octa 0x000000000fba10020000000081482c84
-
- /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
- .octa 0x00000000bdb9bd7200000000e77307c2
-
- /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
- .octa 0x0000000075d3bf5a00000000d4a07ec8
-
- /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
- .octa 0x00000000ef1f98a00000000017102100
-
- /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
- .octa 0x00000000689c760200000000db406486
-
- /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
- .octa 0x000000016d5fa5fe0000000192db7f88
-
- /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
- .octa 0x00000001d0d2b9ca000000018bf67b1e
-
- /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
- .octa 0x0000000041e7b470000000007c09163e
-
- /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
- .octa 0x00000001cbb6495e000000000adac060
-
- /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
- .octa 0x000000010052a0b000000000bd8316ae
-
- /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
- .octa 0x00000001d8effb5c000000019f09ab54
-
- /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
- .octa 0x00000001d969853c0000000125155542
-
- /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
- .octa 0x00000000523ccce2000000018fdb5882
-
- /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
- .octa 0x000000001e2436bc00000000e794b3f4
-
- /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
- .octa 0x00000000ddd1c3a2000000016f9bb022
-
- /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
- .octa 0x0000000019fcfe3800000000290c9978
-
- /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
- .octa 0x00000001ce95db640000000083c0f350
-
- /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
- .octa 0x00000000af5828060000000173ea6628
-
- /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
- .octa 0x00000001006388f600000001c8b4e00a
-
- /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
- .octa 0x0000000179eca00a00000000de95d6aa
-
- /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
- .octa 0x0000000122410a6a000000010b7f7248
-
- /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
- .octa 0x000000004288e87c00000001326e3a06
-
- /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
- .octa 0x000000016c5490da00000000bb62c2e6
-
- /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
- .octa 0x00000000d1c71f6e0000000156a4b2c2
-
- /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
- .octa 0x00000001b4ce08a6000000011dfe763a
-
- /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
- .octa 0x00000001466ba60c000000007bcca8e2
-
- /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
- .octa 0x00000001f6c488a40000000186118faa
-
- /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
- .octa 0x000000013bfb06820000000111a65a88
-
- /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
- .octa 0x00000000690e9e54000000003565e1c4
-
- /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
- .octa 0x00000000281346b6000000012ed02a82
-
- /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
- .octa 0x000000015646402400000000c486ecfc
-
- /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
- .octa 0x000000016063a8dc0000000001b951b2
-
- /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
- .octa 0x0000000116a663620000000048143916
-
- /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
- .octa 0x000000017e8aa4d200000001dc2ae124
-
- /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
- .octa 0x00000001728eb10c00000001416c58d6
-
- /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
- .octa 0x00000001b08fd7fa00000000a479744a
-
- /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
- .octa 0x00000001092a16e80000000096ca3a26
-
- /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
- .octa 0x00000000a505637c00000000ff223d4e
-
- /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
- .octa 0x00000000d94869b2000000010e84da42
-
- /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
- .octa 0x00000001c8b203ae00000001b61ba3d0
-
- /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
- .octa 0x000000005704aea000000000680f2de8
-
- /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
- .octa 0x000000012e295fa2000000008772a9a8
-
- /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
- .octa 0x000000011d0908bc0000000155f295bc
-
- /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
- .octa 0x0000000193ed97ea00000000595f9282
-
- /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
- .octa 0x000000013a0f1c520000000164b1c25a
-
- /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
- .octa 0x000000010c2c40c000000000fbd67c50
-
- /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
- .octa 0x00000000ff6fac3e0000000096076268
-
- /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
- .octa 0x000000017b3609c000000001d288e4cc
-
- /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
- .octa 0x0000000088c8c92200000001eaac1bdc
-
- /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
- .octa 0x00000001751baae600000001f1ea39e2
-
- /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
- .octa 0x000000010795297200000001eb6506fc
-
- /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
- .octa 0x0000000162b00abe000000010f806ffe
-
- /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
- .octa 0x000000000d7b404c000000010408481e
-
- /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
- .octa 0x00000000763b13d40000000188260534
-
- /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
- .octa 0x00000000f6dc22d80000000058fc73e0
-
- /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
- .octa 0x000000007daae06000000000391c59b8
-
- /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
- .octa 0x000000013359ab7c000000018b638400
-
- /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
- .octa 0x000000008add438a000000011738f5c4
-
- /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
- .octa 0x00000001edbefdea000000008cf7c6da
-
- /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
- .octa 0x000000004104e0f800000001ef97fb16
-
- /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
- .octa 0x00000000b48a82220000000102130e20
-
- /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
- .octa 0x00000001bcb4684400000000db968898
-
- /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
- .octa 0x000000013293ce0a00000000b5047b5e
-
- /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
- .octa 0x00000001710d0844000000010b90fdb2
-
- /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
- .octa 0x0000000117907f6e000000004834a32e
-
- /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
- .octa 0x0000000087ddf93e0000000059c8f2b0
-
- /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
- .octa 0x000000005970e9b00000000122cec508
-
- /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
- .octa 0x0000000185b2b7d0000000000a330cda
-
- /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
- .octa 0x00000001dcee0efc000000014a47148c
-
- /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
- .octa 0x0000000030da27220000000042c61cb8
-
- /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
- .octa 0x000000012f925a180000000012fe6960
-
- /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
- .octa 0x00000000dd2e357c00000000dbda2c20
-
- /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
- .octa 0x00000000071c80de000000011122410c
-
- /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
- .octa 0x000000011513140a00000000977b2070
-
- /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
- .octa 0x00000001df876e8e000000014050438e
-
- /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
- .octa 0x000000015f81d6ce0000000147c840e8
-
- /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
- .octa 0x000000019dd94dbe00000001cc7c88ce
-
- /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
- .octa 0x00000001373d206e00000001476b35a4
-
- /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
- .octa 0x00000000668ccade000000013d52d508
-
- /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
- .octa 0x00000001b192d268000000008e4be32e
-
- /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
- .octa 0x00000000e30f3a7800000000024120fe
-
- /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
- .octa 0x000000010ef1f7bc00000000ddecddb4
-
- /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
- .octa 0x00000001f5ac738000000000d4d403bc
-
- /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
- .octa 0x000000011822ea7000000001734b89aa
-
- /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
- .octa 0x00000000c3a33848000000010e7a58d6
-
- /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
- .octa 0x00000001bd151c2400000001f9f04e9c
-
- /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
- .octa 0x0000000056002d7600000000b692225e
-
- /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
- .octa 0x000000014657c4f4000000019b8d3f3e
-
- /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
- .octa 0x0000000113742d7c00000001a874f11e
-
- /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
- .octa 0x000000019c5920ba000000010d5a4254
-
- /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
- .octa 0x000000005216d2d600000000bbb2f5d6
-
- /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
- .octa 0x0000000136f5ad8a0000000179cc0e36
-
- /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
- .octa 0x000000018b07beb600000001dca1da4a
-
- /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
- .octa 0x00000000db1e93b000000000feb1a192
-
- /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
- .octa 0x000000000b96fa3a00000000d1eeedd6
-
- /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
- .octa 0x00000001d9968af0000000008fad9bb4
-
- /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
- .octa 0x000000000e4a77a200000001884938e4
-
- /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
- .octa 0x00000000508c2ac800000001bc2e9bc0
-
- /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
- .octa 0x0000000021572a8000000001f9658a68
-
- /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
- .octa 0x00000001b859daf2000000001b9224fc
-
- /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
- .octa 0x000000016f7884740000000055b2fb84
-
- /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
- .octa 0x00000001b438810e000000018b090348
-
- /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
- .octa 0x0000000095ddc6f2000000011ccbd5ea
-
- /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
- .octa 0x00000001d977c20c0000000007ae47f8
-
- /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
- .octa 0x00000000ebedb99a0000000172acbec0
-
- /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
- .octa 0x00000001df9e9e9200000001c6e3ff20
-
- /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
- .octa 0x00000001a4a3f95200000000e1b38744
-
- /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
- .octa 0x00000000e2f5122000000000791585b2
-
- /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
- .octa 0x000000004aa01f3e00000000ac53b894
-
- /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
- .octa 0x00000000b3e90a5800000001ed5f2cf4
-
- /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
- .octa 0x000000000c9ca2aa00000001df48b2e0
-
- /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
- .octa 0x000000015168231600000000049c1c62
-
- /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
- .octa 0x0000000036fce78c000000017c460c12
-
- /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
- .octa 0x000000009037dc10000000015be4da7e
-
- /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
- .octa 0x00000000d3298582000000010f38f668
-
- /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
- .octa 0x00000001b42e8ad60000000039f40a00
-
- /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
- .octa 0x00000000142a983800000000bd4c10c4
-
- /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
- .octa 0x0000000109c7f1900000000042db1d98
-
- /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
- .octa 0x0000000056ff931000000001c905bae6
-
- /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
- .octa 0x00000001594513aa00000000069d40ea
-
- /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
- .octa 0x00000001e3b5b1e8000000008e4fbad0
-
- /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
- .octa 0x000000011dd5fc080000000047bedd46
-
- /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
- .octa 0x00000001675f0cc20000000026396bf8
-
- /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
- .octa 0x00000000d1c8dd4400000000379beb92
-
- /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
- .octa 0x0000000115ebd3d8000000000abae54a
-
- /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
- .octa 0x00000001ecbd0dac0000000007e6a128
-
- /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
- .octa 0x00000000cdf67af2000000000ade29d2
-
- /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
- .octa 0x000000004c01ff4c00000000f974c45c
-
- /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
- .octa 0x00000000f2d8657e00000000e77ac60a
-
- /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
- .octa 0x000000006bae74c40000000145895816
-
- /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
- .octa 0x0000000152af8aa00000000038e362be
-
- /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
- .octa 0x0000000004663802000000007f991a64
-
- /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
- .octa 0x00000001ab2f5afc00000000fa366d3a
-
- /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
- .octa 0x0000000074a4ebd400000001a2bb34f0
-
- /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
- .octa 0x00000001d7ab3a4c0000000028a9981e
-
- /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
- .octa 0x00000001a8da60c600000001dbc672be
-
- /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
- .octa 0x000000013cf6382000000000b04d77f6
-
- /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
- .octa 0x00000000bec12e1e0000000124400d96
-
- /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
- .octa 0x00000001c6368010000000014ca4b414
-
- /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
- .octa 0x00000001e6e78758000000012fe2c938
-
- /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
- .octa 0x000000008d7f2b3c00000001faed01e6
-
- /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
- .octa 0x000000016b4a156e000000007e80ecfe
-
- /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
- .octa 0x00000001c63cfeb60000000098daee94
-
- /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
- .octa 0x000000015f902670000000010a04edea
-
- /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
- .octa 0x00000001cd5de11e00000001c00b4524
-
- /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
- .octa 0x000000001acaec540000000170296550
-
- /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
- .octa 0x000000002bd0ca780000000181afaa48
-
- /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
- .octa 0x0000000032d63d5c0000000185a31ffa
-
- /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
- .octa 0x000000001c6d4e4c000000002469f608
-
- /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
- .octa 0x0000000106a60b92000000006980102a
-
- /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
- .octa 0x00000000d3855e120000000111ea9ca8
-
- /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
- .octa 0x00000000e312563600000001bd1d29ce
-
- /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
- .octa 0x000000009e8f7ea400000001b34b9580
-
- /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
- .octa 0x00000001c82e562c000000003076054e
-
- /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
- .octa 0x00000000ca9f09ce000000012a608ea4
-
- /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
- .octa 0x00000000c63764e600000000784d05fe
-
- /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
- .octa 0x0000000168d2e49e000000016ef0d82a
-
- /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
- .octa 0x00000000e986c1480000000075bda454
-
- /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
- .octa 0x00000000cfb65894000000003dc0a1c4
-
- /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
- .octa 0x0000000111cadee400000000e9a5d8be
-
- /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
- .octa 0x0000000171fb63ce00000001609bc4b4
-
-.short_constants:
-
- /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
- /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */
- .octa 0x7fec2963e5bf80485cf015c388e56f72
-
- /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */
- .octa 0x38e888d4844752a9963a18920246e2e6
-
- /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */
- .octa 0x42316c00730206ad419a441956993a31
-
- /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */
- .octa 0x543d5c543e65ddf9924752ba2b830011
-
- /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */
- .octa 0x78e87aaf56767c9255bd7f9518e4a304
-
- /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */
- .octa 0x8f68fcec1903da7f6d76739fe0553f1e
-
- /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */
- .octa 0x3f4840246791d588c133722b1fe0b5c3
-
- /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */
- .octa 0x34c96751b04de25a64b67ee0e55ef1f3
-
- /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */
- .octa 0x156c8e180b4a395b069db049b8fdb1e7
-
- /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */
- .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e
-
- /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */
- .octa 0x041d37768cd75659817cdc5119b29a35
-
- /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */
- .octa 0x3a0777818cfaa9651ce9d94b36c41f1c
-
- /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */
- .octa 0x0e148e8252377a554f256efcb82be955
-
- /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */
- .octa 0x9c25531d19e65ddeec1631edb2dea967
-
- /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */
- .octa 0x790606ff9957c0a65d27e147510ac59a
-
- /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */
- .octa 0x82f63b786ea2d55ca66805eb18b8ea18
-
-
-.barrett_constants:
- /* 33 bit reflected Barrett constant m - (4^32)/n */
- .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */
- /* 33 bit reflected Barrett constant n */
- .octa 0x00000000000000000000000105ec76f1
-
-#define CRC_FUNCTION_NAME __crc32c_vpmsum
-#define REFLECT
-#include "crc32-vpmsum_core.S"
diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c b/arch/powerpc/crypto/crc32c-vpmsum_glue.c
deleted file mode 100644
index 63760b7dbb76..000000000000
--- a/arch/powerpc/crypto/crc32c-vpmsum_glue.c
+++ /dev/null
@@ -1,173 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/crc32.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/cpufeature.h>
-#include <asm/simd.h>
-#include <asm/switch_to.h>
-
-#define CHKSUM_BLOCK_SIZE 1
-#define CHKSUM_DIGEST_SIZE 4
-
-#define VMX_ALIGN 16
-#define VMX_ALIGN_MASK (VMX_ALIGN-1)
-
-#define VECTOR_BREAKPOINT 512
-
-u32 __crc32c_vpmsum(u32 crc, unsigned char const *p, size_t len);
-
-static u32 crc32c_vpmsum(u32 crc, unsigned char const *p, size_t len)
-{
- unsigned int prealign;
- unsigned int tail;
-
- if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || !crypto_simd_usable())
- return __crc32c_le(crc, p, len);
-
- if ((unsigned long)p & VMX_ALIGN_MASK) {
- prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
- crc = __crc32c_le(crc, p, prealign);
- len -= prealign;
- p += prealign;
- }
-
- if (len & ~VMX_ALIGN_MASK) {
- preempt_disable();
- pagefault_disable();
- enable_kernel_altivec();
- crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
- disable_kernel_altivec();
- pagefault_enable();
- preempt_enable();
- }
-
- tail = len & VMX_ALIGN_MASK;
- if (tail) {
- p += len & ~VMX_ALIGN_MASK;
- crc = __crc32c_le(crc, p, tail);
- }
-
- return crc;
-}
-
-static int crc32c_vpmsum_cra_init(struct crypto_tfm *tfm)
-{
- u32 *key = crypto_tfm_ctx(tfm);
-
- *key = ~0;
-
- return 0;
-}
-
-/*
- * Setting the seed allows arbitrary accumulators and a flexible XOR policy.
- * If your algorithm starts with ~0, then XOR with ~0 before you set
- * the seed.
- */
-static int crc32c_vpmsum_setkey(struct crypto_shash *hash, const u8 *key,
- unsigned int keylen)
-{
- u32 *mctx = crypto_shash_ctx(hash);
-
- if (keylen != sizeof(u32))
- return -EINVAL;
- *mctx = le32_to_cpup((__le32 *)key);
- return 0;
-}
-
-static int crc32c_vpmsum_init(struct shash_desc *desc)
-{
- u32 *mctx = crypto_shash_ctx(desc->tfm);
- u32 *crcp = shash_desc_ctx(desc);
-
- *crcp = *mctx;
-
- return 0;
-}
-
-static int crc32c_vpmsum_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- u32 *crcp = shash_desc_ctx(desc);
-
- *crcp = crc32c_vpmsum(*crcp, data, len);
-
- return 0;
-}
-
-static int __crc32c_vpmsum_finup(u32 *crcp, const u8 *data, unsigned int len,
- u8 *out)
-{
- *(__le32 *)out = ~cpu_to_le32(crc32c_vpmsum(*crcp, data, len));
-
- return 0;
-}
-
-static int crc32c_vpmsum_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return __crc32c_vpmsum_finup(shash_desc_ctx(desc), data, len, out);
-}
-
-static int crc32c_vpmsum_final(struct shash_desc *desc, u8 *out)
-{
- u32 *crcp = shash_desc_ctx(desc);
-
- *(__le32 *)out = ~cpu_to_le32p(crcp);
-
- return 0;
-}
-
-static int crc32c_vpmsum_digest(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return __crc32c_vpmsum_finup(crypto_shash_ctx(desc->tfm), data, len,
- out);
-}
-
-static struct shash_alg alg = {
- .setkey = crc32c_vpmsum_setkey,
- .init = crc32c_vpmsum_init,
- .update = crc32c_vpmsum_update,
- .final = crc32c_vpmsum_final,
- .finup = crc32c_vpmsum_finup,
- .digest = crc32c_vpmsum_digest,
- .descsize = sizeof(u32),
- .digestsize = CHKSUM_DIGEST_SIZE,
- .base = {
- .cra_name = "crc32c",
- .cra_driver_name = "crc32c-vpmsum",
- .cra_priority = 200,
- .cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
- .cra_blocksize = CHKSUM_BLOCK_SIZE,
- .cra_ctxsize = sizeof(u32),
- .cra_module = THIS_MODULE,
- .cra_init = crc32c_vpmsum_cra_init,
- }
-};
-
-static int __init crc32c_vpmsum_mod_init(void)
-{
- if (!cpu_has_feature(CPU_FTR_ARCH_207S))
- return -ENODEV;
-
- return crypto_register_shash(&alg);
-}
-
-static void __exit crc32c_vpmsum_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_cpu_feature_match(PPC_MODULE_FEATURE_VEC_CRYPTO, crc32c_vpmsum_mod_init);
-module_exit(crc32c_vpmsum_mod_fini);
-
-MODULE_AUTHOR("Anton Blanchard <anton@samba.org>");
-MODULE_DESCRIPTION("CRC32C using vector polynomial multiply-sum instructions");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_CRYPTO("crc32c");
-MODULE_ALIAS_CRYPTO("crc32c-vpmsum");
diff --git a/arch/powerpc/crypto/crct10dif-vpmsum_asm.S b/arch/powerpc/crypto/crct10dif-vpmsum_asm.S
deleted file mode 100644
index f0b93a0fe168..000000000000
--- a/arch/powerpc/crypto/crct10dif-vpmsum_asm.S
+++ /dev/null
@@ -1,845 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Calculate a CRC T10DIF with vpmsum acceleration
- *
- * Constants generated by crc32-vpmsum, available at
- * https://github.com/antonblanchard/crc32-vpmsum
- *
- * crc32-vpmsum is
- * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
- */
- .section .rodata
-.balign 16
-
-.byteswap_constant:
- /* byte reverse permute constant */
- .octa 0x0F0E0D0C0B0A09080706050403020100
-
-.constants:
-
-	/* Reduce 262144 bits to 1024 bits */
- /* x^261184 mod p(x), x^261120 mod p(x) */
- .octa 0x0000000056d300000000000052550000
-
- /* x^260160 mod p(x), x^260096 mod p(x) */
- .octa 0x00000000ee67000000000000a1e40000
-
- /* x^259136 mod p(x), x^259072 mod p(x) */
- .octa 0x0000000060830000000000004ad10000
-
- /* x^258112 mod p(x), x^258048 mod p(x) */
- .octa 0x000000008cfe0000000000009ab40000
-
- /* x^257088 mod p(x), x^257024 mod p(x) */
- .octa 0x000000003e93000000000000fdb50000
-
- /* x^256064 mod p(x), x^256000 mod p(x) */
- .octa 0x000000003c2000000000000045480000
-
- /* x^255040 mod p(x), x^254976 mod p(x) */
- .octa 0x00000000b1fc0000000000008d690000
-
- /* x^254016 mod p(x), x^253952 mod p(x) */
- .octa 0x00000000f82b00000000000024ad0000
-
- /* x^252992 mod p(x), x^252928 mod p(x) */
- .octa 0x0000000044420000000000009f1a0000
-
- /* x^251968 mod p(x), x^251904 mod p(x) */
- .octa 0x00000000e88c00000000000066ec0000
-
- /* x^250944 mod p(x), x^250880 mod p(x) */
- .octa 0x00000000385c000000000000c87d0000
-
- /* x^249920 mod p(x), x^249856 mod p(x) */
- .octa 0x000000003227000000000000c8ff0000
-
- /* x^248896 mod p(x), x^248832 mod p(x) */
- .octa 0x00000000a9a900000000000033440000
-
- /* x^247872 mod p(x), x^247808 mod p(x) */
- .octa 0x00000000abaa00000000000066eb0000
-
- /* x^246848 mod p(x), x^246784 mod p(x) */
- .octa 0x000000001ac3000000000000c4ef0000
-
- /* x^245824 mod p(x), x^245760 mod p(x) */
- .octa 0x0000000063f000000000000056f30000
-
- /* x^244800 mod p(x), x^244736 mod p(x) */
- .octa 0x0000000032cc00000000000002050000
-
- /* x^243776 mod p(x), x^243712 mod p(x) */
- .octa 0x00000000f8b5000000000000568e0000
-
- /* x^242752 mod p(x), x^242688 mod p(x) */
- .octa 0x000000008db100000000000064290000
-
- /* x^241728 mod p(x), x^241664 mod p(x) */
- .octa 0x0000000059ca0000000000006b660000
-
- /* x^240704 mod p(x), x^240640 mod p(x) */
- .octa 0x000000005f5c00000000000018f80000
-
- /* x^239680 mod p(x), x^239616 mod p(x) */
- .octa 0x0000000061af000000000000b6090000
-
- /* x^238656 mod p(x), x^238592 mod p(x) */
- .octa 0x00000000e29e000000000000099a0000
-
- /* x^237632 mod p(x), x^237568 mod p(x) */
- .octa 0x000000000975000000000000a8360000
-
- /* x^236608 mod p(x), x^236544 mod p(x) */
- .octa 0x0000000043900000000000004f570000
-
- /* x^235584 mod p(x), x^235520 mod p(x) */
- .octa 0x00000000f9cd000000000000134c0000
-
- /* x^234560 mod p(x), x^234496 mod p(x) */
- .octa 0x000000007c29000000000000ec380000
-
- /* x^233536 mod p(x), x^233472 mod p(x) */
- .octa 0x000000004c6a000000000000b0d10000
-
- /* x^232512 mod p(x), x^232448 mod p(x) */
- .octa 0x00000000e7290000000000007d3e0000
-
- /* x^231488 mod p(x), x^231424 mod p(x) */
- .octa 0x00000000f1ab000000000000f0b20000
-
- /* x^230464 mod p(x), x^230400 mod p(x) */
- .octa 0x0000000039db0000000000009c270000
-
- /* x^229440 mod p(x), x^229376 mod p(x) */
- .octa 0x000000005e2800000000000092890000
-
- /* x^228416 mod p(x), x^228352 mod p(x) */
- .octa 0x00000000d44e000000000000d5ee0000
-
- /* x^227392 mod p(x), x^227328 mod p(x) */
- .octa 0x00000000cd0a00000000000041f50000
-
- /* x^226368 mod p(x), x^226304 mod p(x) */
- .octa 0x00000000c5b400000000000010520000
-
- /* x^225344 mod p(x), x^225280 mod p(x) */
- .octa 0x00000000fd2100000000000042170000
-
- /* x^224320 mod p(x), x^224256 mod p(x) */
- .octa 0x000000002f2500000000000095c20000
-
- /* x^223296 mod p(x), x^223232 mod p(x) */
- .octa 0x000000001b0100000000000001ce0000
-
- /* x^222272 mod p(x), x^222208 mod p(x) */
- .octa 0x000000000d430000000000002aca0000
-
- /* x^221248 mod p(x), x^221184 mod p(x) */
- .octa 0x0000000030a6000000000000385e0000
-
- /* x^220224 mod p(x), x^220160 mod p(x) */
- .octa 0x00000000e37b0000000000006f7a0000
-
- /* x^219200 mod p(x), x^219136 mod p(x) */
- .octa 0x00000000873600000000000024320000
-
- /* x^218176 mod p(x), x^218112 mod p(x) */
- .octa 0x00000000e9fb000000000000bd9c0000
-
- /* x^217152 mod p(x), x^217088 mod p(x) */
- .octa 0x000000003b9500000000000054bc0000
-
- /* x^216128 mod p(x), x^216064 mod p(x) */
- .octa 0x00000000133e000000000000a4660000
-
- /* x^215104 mod p(x), x^215040 mod p(x) */
- .octa 0x00000000784500000000000079930000
-
- /* x^214080 mod p(x), x^214016 mod p(x) */
- .octa 0x00000000b9800000000000001bb80000
-
- /* x^213056 mod p(x), x^212992 mod p(x) */
- .octa 0x00000000687600000000000024400000
-
- /* x^212032 mod p(x), x^211968 mod p(x) */
- .octa 0x00000000aff300000000000029e10000
-
- /* x^211008 mod p(x), x^210944 mod p(x) */
- .octa 0x0000000024b50000000000005ded0000
-
- /* x^209984 mod p(x), x^209920 mod p(x) */
- .octa 0x0000000017e8000000000000b12e0000
-
- /* x^208960 mod p(x), x^208896 mod p(x) */
- .octa 0x00000000128400000000000026d20000
-
- /* x^207936 mod p(x), x^207872 mod p(x) */
- .octa 0x000000002115000000000000a32a0000
-
- /* x^206912 mod p(x), x^206848 mod p(x) */
- .octa 0x000000009595000000000000a1210000
-
- /* x^205888 mod p(x), x^205824 mod p(x) */
- .octa 0x00000000281e000000000000ee8b0000
-
- /* x^204864 mod p(x), x^204800 mod p(x) */
- .octa 0x0000000006010000000000003d0d0000
-
- /* x^203840 mod p(x), x^203776 mod p(x) */
- .octa 0x00000000e2b600000000000034e90000
-
- /* x^202816 mod p(x), x^202752 mod p(x) */
- .octa 0x000000001bd40000000000004cdb0000
-
- /* x^201792 mod p(x), x^201728 mod p(x) */
- .octa 0x00000000df2800000000000030e90000
-
- /* x^200768 mod p(x), x^200704 mod p(x) */
- .octa 0x0000000049c200000000000042590000
-
- /* x^199744 mod p(x), x^199680 mod p(x) */
- .octa 0x000000009b97000000000000df950000
-
- /* x^198720 mod p(x), x^198656 mod p(x) */
- .octa 0x000000006184000000000000da7b0000
-
- /* x^197696 mod p(x), x^197632 mod p(x) */
- .octa 0x00000000461700000000000012510000
-
- /* x^196672 mod p(x), x^196608 mod p(x) */
- .octa 0x000000009b40000000000000f37e0000
-
- /* x^195648 mod p(x), x^195584 mod p(x) */
- .octa 0x00000000eeb2000000000000ecf10000
-
- /* x^194624 mod p(x), x^194560 mod p(x) */
- .octa 0x00000000b2e800000000000050f20000
-
- /* x^193600 mod p(x), x^193536 mod p(x) */
- .octa 0x00000000f59a000000000000e0b30000
-
- /* x^192576 mod p(x), x^192512 mod p(x) */
- .octa 0x00000000467f0000000000004d5a0000
-
- /* x^191552 mod p(x), x^191488 mod p(x) */
- .octa 0x00000000da92000000000000bb010000
-
- /* x^190528 mod p(x), x^190464 mod p(x) */
- .octa 0x000000001e1000000000000022a40000
-
- /* x^189504 mod p(x), x^189440 mod p(x) */
- .octa 0x0000000058fe000000000000836f0000
-
- /* x^188480 mod p(x), x^188416 mod p(x) */
- .octa 0x00000000b9ce000000000000d78d0000
-
- /* x^187456 mod p(x), x^187392 mod p(x) */
- .octa 0x0000000022210000000000004f8d0000
-
- /* x^186432 mod p(x), x^186368 mod p(x) */
- .octa 0x00000000744600000000000033760000
-
- /* x^185408 mod p(x), x^185344 mod p(x) */
- .octa 0x000000001c2e000000000000a1e50000
-
- /* x^184384 mod p(x), x^184320 mod p(x) */
- .octa 0x00000000dcc8000000000000a1a40000
-
- /* x^183360 mod p(x), x^183296 mod p(x) */
- .octa 0x00000000910f00000000000019a20000
-
- /* x^182336 mod p(x), x^182272 mod p(x) */
- .octa 0x0000000055d5000000000000f6ae0000
-
- /* x^181312 mod p(x), x^181248 mod p(x) */
- .octa 0x00000000c8ba000000000000a7ac0000
-
- /* x^180288 mod p(x), x^180224 mod p(x) */
- .octa 0x0000000031f8000000000000eea20000
-
- /* x^179264 mod p(x), x^179200 mod p(x) */
- .octa 0x000000001966000000000000c4d90000
-
- /* x^178240 mod p(x), x^178176 mod p(x) */
- .octa 0x00000000b9810000000000002b470000
-
- /* x^177216 mod p(x), x^177152 mod p(x) */
- .octa 0x000000008303000000000000f7cf0000
-
- /* x^176192 mod p(x), x^176128 mod p(x) */
- .octa 0x000000002ce500000000000035b30000
-
- /* x^175168 mod p(x), x^175104 mod p(x) */
- .octa 0x000000002fae0000000000000c7c0000
-
- /* x^174144 mod p(x), x^174080 mod p(x) */
- .octa 0x00000000f50c0000000000009edf0000
-
- /* x^173120 mod p(x), x^173056 mod p(x) */
- .octa 0x00000000714f00000000000004cd0000
-
- /* x^172096 mod p(x), x^172032 mod p(x) */
- .octa 0x00000000c161000000000000541b0000
-
- /* x^171072 mod p(x), x^171008 mod p(x) */
- .octa 0x0000000021c8000000000000e2700000
-
- /* x^170048 mod p(x), x^169984 mod p(x) */
- .octa 0x00000000b93d00000000000009a60000
-
- /* x^169024 mod p(x), x^168960 mod p(x) */
- .octa 0x00000000fbcf000000000000761c0000
-
- /* x^168000 mod p(x), x^167936 mod p(x) */
- .octa 0x0000000026350000000000009db30000
-
- /* x^166976 mod p(x), x^166912 mod p(x) */
- .octa 0x00000000b64f0000000000003e9f0000
-
- /* x^165952 mod p(x), x^165888 mod p(x) */
- .octa 0x00000000bd0e00000000000078590000
-
- /* x^164928 mod p(x), x^164864 mod p(x) */
- .octa 0x00000000d9360000000000008bc80000
-
- /* x^163904 mod p(x), x^163840 mod p(x) */
- .octa 0x000000002f140000000000008c9f0000
-
- /* x^162880 mod p(x), x^162816 mod p(x) */
- .octa 0x000000006a270000000000006af70000
-
- /* x^161856 mod p(x), x^161792 mod p(x) */
- .octa 0x000000006685000000000000e5210000
-
- /* x^160832 mod p(x), x^160768 mod p(x) */
- .octa 0x0000000062da00000000000008290000
-
- /* x^159808 mod p(x), x^159744 mod p(x) */
- .octa 0x00000000bb4b000000000000e4d00000
-
- /* x^158784 mod p(x), x^158720 mod p(x) */
- .octa 0x00000000d2490000000000004ae10000
-
- /* x^157760 mod p(x), x^157696 mod p(x) */
- .octa 0x00000000c85b00000000000000e70000
-
- /* x^156736 mod p(x), x^156672 mod p(x) */
- .octa 0x00000000c37a00000000000015650000
-
- /* x^155712 mod p(x), x^155648 mod p(x) */
- .octa 0x0000000018530000000000001c2f0000
-
- /* x^154688 mod p(x), x^154624 mod p(x) */
- .octa 0x00000000b46600000000000037bd0000
-
- /* x^153664 mod p(x), x^153600 mod p(x) */
- .octa 0x00000000439b00000000000012190000
-
- /* x^152640 mod p(x), x^152576 mod p(x) */
- .octa 0x00000000b1260000000000005ece0000
-
- /* x^151616 mod p(x), x^151552 mod p(x) */
- .octa 0x00000000d8110000000000002a5e0000
-
- /* x^150592 mod p(x), x^150528 mod p(x) */
- .octa 0x00000000099f00000000000052330000
-
- /* x^149568 mod p(x), x^149504 mod p(x) */
- .octa 0x00000000f9f9000000000000f9120000
-
- /* x^148544 mod p(x), x^148480 mod p(x) */
- .octa 0x000000005cc00000000000000ddc0000
-
- /* x^147520 mod p(x), x^147456 mod p(x) */
- .octa 0x00000000343b00000000000012200000
-
- /* x^146496 mod p(x), x^146432 mod p(x) */
- .octa 0x000000009222000000000000d12b0000
-
- /* x^145472 mod p(x), x^145408 mod p(x) */
- .octa 0x00000000d781000000000000eb2d0000
-
- /* x^144448 mod p(x), x^144384 mod p(x) */
- .octa 0x000000000bf400000000000058970000
-
- /* x^143424 mod p(x), x^143360 mod p(x) */
- .octa 0x00000000094200000000000013690000
-
- /* x^142400 mod p(x), x^142336 mod p(x) */
- .octa 0x00000000d55100000000000051950000
-
- /* x^141376 mod p(x), x^141312 mod p(x) */
- .octa 0x000000008f11000000000000954b0000
-
- /* x^140352 mod p(x), x^140288 mod p(x) */
- .octa 0x00000000140f000000000000b29e0000
-
- /* x^139328 mod p(x), x^139264 mod p(x) */
- .octa 0x00000000c6db000000000000db5d0000
-
- /* x^138304 mod p(x), x^138240 mod p(x) */
- .octa 0x00000000715b000000000000dfaf0000
-
- /* x^137280 mod p(x), x^137216 mod p(x) */
- .octa 0x000000000dea000000000000e3b60000
-
- /* x^136256 mod p(x), x^136192 mod p(x) */
- .octa 0x000000006f94000000000000ddaf0000
-
- /* x^135232 mod p(x), x^135168 mod p(x) */
- .octa 0x0000000024e1000000000000e4f70000
-
- /* x^134208 mod p(x), x^134144 mod p(x) */
- .octa 0x000000008810000000000000aa110000
-
- /* x^133184 mod p(x), x^133120 mod p(x) */
- .octa 0x0000000030c2000000000000a8e60000
-
- /* x^132160 mod p(x), x^132096 mod p(x) */
- .octa 0x00000000e6d0000000000000ccf30000
-
- /* x^131136 mod p(x), x^131072 mod p(x) */
- .octa 0x000000004da000000000000079bf0000
-
- /* x^130112 mod p(x), x^130048 mod p(x) */
- .octa 0x000000007759000000000000b3a30000
-
- /* x^129088 mod p(x), x^129024 mod p(x) */
- .octa 0x00000000597400000000000028790000
-
- /* x^128064 mod p(x), x^128000 mod p(x) */
- .octa 0x000000007acd000000000000b5820000
-
- /* x^127040 mod p(x), x^126976 mod p(x) */
- .octa 0x00000000e6e400000000000026ad0000
-
- /* x^126016 mod p(x), x^125952 mod p(x) */
- .octa 0x000000006d49000000000000985b0000
-
- /* x^124992 mod p(x), x^124928 mod p(x) */
- .octa 0x000000000f0800000000000011520000
-
- /* x^123968 mod p(x), x^123904 mod p(x) */
- .octa 0x000000002c7f000000000000846c0000
-
- /* x^122944 mod p(x), x^122880 mod p(x) */
- .octa 0x000000005ce7000000000000ae1d0000
-
- /* x^121920 mod p(x), x^121856 mod p(x) */
- .octa 0x00000000d4cb000000000000e21d0000
-
- /* x^120896 mod p(x), x^120832 mod p(x) */
- .octa 0x000000003a2300000000000019bb0000
-
- /* x^119872 mod p(x), x^119808 mod p(x) */
- .octa 0x000000000e1700000000000095290000
-
- /* x^118848 mod p(x), x^118784 mod p(x) */
- .octa 0x000000006e6400000000000050d20000
-
- /* x^117824 mod p(x), x^117760 mod p(x) */
- .octa 0x000000008d5c0000000000000cd10000
-
- /* x^116800 mod p(x), x^116736 mod p(x) */
- .octa 0x00000000ef310000000000007b570000
-
- /* x^115776 mod p(x), x^115712 mod p(x) */
- .octa 0x00000000645d00000000000053d60000
-
- /* x^114752 mod p(x), x^114688 mod p(x) */
- .octa 0x0000000018fc00000000000077510000
-
- /* x^113728 mod p(x), x^113664 mod p(x) */
- .octa 0x000000000cb3000000000000a7b70000
-
- /* x^112704 mod p(x), x^112640 mod p(x) */
- .octa 0x00000000991b000000000000d0780000
-
- /* x^111680 mod p(x), x^111616 mod p(x) */
- .octa 0x00000000845a000000000000be3c0000
-
- /* x^110656 mod p(x), x^110592 mod p(x) */
- .octa 0x00000000d3a9000000000000df020000
-
- /* x^109632 mod p(x), x^109568 mod p(x) */
- .octa 0x0000000017d7000000000000063e0000
-
- /* x^108608 mod p(x), x^108544 mod p(x) */
- .octa 0x000000007a860000000000008ab40000
-
- /* x^107584 mod p(x), x^107520 mod p(x) */
- .octa 0x00000000fd7c000000000000c7bd0000
-
- /* x^106560 mod p(x), x^106496 mod p(x) */
- .octa 0x00000000a56b000000000000efd60000
-
- /* x^105536 mod p(x), x^105472 mod p(x) */
- .octa 0x0000000010e400000000000071380000
-
- /* x^104512 mod p(x), x^104448 mod p(x) */
- .octa 0x00000000994500000000000004d30000
-
- /* x^103488 mod p(x), x^103424 mod p(x) */
- .octa 0x00000000b83c0000000000003b0e0000
-
- /* x^102464 mod p(x), x^102400 mod p(x) */
- .octa 0x00000000d6c10000000000008b020000
-
- /* x^101440 mod p(x), x^101376 mod p(x) */
- .octa 0x000000009efc000000000000da940000
-
- /* x^100416 mod p(x), x^100352 mod p(x) */
- .octa 0x000000005e87000000000000f9f70000
-
- /* x^99392 mod p(x), x^99328 mod p(x) */
- .octa 0x000000006c9b00000000000045e40000
-
- /* x^98368 mod p(x), x^98304 mod p(x) */
- .octa 0x00000000178a00000000000083940000
-
- /* x^97344 mod p(x), x^97280 mod p(x) */
- .octa 0x00000000f0c8000000000000f0a00000
-
- /* x^96320 mod p(x), x^96256 mod p(x) */
- .octa 0x00000000f699000000000000b74b0000
-
- /* x^95296 mod p(x), x^95232 mod p(x) */
- .octa 0x00000000316d000000000000c1cf0000
-
- /* x^94272 mod p(x), x^94208 mod p(x) */
- .octa 0x00000000987e00000000000072680000
-
- /* x^93248 mod p(x), x^93184 mod p(x) */
- .octa 0x00000000acff000000000000e0ab0000
-
- /* x^92224 mod p(x), x^92160 mod p(x) */
- .octa 0x00000000a1f6000000000000c5a80000
-
- /* x^91200 mod p(x), x^91136 mod p(x) */
- .octa 0x0000000061bd000000000000cf690000
-
- /* x^90176 mod p(x), x^90112 mod p(x) */
- .octa 0x00000000c9f2000000000000cbcc0000
-
- /* x^89152 mod p(x), x^89088 mod p(x) */
- .octa 0x000000005a33000000000000de050000
-
- /* x^88128 mod p(x), x^88064 mod p(x) */
- .octa 0x00000000e416000000000000ccd70000
-
- /* x^87104 mod p(x), x^87040 mod p(x) */
- .octa 0x0000000058930000000000002f670000
-
- /* x^86080 mod p(x), x^86016 mod p(x) */
- .octa 0x00000000a9d3000000000000152f0000
-
- /* x^85056 mod p(x), x^84992 mod p(x) */
- .octa 0x00000000c114000000000000ecc20000
-
- /* x^84032 mod p(x), x^83968 mod p(x) */
- .octa 0x00000000b9270000000000007c890000
-
- /* x^83008 mod p(x), x^82944 mod p(x) */
- .octa 0x000000002e6000000000000006ee0000
-
- /* x^81984 mod p(x), x^81920 mod p(x) */
- .octa 0x00000000dfc600000000000009100000
-
- /* x^80960 mod p(x), x^80896 mod p(x) */
- .octa 0x000000004911000000000000ad4e0000
-
- /* x^79936 mod p(x), x^79872 mod p(x) */
- .octa 0x00000000ae1b000000000000b04d0000
-
- /* x^78912 mod p(x), x^78848 mod p(x) */
- .octa 0x0000000005fa000000000000e9900000
-
- /* x^77888 mod p(x), x^77824 mod p(x) */
- .octa 0x0000000004a1000000000000cc6f0000
-
- /* x^76864 mod p(x), x^76800 mod p(x) */
- .octa 0x00000000af73000000000000ed110000
-
- /* x^75840 mod p(x), x^75776 mod p(x) */
- .octa 0x0000000082530000000000008f7e0000
-
- /* x^74816 mod p(x), x^74752 mod p(x) */
- .octa 0x00000000cfdc000000000000594f0000
-
- /* x^73792 mod p(x), x^73728 mod p(x) */
- .octa 0x00000000a6b6000000000000a8750000
-
- /* x^72768 mod p(x), x^72704 mod p(x) */
- .octa 0x00000000fd76000000000000aa0c0000
-
- /* x^71744 mod p(x), x^71680 mod p(x) */
- .octa 0x0000000006f500000000000071db0000
-
- /* x^70720 mod p(x), x^70656 mod p(x) */
- .octa 0x0000000037ca000000000000ab0c0000
-
- /* x^69696 mod p(x), x^69632 mod p(x) */
- .octa 0x00000000d7ab000000000000b7a00000
-
- /* x^68672 mod p(x), x^68608 mod p(x) */
- .octa 0x00000000440800000000000090d30000
-
- /* x^67648 mod p(x), x^67584 mod p(x) */
- .octa 0x00000000186100000000000054730000
-
- /* x^66624 mod p(x), x^66560 mod p(x) */
- .octa 0x000000007368000000000000a3a20000
-
- /* x^65600 mod p(x), x^65536 mod p(x) */
- .octa 0x0000000026d0000000000000f9040000
-
- /* x^64576 mod p(x), x^64512 mod p(x) */
- .octa 0x00000000fe770000000000009c0a0000
-
- /* x^63552 mod p(x), x^63488 mod p(x) */
- .octa 0x000000002cba000000000000d1e70000
-
- /* x^62528 mod p(x), x^62464 mod p(x) */
- .octa 0x00000000f8bd0000000000005ac10000
-
- /* x^61504 mod p(x), x^61440 mod p(x) */
- .octa 0x000000007372000000000000d68d0000
-
- /* x^60480 mod p(x), x^60416 mod p(x) */
- .octa 0x00000000f37f00000000000089f60000
-
- /* x^59456 mod p(x), x^59392 mod p(x) */
- .octa 0x00000000078400000000000008a90000
-
- /* x^58432 mod p(x), x^58368 mod p(x) */
- .octa 0x00000000d3e400000000000042360000
-
- /* x^57408 mod p(x), x^57344 mod p(x) */
- .octa 0x00000000eba800000000000092d50000
-
- /* x^56384 mod p(x), x^56320 mod p(x) */
- .octa 0x00000000afbe000000000000b4d50000
-
- /* x^55360 mod p(x), x^55296 mod p(x) */
- .octa 0x00000000d8ca000000000000c9060000
-
- /* x^54336 mod p(x), x^54272 mod p(x) */
- .octa 0x00000000c2d00000000000008f4f0000
-
- /* x^53312 mod p(x), x^53248 mod p(x) */
- .octa 0x00000000373200000000000028690000
-
- /* x^52288 mod p(x), x^52224 mod p(x) */
- .octa 0x0000000046ae000000000000c3b30000
-
- /* x^51264 mod p(x), x^51200 mod p(x) */
- .octa 0x00000000b243000000000000f8700000
-
- /* x^50240 mod p(x), x^50176 mod p(x) */
- .octa 0x00000000f7f500000000000029eb0000
-
- /* x^49216 mod p(x), x^49152 mod p(x) */
- .octa 0x000000000c7e000000000000fe730000
-
- /* x^48192 mod p(x), x^48128 mod p(x) */
- .octa 0x00000000c38200000000000096000000
-
- /* x^47168 mod p(x), x^47104 mod p(x) */
- .octa 0x000000008956000000000000683c0000
-
- /* x^46144 mod p(x), x^46080 mod p(x) */
- .octa 0x00000000422d0000000000005f1e0000
-
- /* x^45120 mod p(x), x^45056 mod p(x) */
- .octa 0x00000000ac0f0000000000006f810000
-
- /* x^44096 mod p(x), x^44032 mod p(x) */
- .octa 0x00000000ce30000000000000031f0000
-
- /* x^43072 mod p(x), x^43008 mod p(x) */
- .octa 0x000000003d43000000000000455a0000
-
- /* x^42048 mod p(x), x^41984 mod p(x) */
- .octa 0x000000007ebe000000000000a6050000
-
- /* x^41024 mod p(x), x^40960 mod p(x) */
- .octa 0x00000000976e00000000000077eb0000
-
- /* x^40000 mod p(x), x^39936 mod p(x) */
- .octa 0x000000000872000000000000389c0000
-
- /* x^38976 mod p(x), x^38912 mod p(x) */
- .octa 0x000000008979000000000000c7b20000
-
- /* x^37952 mod p(x), x^37888 mod p(x) */
- .octa 0x000000005c1e0000000000001d870000
-
- /* x^36928 mod p(x), x^36864 mod p(x) */
- .octa 0x00000000aebb00000000000045810000
-
- /* x^35904 mod p(x), x^35840 mod p(x) */
- .octa 0x000000004f7e0000000000006d4a0000
-
- /* x^34880 mod p(x), x^34816 mod p(x) */
- .octa 0x00000000ea98000000000000b9200000
-
- /* x^33856 mod p(x), x^33792 mod p(x) */
- .octa 0x00000000f39600000000000022f20000
-
- /* x^32832 mod p(x), x^32768 mod p(x) */
- .octa 0x000000000bc500000000000041ca0000
-
- /* x^31808 mod p(x), x^31744 mod p(x) */
- .octa 0x00000000786400000000000078500000
-
- /* x^30784 mod p(x), x^30720 mod p(x) */
- .octa 0x00000000be970000000000009e7e0000
-
- /* x^29760 mod p(x), x^29696 mod p(x) */
- .octa 0x00000000dd6d000000000000a53c0000
-
- /* x^28736 mod p(x), x^28672 mod p(x) */
- .octa 0x000000004c3f00000000000039340000
-
- /* x^27712 mod p(x), x^27648 mod p(x) */
- .octa 0x0000000093a4000000000000b58e0000
-
- /* x^26688 mod p(x), x^26624 mod p(x) */
- .octa 0x0000000050fb00000000000062d40000
-
- /* x^25664 mod p(x), x^25600 mod p(x) */
- .octa 0x00000000f505000000000000a26f0000
-
- /* x^24640 mod p(x), x^24576 mod p(x) */
- .octa 0x0000000064f900000000000065e60000
-
- /* x^23616 mod p(x), x^23552 mod p(x) */
- .octa 0x00000000e8c2000000000000aad90000
-
- /* x^22592 mod p(x), x^22528 mod p(x) */
- .octa 0x00000000720b000000000000a3b00000
-
- /* x^21568 mod p(x), x^21504 mod p(x) */
- .octa 0x00000000e992000000000000d2680000
-
- /* x^20544 mod p(x), x^20480 mod p(x) */
- .octa 0x000000009132000000000000cf4c0000
-
- /* x^19520 mod p(x), x^19456 mod p(x) */
- .octa 0x00000000608a00000000000076610000
-
- /* x^18496 mod p(x), x^18432 mod p(x) */
- .octa 0x000000009948000000000000fb9f0000
-
- /* x^17472 mod p(x), x^17408 mod p(x) */
- .octa 0x00000000173000000000000003770000
-
- /* x^16448 mod p(x), x^16384 mod p(x) */
- .octa 0x000000006fe300000000000004880000
-
- /* x^15424 mod p(x), x^15360 mod p(x) */
- .octa 0x00000000e15300000000000056a70000
-
- /* x^14400 mod p(x), x^14336 mod p(x) */
- .octa 0x0000000092d60000000000009dfd0000
-
- /* x^13376 mod p(x), x^13312 mod p(x) */
- .octa 0x0000000002fd00000000000074c80000
-
- /* x^12352 mod p(x), x^12288 mod p(x) */
- .octa 0x00000000c78b000000000000a3ec0000
-
- /* x^11328 mod p(x), x^11264 mod p(x) */
- .octa 0x000000009262000000000000b3530000
-
- /* x^10304 mod p(x), x^10240 mod p(x) */
- .octa 0x0000000084f200000000000047bf0000
-
- /* x^9280 mod p(x), x^9216 mod p(x) */
- .octa 0x0000000067ee000000000000e97c0000
-
- /* x^8256 mod p(x), x^8192 mod p(x) */
- .octa 0x00000000535b00000000000091e10000
-
- /* x^7232 mod p(x), x^7168 mod p(x) */
- .octa 0x000000007ebb00000000000055060000
-
- /* x^6208 mod p(x), x^6144 mod p(x) */
- .octa 0x00000000c6a1000000000000fd360000
-
- /* x^5184 mod p(x), x^5120 mod p(x) */
- .octa 0x000000001be500000000000055860000
-
- /* x^4160 mod p(x), x^4096 mod p(x) */
- .octa 0x00000000ae0e0000000000005bd00000
-
- /* x^3136 mod p(x), x^3072 mod p(x) */
- .octa 0x0000000022040000000000008db20000
-
- /* x^2112 mod p(x), x^2048 mod p(x) */
- .octa 0x00000000c9eb000000000000efe20000
-
- /* x^1088 mod p(x), x^1024 mod p(x) */
- .octa 0x0000000039b400000000000051d10000
-
-.short_constants:
-
- /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
- /* x^2048 mod p(x), x^2016 mod p(x), x^1984 mod p(x), x^1952 mod p(x) */
- .octa 0xefe20000dccf00009440000033590000
-
- /* x^1920 mod p(x), x^1888 mod p(x), x^1856 mod p(x), x^1824 mod p(x) */
- .octa 0xee6300002f3f000062180000e0ed0000
-
- /* x^1792 mod p(x), x^1760 mod p(x), x^1728 mod p(x), x^1696 mod p(x) */
- .octa 0xcf5f000017ef0000ccbe000023d30000
-
- /* x^1664 mod p(x), x^1632 mod p(x), x^1600 mod p(x), x^1568 mod p(x) */
- .octa 0x6d0c0000a30e00000920000042630000
-
- /* x^1536 mod p(x), x^1504 mod p(x), x^1472 mod p(x), x^1440 mod p(x) */
- .octa 0x21d30000932b0000a7a00000efcc0000
-
- /* x^1408 mod p(x), x^1376 mod p(x), x^1344 mod p(x), x^1312 mod p(x) */
- .octa 0x10be00000b310000666f00000d1c0000
-
- /* x^1280 mod p(x), x^1248 mod p(x), x^1216 mod p(x), x^1184 mod p(x) */
- .octa 0x1f240000ce9e0000caad0000589e0000
-
- /* x^1152 mod p(x), x^1120 mod p(x), x^1088 mod p(x), x^1056 mod p(x) */
- .octa 0x29610000d02b000039b400007cf50000
-
- /* x^1024 mod p(x), x^992 mod p(x), x^960 mod p(x), x^928 mod p(x) */
- .octa 0x51d100009d9d00003c0e0000bfd60000
-
- /* x^896 mod p(x), x^864 mod p(x), x^832 mod p(x), x^800 mod p(x) */
- .octa 0xda390000ceae000013830000713c0000
-
- /* x^768 mod p(x), x^736 mod p(x), x^704 mod p(x), x^672 mod p(x) */
- .octa 0xb67800001e16000085c0000080a60000
-
- /* x^640 mod p(x), x^608 mod p(x), x^576 mod p(x), x^544 mod p(x) */
- .octa 0x0db40000f7f90000371d0000e6580000
-
- /* x^512 mod p(x), x^480 mod p(x), x^448 mod p(x), x^416 mod p(x) */
- .octa 0x87e70000044c0000aadb0000a4970000
-
- /* x^384 mod p(x), x^352 mod p(x), x^320 mod p(x), x^288 mod p(x) */
- .octa 0x1f990000ad180000d8b30000e7b50000
-
- /* x^256 mod p(x), x^224 mod p(x), x^192 mod p(x), x^160 mod p(x) */
- .octa 0xbe6c00006ee300004c1a000006df0000
-
- /* x^128 mod p(x), x^96 mod p(x), x^64 mod p(x), x^32 mod p(x) */
- .octa 0xfb0b00002d560000136800008bb70000
-
-
-.barrett_constants:
- /* Barrett constant m - (4^32)/n */
- .octa 0x000000000000000000000001f65a57f8 /* x^64 div p(x) */
- /* Barrett constant n */
- .octa 0x0000000000000000000000018bb70000
-
-#define CRC_FUNCTION_NAME __crct10dif_vpmsum
-#include "crc32-vpmsum_core.S"
diff --git a/arch/powerpc/crypto/crct10dif-vpmsum_glue.c b/arch/powerpc/crypto/crct10dif-vpmsum_glue.c
deleted file mode 100644
index 1dc8b6915178..000000000000
--- a/arch/powerpc/crypto/crct10dif-vpmsum_glue.c
+++ /dev/null
@@ -1,126 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Calculate a CRC T10-DIF with vpmsum acceleration
- *
- * Copyright 2017, Daniel Axtens, IBM Corporation.
- * [based on crc32c-vpmsum_glue.c]
- */
-
-#include <linux/crc-t10dif.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/cpufeature.h>
-#include <asm/simd.h>
-#include <asm/switch_to.h>
-
-#define VMX_ALIGN 16
-#define VMX_ALIGN_MASK (VMX_ALIGN-1)
-
-#define VECTOR_BREAKPOINT 64
-
-u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len);
-
-static u16 crct10dif_vpmsum(u16 crci, unsigned char const *p, size_t len)
-{
- unsigned int prealign;
- unsigned int tail;
- u32 crc = crci;
-
- if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || !crypto_simd_usable())
- return crc_t10dif_generic(crc, p, len);
-
- if ((unsigned long)p & VMX_ALIGN_MASK) {
- prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
- crc = crc_t10dif_generic(crc, p, prealign);
- len -= prealign;
- p += prealign;
- }
-
- if (len & ~VMX_ALIGN_MASK) {
- crc <<= 16;
- preempt_disable();
- pagefault_disable();
- enable_kernel_altivec();
- crc = __crct10dif_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
- disable_kernel_altivec();
- pagefault_enable();
- preempt_enable();
- crc >>= 16;
- }
-
- tail = len & VMX_ALIGN_MASK;
- if (tail) {
- p += len & ~VMX_ALIGN_MASK;
- crc = crc_t10dif_generic(crc, p, tail);
- }
-
- return crc & 0xffff;
-}
-
-static int crct10dif_vpmsum_init(struct shash_desc *desc)
-{
- u16 *crc = shash_desc_ctx(desc);
-
- *crc = 0;
- return 0;
-}
-
-static int crct10dif_vpmsum_update(struct shash_desc *desc, const u8 *data,
- unsigned int length)
-{
- u16 *crc = shash_desc_ctx(desc);
-
- *crc = crct10dif_vpmsum(*crc, data, length);
-
- return 0;
-}
-
-
-static int crct10dif_vpmsum_final(struct shash_desc *desc, u8 *out)
-{
- u16 *crcp = shash_desc_ctx(desc);
-
- *(u16 *)out = *crcp;
- return 0;
-}
-
-static struct shash_alg alg = {
- .init = crct10dif_vpmsum_init,
- .update = crct10dif_vpmsum_update,
- .final = crct10dif_vpmsum_final,
- .descsize = CRC_T10DIF_DIGEST_SIZE,
- .digestsize = CRC_T10DIF_DIGEST_SIZE,
- .base = {
- .cra_name = "crct10dif",
- .cra_driver_name = "crct10dif-vpmsum",
- .cra_priority = 200,
- .cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int __init crct10dif_vpmsum_mod_init(void)
-{
- if (!cpu_has_feature(CPU_FTR_ARCH_207S))
- return -ENODEV;
-
- return crypto_register_shash(&alg);
-}
-
-static void __exit crct10dif_vpmsum_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_cpu_feature_match(PPC_MODULE_FEATURE_VEC_CRYPTO, crct10dif_vpmsum_mod_init);
-module_exit(crct10dif_vpmsum_mod_fini);
-
-MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>");
-MODULE_DESCRIPTION("CRCT10DIF using vector polynomial multiply-sum instructions");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_CRYPTO("crct10dif");
-MODULE_ALIAS_CRYPTO("crct10dif-vpmsum");
diff --git a/arch/powerpc/crypto/curve25519-ppc64le-core.c b/arch/powerpc/crypto/curve25519-ppc64le-core.c
new file mode 100644
index 000000000000..f7810be0b292
--- /dev/null
+++ b/arch/powerpc/crypto/curve25519-ppc64le-core.c
@@ -0,0 +1,300 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2024- IBM Corp.
+ *
+ * X25519 scalar multiplication with 51-bit limbs for PPC64le.
+ * Based on RFC7748 and the AArch64 optimized implementation for X25519
+ * - Algorithm 1 Scalar multiplication of a variable point
+ */
+
+#include <crypto/curve25519.h>
+#include <crypto/internal/kpp.h>
+
+#include <linux/types.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+
+#include <linux/cpufeature.h>
+#include <linux/processor.h>
+
+typedef uint64_t fe51[5];
+
+asmlinkage void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
+asmlinkage void x25519_fe51_sqr(fe51 h, const fe51 f);
+asmlinkage void x25519_fe51_mul121666(fe51 h, fe51 f);
+asmlinkage void x25519_fe51_sqr_times(fe51 h, const fe51 f, int n);
+asmlinkage void x25519_fe51_frombytes(fe51 h, const uint8_t *s);
+asmlinkage void x25519_fe51_tobytes(uint8_t *s, const fe51 h);
+asmlinkage void x25519_cswap(fe51 p, fe51 q, unsigned int bit);
+
+#define fmul x25519_fe51_mul
+#define fsqr x25519_fe51_sqr
+#define fmul121666 x25519_fe51_mul121666
+#define fe51_tobytes x25519_fe51_tobytes
+
+static void fadd(fe51 h, const fe51 f, const fe51 g)
+{
+ h[0] = f[0] + g[0];
+ h[1] = f[1] + g[1];
+ h[2] = f[2] + g[2];
+ h[3] = f[3] + g[3];
+ h[4] = f[4] + g[4];
+}
+
+/*
+ * Prime = 2 ** 255 - 19, 255 bits
+ * (0x7fffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffed)
+ *
+ * Prime in 5 51-bit limbs
+ */
+static fe51 prime51 = { 0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff};
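+
+/*
+ * Sanity note (illustrative): the limbs above reassemble to the prime,
+ * since (2^51 - 19) + (2^51 - 1) * (2^51 + 2^102 + 2^153 + 2^204)
+ * telescopes to 2^255 - 19.
+ */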
+
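+/*
+ * fsub() computes h = f - g without borrows: 2*p is added first so every
+ * limb stays non-negative, and the extra multiple of p is absorbed mod p.
+ */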
+static void fsub(fe51 h, const fe51 f, const fe51 g)
+{
+ h[0] = (f[0] + ((prime51[0] * 2))) - g[0];
+ h[1] = (f[1] + ((prime51[1] * 2))) - g[1];
+ h[2] = (f[2] + ((prime51[2] * 2))) - g[2];
+ h[3] = (f[3] + ((prime51[3] * 2))) - g[3];
+ h[4] = (f[4] + ((prime51[4] * 2))) - g[4];
+}
+
+static void fe51_frombytes(fe51 h, const uint8_t *s)
+{
+ /*
+ * Make sure 64-bit aligned.
+ */
+ unsigned char sbuf[32+8];
+ unsigned char *sb = PTR_ALIGN((void *)sbuf, 8);
+
+ memcpy(sb, s, 32);
+ x25519_fe51_frombytes(h, sb);
+}
+
+static void finv(fe51 o, const fe51 i)
+{
+ fe51 a0, b, c, t00;
+
+ fsqr(a0, i);
+ x25519_fe51_sqr_times(t00, a0, 2);
+
+ fmul(b, t00, i);
+ fmul(a0, b, a0);
+
+ fsqr(t00, a0);
+
+ fmul(b, t00, b);
+ x25519_fe51_sqr_times(t00, b, 5);
+
+ fmul(b, t00, b);
+ x25519_fe51_sqr_times(t00, b, 10);
+
+ fmul(c, t00, b);
+ x25519_fe51_sqr_times(t00, c, 20);
+
+ fmul(t00, t00, c);
+ x25519_fe51_sqr_times(t00, t00, 10);
+
+ fmul(b, t00, b);
+ x25519_fe51_sqr_times(t00, b, 50);
+
+ fmul(c, t00, b);
+ x25519_fe51_sqr_times(t00, c, 100);
+
+ fmul(t00, t00, c);
+ x25519_fe51_sqr_times(t00, t00, 50);
+
+ fmul(t00, t00, b);
+ x25519_fe51_sqr_times(t00, t00, 5);
+
+ fmul(o, t00, a0);
+}
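+
+/*
+ * finv() above is Fermat inversion, o = i^(p - 2) = i^(2^255 - 21) (mod p),
+ * using the usual addition chain of 254 squarings and 11 multiplications
+ * shared by most curve25519 implementations.
+ */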
+
+static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32],
+ const uint8_t point[32])
+{
+ fe51 x1, x2, z2, x3, z3;
+ uint8_t s[32];
+ unsigned int swap = 0;
+ int i;
+
+ memcpy(s, scalar, 32);
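+	/* Clamp the scalar per RFC 7748: clear the low three bits (cofactor
+	 * 8), clear the top bit and set bit 254. */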
+ s[0] &= 0xf8;
+ s[31] &= 0x7f;
+ s[31] |= 0x40;
+ fe51_frombytes(x1, point);
+
+ z2[0] = z2[1] = z2[2] = z2[3] = z2[4] = 0;
+ x3[0] = x1[0];
+ x3[1] = x1[1];
+ x3[2] = x1[2];
+ x3[3] = x1[3];
+ x3[4] = x1[4];
+
+ x2[0] = z3[0] = 1;
+ x2[1] = z3[1] = 0;
+ x2[2] = z3[2] = 0;
+ x2[3] = z3[3] = 0;
+ x2[4] = z3[4] = 0;
+
+ for (i = 254; i >= 0; --i) {
+ unsigned int k_t = 1 & (s[i / 8] >> (i & 7));
+ fe51 a, b, c, d, e;
+ fe51 da, cb, aa, bb;
+ fe51 dacb_p, dacb_m;
+
+ swap ^= k_t;
+ x25519_cswap(x2, x3, swap);
+ x25519_cswap(z2, z3, swap);
+ swap = k_t;
+
+ fsub(b, x2, z2); // B = x_2 - z_2
+ fadd(a, x2, z2); // A = x_2 + z_2
+ fsub(d, x3, z3); // D = x_3 - z_3
+ fadd(c, x3, z3); // C = x_3 + z_3
+
+ fsqr(bb, b); // BB = B^2
+ fsqr(aa, a); // AA = A^2
+ fmul(da, d, a); // DA = D * A
+ fmul(cb, c, b); // CB = C * B
+
+ fsub(e, aa, bb); // E = AA - BB
+ fmul(x2, aa, bb); // x2 = AA * BB
+ fadd(dacb_p, da, cb); // DA + CB
+ fsub(dacb_m, da, cb); // DA - CB
+
+ fmul121666(z3, e); // 121666 * E
+ fsqr(z2, dacb_m); // (DA - CB)^2
+ fsqr(x3, dacb_p); // x3 = (DA + CB)^2
+ fadd(b, bb, z3); // BB + 121666 * E
+ fmul(z3, x1, z2); // z3 = x1 * (DA - CB)^2
+ fmul(z2, e, b); // z2 = e * (BB + (DA + CB)^2)
+ }
+
+ finv(z2, z2);
+ fmul(x2, x2, z2);
+ fe51_tobytes(out, x2);
+}
+
+void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE])
+{
+ curve25519_fe51(mypublic, secret, basepoint);
+}
+EXPORT_SYMBOL(curve25519_arch);
+
+void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ curve25519_fe51(pub, secret, curve25519_base_point);
+}
+EXPORT_SYMBOL(curve25519_base_arch);
+
+static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
+ unsigned int len)
+{
+ u8 *secret = kpp_tfm_ctx(tfm);
+
+ if (!len)
+ curve25519_generate_secret(secret);
+ else if (len == CURVE25519_KEY_SIZE &&
+ crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
+ memcpy(secret, buf, CURVE25519_KEY_SIZE);
+ else
+ return -EINVAL;
+ return 0;
+}
+
+static int curve25519_generate_public_key(struct kpp_request *req)
+{
+ struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
+ const u8 *secret = kpp_tfm_ctx(tfm);
+ u8 buf[CURVE25519_KEY_SIZE];
+ int copied, nbytes;
+
+ if (req->src)
+ return -EINVAL;
+
+ curve25519_base_arch(buf, secret);
+
+ /* might want less than we've got */
+ nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
+ copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
+ nbytes),
+ buf, nbytes);
+ if (copied != nbytes)
+ return -EINVAL;
+ return 0;
+}
+
+static int curve25519_compute_shared_secret(struct kpp_request *req)
+{
+ struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
+ const u8 *secret = kpp_tfm_ctx(tfm);
+ u8 public_key[CURVE25519_KEY_SIZE];
+ u8 buf[CURVE25519_KEY_SIZE];
+ int copied, nbytes;
+
+ if (!req->src)
+ return -EINVAL;
+
+ copied = sg_copy_to_buffer(req->src,
+ sg_nents_for_len(req->src,
+ CURVE25519_KEY_SIZE),
+ public_key, CURVE25519_KEY_SIZE);
+ if (copied != CURVE25519_KEY_SIZE)
+ return -EINVAL;
+
+ curve25519_arch(buf, secret, public_key);
+
+ /* might want less than we've got */
+ nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
+ copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
+ nbytes),
+ buf, nbytes);
+ if (copied != nbytes)
+ return -EINVAL;
+ return 0;
+}
+
+static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
+{
+ return CURVE25519_KEY_SIZE;
+}
+
+static struct kpp_alg curve25519_alg = {
+ .base.cra_name = "curve25519",
+ .base.cra_driver_name = "curve25519-ppc64le",
+ .base.cra_priority = 200,
+ .base.cra_module = THIS_MODULE,
+ .base.cra_ctxsize = CURVE25519_KEY_SIZE,
+
+ .set_secret = curve25519_set_secret,
+ .generate_public_key = curve25519_generate_public_key,
+ .compute_shared_secret = curve25519_compute_shared_secret,
+ .max_size = curve25519_max_size,
+};
+
+
+static int __init curve25519_mod_init(void)
+{
+ return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
+ crypto_register_kpp(&curve25519_alg) : 0;
+}
+
+static void __exit curve25519_mod_exit(void)
+{
+ if (IS_REACHABLE(CONFIG_CRYPTO_KPP))
+ crypto_unregister_kpp(&curve25519_alg);
+}
+
+module_init(curve25519_mod_init);
+module_exit(curve25519_mod_exit);
+
+MODULE_ALIAS_CRYPTO("curve25519");
+MODULE_ALIAS_CRYPTO("curve25519-ppc64le");
+MODULE_DESCRIPTION("PPC64le Curve25519 scalar multiplication with 51 bits limbs");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Danny Tsen <dtsen@us.ibm.com>");
diff --git a/arch/powerpc/crypto/curve25519-ppc64le_asm.S b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
new file mode 100644
index 000000000000..06c1febe24b9
--- /dev/null
+++ b/arch/powerpc/crypto/curve25519-ppc64le_asm.S
@@ -0,0 +1,671 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# This code is taken from CRYPTOGAMs[1] and is included here using the option
+# in the license to distribute the code under the GPL. Therefore this program
+# is free software; you can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2 as published by the Free Software
+# Foundation.
+#
+# [1] https://github.com/dot-asm/cryptogams/
+
+# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain copyright notices,
+# this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# * Neither the name of the CRYPTOGAMS nor the names of its
+# copyright holder and contributors may be used to endorse or
+# promote products derived from this software without specific
+# prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+#
+# ====================================================================
+# Written and Modified by Danny Tsen <dtsen@us.ibm.com>
+# - Added x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes
+# and x25519_cswap
+#
+# Copyright 2024- IBM Corp.
+#
+# X25519 lower-level primitives for PPC64.
+#
+
+#include <linux/linkage.h>
+
+.text
+
+.align 5
+SYM_FUNC_START(x25519_fe51_mul)
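+	# Schoolbook 5x5 multiply of 51-bit limbs. Partial products that
+	# would land at or above 2^255 are folded back into the low limbs
+	# with a factor of 19 (the mulli ...,19 below), as 2^255 == 19 mod p.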
+
+ stdu 1,-144(1)
+ std 21,56(1)
+ std 22,64(1)
+ std 23,72(1)
+ std 24,80(1)
+ std 25,88(1)
+ std 26,96(1)
+ std 27,104(1)
+ std 28,112(1)
+ std 29,120(1)
+ std 30,128(1)
+ std 31,136(1)
+
+ ld 6,0(5)
+ ld 7,0(4)
+ ld 8,8(4)
+ ld 9,16(4)
+ ld 10,24(4)
+ ld 11,32(4)
+
+ mulld 22,7,6
+ mulhdu 23,7,6
+
+ mulld 24,8,6
+ mulhdu 25,8,6
+
+ mulld 30,11,6
+ mulhdu 31,11,6
+ ld 4,8(5)
+ mulli 11,11,19
+
+ mulld 26,9,6
+ mulhdu 27,9,6
+
+ mulld 28,10,6
+ mulhdu 29,10,6
+ mulld 12,11,4
+ mulhdu 21,11,4
+ addc 22,22,12
+ adde 23,23,21
+
+ mulld 12,7,4
+ mulhdu 21,7,4
+ addc 24,24,12
+ adde 25,25,21
+
+ mulld 12,10,4
+ mulhdu 21,10,4
+ ld 6,16(5)
+ mulli 10,10,19
+ addc 30,30,12
+ adde 31,31,21
+
+ mulld 12,8,4
+ mulhdu 21,8,4
+ addc 26,26,12
+ adde 27,27,21
+
+ mulld 12,9,4
+ mulhdu 21,9,4
+ addc 28,28,12
+ adde 29,29,21
+ mulld 12,10,6
+ mulhdu 21,10,6
+ addc 22,22,12
+ adde 23,23,21
+
+ mulld 12,11,6
+ mulhdu 21,11,6
+ addc 24,24,12
+ adde 25,25,21
+
+ mulld 12,9,6
+ mulhdu 21,9,6
+ ld 4,24(5)
+ mulli 9,9,19
+ addc 30,30,12
+ adde 31,31,21
+
+ mulld 12,7,6
+ mulhdu 21,7,6
+ addc 26,26,12
+ adde 27,27,21
+
+ mulld 12,8,6
+ mulhdu 21,8,6
+ addc 28,28,12
+ adde 29,29,21
+ mulld 12,9,4
+ mulhdu 21,9,4
+ addc 22,22,12
+ adde 23,23,21
+
+ mulld 12,10,4
+ mulhdu 21,10,4
+ addc 24,24,12
+ adde 25,25,21
+
+ mulld 12,8,4
+ mulhdu 21,8,4
+ ld 6,32(5)
+ mulli 8,8,19
+ addc 30,30,12
+ adde 31,31,21
+
+ mulld 12,11,4
+ mulhdu 21,11,4
+ addc 26,26,12
+ adde 27,27,21
+
+ mulld 12,7,4
+ mulhdu 21,7,4
+ addc 28,28,12
+ adde 29,29,21
+ mulld 12,8,6
+ mulhdu 21,8,6
+ addc 22,22,12
+ adde 23,23,21
+
+ mulld 12,9,6
+ mulhdu 21,9,6
+ addc 24,24,12
+ adde 25,25,21
+
+ mulld 12,10,6
+ mulhdu 21,10,6
+ addc 26,26,12
+ adde 27,27,21
+
+ mulld 12,11,6
+ mulhdu 21,11,6
+ addc 28,28,12
+ adde 29,29,21
+
+ mulld 12,7,6
+ mulhdu 21,7,6
+ addc 30,30,12
+ adde 31,31,21
+
+.Lfe51_reduce:
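+	# Propagate carries so each limb fits in 51 bits again; the carry out
+	# of the top limb re-enters at the bottom multiplied by 19, since
+	# 2^255 == 19 (mod p).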
+ li 0,-1
+ srdi 0,0,13
+
+ srdi 12,26,51
+ and 9,26,0
+ insrdi 12,27,51,0
+ srdi 21,22,51
+ and 7,22,0
+ insrdi 21,23,51,0
+ addc 28,28,12
+ addze 29,29
+ addc 24,24,21
+ addze 25,25
+
+ srdi 12,28,51
+ and 10,28,0
+ insrdi 12,29,51,0
+ srdi 21,24,51
+ and 8,24,0
+ insrdi 21,25,51,0
+ addc 30,30,12
+ addze 31,31
+ add 9,9,21
+
+ srdi 12,30,51
+ and 11,30,0
+ insrdi 12,31,51,0
+ mulli 12,12,19
+
+ add 7,7,12
+
+ srdi 21,9,51
+ and 9,9,0
+ add 10,10,21
+
+ srdi 12,7,51
+ and 7,7,0
+ add 8,8,12
+
+ std 9,16(3)
+ std 10,24(3)
+ std 11,32(3)
+ std 7,0(3)
+ std 8,8(3)
+
+ ld 21,56(1)
+ ld 22,64(1)
+ ld 23,72(1)
+ ld 24,80(1)
+ ld 25,88(1)
+ ld 26,96(1)
+ ld 27,104(1)
+ ld 28,112(1)
+ ld 29,120(1)
+ ld 30,128(1)
+ ld 31,136(1)
+ addi 1,1,144
+ blr
+SYM_FUNC_END(x25519_fe51_mul)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_sqr)
+
+ stdu 1,-144(1)
+ std 21,56(1)
+ std 22,64(1)
+ std 23,72(1)
+ std 24,80(1)
+ std 25,88(1)
+ std 26,96(1)
+ std 27,104(1)
+ std 28,112(1)
+ std 29,120(1)
+ std 30,128(1)
+ std 31,136(1)
+
+ ld 7,0(4)
+ ld 8,8(4)
+ ld 9,16(4)
+ ld 10,24(4)
+ ld 11,32(4)
+
+ add 6,7,7
+ mulli 21,11,19
+
+ mulld 22,7,7
+ mulhdu 23,7,7
+ mulld 24,8,6
+ mulhdu 25,8,6
+ mulld 26,9,6
+ mulhdu 27,9,6
+ mulld 28,10,6
+ mulhdu 29,10,6
+ mulld 30,11,6
+ mulhdu 31,11,6
+ add 6,8,8
+ mulld 12,11,21
+ mulhdu 11,11,21
+ addc 28,28,12
+ adde 29,29,11
+
+ mulli 5,10,19
+
+ mulld 12,8,8
+ mulhdu 11,8,8
+ addc 26,26,12
+ adde 27,27,11
+ mulld 12,9,6
+ mulhdu 11,9,6
+ addc 28,28,12
+ adde 29,29,11
+ mulld 12,10,6
+ mulhdu 11,10,6
+ addc 30,30,12
+ adde 31,31,11
+ mulld 12,21,6
+ mulhdu 11,21,6
+ add 6,10,10
+ addc 22,22,12
+ adde 23,23,11
+ mulld 12,10,5
+ mulhdu 10,10,5
+ addc 24,24,12
+ adde 25,25,10
+ mulld 12,6,21
+ mulhdu 10,6,21
+ add 6,9,9
+ addc 26,26,12
+ adde 27,27,10
+
+ mulld 12,9,9
+ mulhdu 10,9,9
+ addc 30,30,12
+ adde 31,31,10
+ mulld 12,5,6
+ mulhdu 10,5,6
+ addc 22,22,12
+ adde 23,23,10
+ mulld 12,21,6
+ mulhdu 10,21,6
+ addc 24,24,12
+ adde 25,25,10
+
+ b .Lfe51_reduce
+SYM_FUNC_END(x25519_fe51_sqr)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_mul121666)
+
+ stdu 1,-144(1)
+ std 21,56(1)
+ std 22,64(1)
+ std 23,72(1)
+ std 24,80(1)
+ std 25,88(1)
+ std 26,96(1)
+ std 27,104(1)
+ std 28,112(1)
+ std 29,120(1)
+ std 30,128(1)
+ std 31,136(1)
+
+ lis 6,1
+ ori 6,6,56130
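+	# r6 = 121666, the ladder constant a24 = (A + 2) / 4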
+ ld 7,0(4)
+ ld 8,8(4)
+ ld 9,16(4)
+ ld 10,24(4)
+ ld 11,32(4)
+
+ mulld 22,7,6
+ mulhdu 23,7,6
+ mulld 24,8,6
+ mulhdu 25,8,6
+ mulld 26,9,6
+ mulhdu 27,9,6
+ mulld 28,10,6
+ mulhdu 29,10,6
+ mulld 30,11,6
+ mulhdu 31,11,6
+
+ b .Lfe51_reduce
+SYM_FUNC_END(x25519_fe51_mul121666)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_sqr_times)
+
+ stdu 1,-144(1)
+ std 21,56(1)
+ std 22,64(1)
+ std 23,72(1)
+ std 24,80(1)
+ std 25,88(1)
+ std 26,96(1)
+ std 27,104(1)
+ std 28,112(1)
+ std 29,120(1)
+ std 30,128(1)
+ std 31,136(1)
+
+ ld 7,0(4)
+ ld 8,8(4)
+ ld 9,16(4)
+ ld 10,24(4)
+ ld 11,32(4)
+
+ mtctr 5
+
+.Lsqr_times_loop:
+ add 6,7,7
+ mulli 21,11,19
+
+ mulld 22,7,7
+ mulhdu 23,7,7
+ mulld 24,8,6
+ mulhdu 25,8,6
+ mulld 26,9,6
+ mulhdu 27,9,6
+ mulld 28,10,6
+ mulhdu 29,10,6
+ mulld 30,11,6
+ mulhdu 31,11,6
+ add 6,8,8
+ mulld 12,11,21
+ mulhdu 11,11,21
+ addc 28,28,12
+ adde 29,29,11
+
+ mulli 5,10,19
+
+ mulld 12,8,8
+ mulhdu 11,8,8
+ addc 26,26,12
+ adde 27,27,11
+ mulld 12,9,6
+ mulhdu 11,9,6
+ addc 28,28,12
+ adde 29,29,11
+ mulld 12,10,6
+ mulhdu 11,10,6
+ addc 30,30,12
+ adde 31,31,11
+ mulld 12,21,6
+ mulhdu 11,21,6
+ add 6,10,10
+ addc 22,22,12
+ adde 23,23,11
+ mulld 12,10,5
+ mulhdu 10,10,5
+ addc 24,24,12
+ adde 25,25,10
+ mulld 12,6,21
+ mulhdu 10,6,21
+ add 6,9,9
+ addc 26,26,12
+ adde 27,27,10
+
+ mulld 12,9,9
+ mulhdu 10,9,9
+ addc 30,30,12
+ adde 31,31,10
+ mulld 12,5,6
+ mulhdu 10,5,6
+ addc 22,22,12
+ adde 23,23,10
+ mulld 12,21,6
+ mulhdu 10,21,6
+ addc 24,24,12
+ adde 25,25,10
+
+ # fe51_reduce
+ li 0,-1
+ srdi 0,0,13
+
+ srdi 12,26,51
+ and 9,26,0
+ insrdi 12,27,51,0
+ srdi 21,22,51
+ and 7,22,0
+ insrdi 21,23,51,0
+ addc 28,28,12
+ addze 29,29
+ addc 24,24,21
+ addze 25,25
+
+ srdi 12,28,51
+ and 10,28,0
+ insrdi 12,29,51,0
+ srdi 21,24,51
+ and 8,24,0
+ insrdi 21,25,51,0
+ addc 30,30,12
+ addze 31,31
+ add 9,9,21
+
+ srdi 12,30,51
+ and 11,30,0
+ insrdi 12,31,51,0
+ mulli 12,12,19
+
+ add 7,7,12
+
+ srdi 21,9,51
+ and 9,9,0
+ add 10,10,21
+
+ srdi 12,7,51
+ and 7,7,0
+ add 8,8,12
+
+ bdnz .Lsqr_times_loop
+
+ std 9,16(3)
+ std 10,24(3)
+ std 11,32(3)
+ std 7,0(3)
+ std 8,8(3)
+
+ ld 21,56(1)
+ ld 22,64(1)
+ ld 23,72(1)
+ ld 24,80(1)
+ ld 25,88(1)
+ ld 26,96(1)
+ ld 27,104(1)
+ ld 28,112(1)
+ ld 29,120(1)
+ ld 30,128(1)
+ ld 31,136(1)
+ addi 1,1,144
+ blr
+SYM_FUNC_END(x25519_fe51_sqr_times)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_frombytes)
+
+ li 12, -1
+ srdi 12, 12, 13 # 0x7ffffffffffff
+
+ ld 5, 0(4)
+ ld 6, 8(4)
+ ld 7, 16(4)
+ ld 8, 24(4)
+
+ srdi 10, 5, 51
+ and 5, 5, 12 # h0
+
+ sldi 11, 6, 13
+ or 11, 10, 11 # h1t
+ srdi 10, 6, 38
+ and 6, 11, 12 # h1
+
+ sldi 11, 7, 26
+ or 10, 10, 11 # h2t
+
+ srdi 11, 7, 25
+ and 7, 10, 12 # h2
+ sldi 10, 8, 39
+ or 11, 11, 10 # h3t
+
+ srdi 9, 8, 12
+ and 8, 11, 12 # h3
+ and 9, 9, 12 # h4
+
+ std 5, 0(3)
+ std 6, 8(3)
+ std 7, 16(3)
+ std 8, 24(3)
+ std 9, 32(3)
+
+ blr
+SYM_FUNC_END(x25519_fe51_frombytes)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_tobytes)
+
+ ld 5, 0(4)
+ ld 6, 8(4)
+ ld 7, 16(4)
+ ld 8, 24(4)
+ ld 9, 32(4)
+
+ li 12, -1
+ srdi 12, 12, 13 # 0x7ffffffffffff
+
+	# Full reduction
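+	# Adding 19 and rippling the carry through all five limbs yields the
+	# carry out of bit 255, i.e. whether h >= p = 2^255 - 19; folding
+	# that carry back in as 19 leaves the canonical value in [0, p).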
+ addi 10, 5, 19
+ srdi 10, 10, 51
+ add 10, 10, 6
+ srdi 10, 10, 51
+ add 10, 10, 7
+ srdi 10, 10, 51
+ add 10, 10, 8
+ srdi 10, 10, 51
+ add 10, 10, 9
+ srdi 10, 10, 51
+
+ mulli 10, 10, 19
+ add 5, 5, 10
+ srdi 11, 5, 51
+ add 6, 6, 11
+ srdi 11, 6, 51
+ add 7, 7, 11
+ srdi 11, 7, 51
+ add 8, 8, 11
+ srdi 11, 8, 51
+ add 9, 9, 11
+
+ and 5, 5, 12
+ and 6, 6, 12
+ and 7, 7, 12
+ and 8, 8, 12
+ and 9, 9, 12
+
+ sldi 10, 6, 51
+ or 5, 5, 10 # s0
+
+ srdi 11, 6, 13
+ sldi 10, 7, 38
+ or 6, 11, 10 # s1
+
+ srdi 11, 7, 26
+ sldi 10, 8, 25
+ or 7, 11, 10 # s2
+
+ srdi 11, 8, 39
+ sldi 10, 9, 12
+ or 8, 11, 10 # s4
+
+ std 5, 0(3)
+ std 6, 8(3)
+ std 7, 16(3)
+ std 8, 24(3)
+
+ blr
+SYM_FUNC_END(x25519_fe51_tobytes)
+
+.align 5
+SYM_FUNC_START(x25519_cswap)
+
+ li 7, 5
+ neg 6, 5
+ mtctr 7
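+	# neg turns bit (0 or 1) into an all-zeroes or all-ones mask; the
+	# masked xor below then swaps the five limbs of p and q with no
+	# data-dependent branch, keeping the ladder constant-time.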
+
+.Lswap_loop:
+ ld 8, 0(3)
+ ld 9, 0(4)
+ xor 10, 8, 9
+ and 10, 10, 6
+ xor 11, 8, 10
+ xor 12, 9, 10
+ std 11, 0(3)
+ addi 3, 3, 8
+ std 12, 0(4)
+ addi 4, 4, 8
+ bdnz .Lswap_loop
+
+ blr
+SYM_FUNC_END(x25519_cswap)
diff --git a/arch/powerpc/crypto/ghash.c b/arch/powerpc/crypto/ghash.c
index 77eca20bc7ac..7308735bdb33 100644
--- a/arch/powerpc/crypto/ghash.c
+++ b/arch/powerpc/crypto/ghash.c
@@ -11,19 +11,18 @@
* Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
*/
-#include <linux/types.h>
-#include <linux/err.h>
-#include <linux/crypto.h>
-#include <linux/delay.h>
-#include <asm/simd.h>
+#include "aesp8-ppc.h"
#include <asm/switch_to.h>
#include <crypto/aes.h>
+#include <crypto/gf128mul.h>
#include <crypto/ghash.h>
-#include <crypto/scatterwalk.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
-#include <crypto/b128ops.h>
-#include "aesp8-ppc.h"
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
void gcm_init_p8(u128 htable[16], const u64 Xi[2]);
void gcm_gmult_p8(u64 Xi[2], const u128 htable[16]);
@@ -39,15 +38,12 @@ struct p8_ghash_ctx {
struct p8_ghash_desc_ctx {
u64 shash[2];
- u8 buffer[GHASH_DIGEST_SIZE];
- int bytes;
};
static int p8_ghash_init(struct shash_desc *desc)
{
struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
- dctx->bytes = 0;
memset(dctx->shash, 0, GHASH_DIGEST_SIZE);
return 0;
}
@@ -74,27 +70,30 @@ static int p8_ghash_setkey(struct crypto_shash *tfm, const u8 *key,
}
static inline void __ghash_block(struct p8_ghash_ctx *ctx,
- struct p8_ghash_desc_ctx *dctx)
+ struct p8_ghash_desc_ctx *dctx,
+ const u8 *src)
{
if (crypto_simd_usable()) {
preempt_disable();
pagefault_disable();
enable_kernel_vsx();
- gcm_ghash_p8(dctx->shash, ctx->htable,
- dctx->buffer, GHASH_DIGEST_SIZE);
+ gcm_ghash_p8(dctx->shash, ctx->htable, src, GHASH_BLOCK_SIZE);
disable_kernel_vsx();
pagefault_enable();
preempt_enable();
} else {
- crypto_xor((u8 *)dctx->shash, dctx->buffer, GHASH_BLOCK_SIZE);
+ crypto_xor((u8 *)dctx->shash, src, GHASH_BLOCK_SIZE);
gf128mul_lle((be128 *)dctx->shash, &ctx->key);
}
}
-static inline void __ghash_blocks(struct p8_ghash_ctx *ctx,
- struct p8_ghash_desc_ctx *dctx,
- const u8 *src, unsigned int srclen)
+static inline int __ghash_blocks(struct p8_ghash_ctx *ctx,
+ struct p8_ghash_desc_ctx *dctx,
+ const u8 *src, unsigned int srclen)
{
+ int remain = srclen - round_down(srclen, GHASH_BLOCK_SIZE);
+
+ srclen -= remain;
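+	/*
+	 * With CRYPTO_AHASH_ALG_BLOCK_ONLY the shash core buffers partial
+	 * blocks, so only whole blocks are hashed here and the leftover
+	 * byte count is handed back to the caller.
+	 */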
if (crypto_simd_usable()) {
preempt_disable();
pagefault_disable();
@@ -105,62 +104,38 @@ static inline void __ghash_blocks(struct p8_ghash_ctx *ctx,
pagefault_enable();
preempt_enable();
} else {
- while (srclen >= GHASH_BLOCK_SIZE) {
+ do {
crypto_xor((u8 *)dctx->shash, src, GHASH_BLOCK_SIZE);
gf128mul_lle((be128 *)dctx->shash, &ctx->key);
srclen -= GHASH_BLOCK_SIZE;
src += GHASH_BLOCK_SIZE;
- }
+ } while (srclen);
}
+
+ return remain;
}
static int p8_ghash_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
{
- unsigned int len;
struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
- if (dctx->bytes) {
- if (dctx->bytes + srclen < GHASH_DIGEST_SIZE) {
- memcpy(dctx->buffer + dctx->bytes, src,
- srclen);
- dctx->bytes += srclen;
- return 0;
- }
- memcpy(dctx->buffer + dctx->bytes, src,
- GHASH_DIGEST_SIZE - dctx->bytes);
-
- __ghash_block(ctx, dctx);
-
- src += GHASH_DIGEST_SIZE - dctx->bytes;
- srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
- dctx->bytes = 0;
- }
- len = srclen & ~(GHASH_DIGEST_SIZE - 1);
- if (len) {
- __ghash_blocks(ctx, dctx, src, len);
- src += len;
- srclen -= len;
- }
- if (srclen) {
- memcpy(dctx->buffer, src, srclen);
- dctx->bytes = srclen;
- }
- return 0;
+ return __ghash_blocks(ctx, dctx, src, srclen);
}
-static int p8_ghash_final(struct shash_desc *desc, u8 *out)
+static int p8_ghash_finup(struct shash_desc *desc, const u8 *src,
+ unsigned int len, u8 *out)
{
- int i;
struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
- if (dctx->bytes) {
- for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++)
- dctx->buffer[i] = 0;
- __ghash_block(ctx, dctx);
- dctx->bytes = 0;
+ if (len) {
+ u8 buf[GHASH_BLOCK_SIZE] = {};
+
+ memcpy(buf, src, len);
+ __ghash_block(ctx, dctx, buf);
+ memzero_explicit(buf, sizeof(buf));
}
memcpy(out, dctx->shash, GHASH_DIGEST_SIZE);
return 0;
@@ -170,14 +145,14 @@ struct shash_alg p8_ghash_alg = {
.digestsize = GHASH_DIGEST_SIZE,
.init = p8_ghash_init,
.update = p8_ghash_update,
- .final = p8_ghash_final,
+ .finup = p8_ghash_finup,
.setkey = p8_ghash_setkey,
- .descsize = sizeof(struct p8_ghash_desc_ctx)
- + sizeof(struct ghash_desc_ctx),
+ .descsize = sizeof(struct p8_ghash_desc_ctx),
.base = {
.cra_name = "ghash",
.cra_driver_name = "p8_ghash",
.cra_priority = 1000,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
.cra_blocksize = GHASH_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct p8_ghash_ctx),
.cra_module = THIS_MODULE,
diff --git a/arch/powerpc/crypto/md5-glue.c b/arch/powerpc/crypto/md5-glue.c
index c24f605033bd..204440a90cd8 100644
--- a/arch/powerpc/crypto/md5-glue.c
+++ b/arch/powerpc/crypto/md5-glue.c
@@ -8,25 +8,13 @@
*/
#include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
#include <crypto/md5.h>
-#include <asm/byteorder.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
extern void ppc_md5_transform(u32 *state, const u8 *src, u32 blocks);
-static inline void ppc_md5_clear_context(struct md5_state *sctx)
-{
- int count = sizeof(struct md5_state) >> 2;
- u32 *ptr = (u32 *)sctx;
-
- /* make sure we can clear the fast way */
- BUILD_BUG_ON(sizeof(struct md5_state) % 4);
- do { *ptr++ = 0; } while (--count);
-}
-
static int ppc_md5_init(struct shash_desc *desc)
{
struct md5_state *sctx = shash_desc_ctx(desc);
@@ -44,79 +32,34 @@ static int ppc_md5_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
struct md5_state *sctx = shash_desc_ctx(desc);
- const unsigned int offset = sctx->byte_count & 0x3f;
- unsigned int avail = 64 - offset;
- const u8 *src = data;
- sctx->byte_count += len;
-
- if (avail > len) {
- memcpy((char *)sctx->block + offset, src, len);
- return 0;
- }
-
- if (offset) {
- memcpy((char *)sctx->block + offset, src, avail);
- ppc_md5_transform(sctx->hash, (const u8 *)sctx->block, 1);
- len -= avail;
- src += avail;
- }
-
- if (len > 63) {
- ppc_md5_transform(sctx->hash, src, len >> 6);
- src += len & ~0x3f;
- len &= 0x3f;
- }
-
- memcpy((char *)sctx->block, src, len);
- return 0;
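+	/* Block-only update: hash the whole 64-byte blocks and return the
+	 * leftover byte count for the shash core to buffer. */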
+ sctx->byte_count += round_down(len, MD5_HMAC_BLOCK_SIZE);
+ ppc_md5_transform(sctx->hash, data, len >> 6);
+ return len - round_down(len, MD5_HMAC_BLOCK_SIZE);
}
-static int ppc_md5_final(struct shash_desc *desc, u8 *out)
+static int ppc_md5_finup(struct shash_desc *desc, const u8 *src,
+ unsigned int offset, u8 *out)
{
struct md5_state *sctx = shash_desc_ctx(desc);
- const unsigned int offset = sctx->byte_count & 0x3f;
- const u8 *src = (const u8 *)sctx->block;
- u8 *p = (u8 *)src + offset;
- int padlen = 55 - offset;
- __le64 *pbits = (__le64 *)((char *)sctx->block + 56);
+ __le64 block[MD5_BLOCK_WORDS] = {};
+ u8 *p = memcpy(block, src, offset);
__le32 *dst = (__le32 *)out;
+ __le64 *pbits;
+ src = p;
+ p += offset;
*p++ = 0x80;
-
- if (padlen < 0) {
- memset(p, 0x00, padlen + sizeof (u64));
- ppc_md5_transform(sctx->hash, src, 1);
- p = (char *)sctx->block;
- padlen = 56;
- }
-
- memset(p, 0, padlen);
+ sctx->byte_count += offset;
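+	/*
+	 * block[] has room for two MD5 blocks: if the leftover bytes plus
+	 * the 0x80 terminator leave no room for the 64-bit length field in
+	 * the first block (offset > 55), the length goes at the end of the
+	 * second block and both blocks are transformed.
+	 */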
+ pbits = &block[(MD5_BLOCK_WORDS / (offset > 55 ? 1 : 2)) - 1];
*pbits = cpu_to_le64(sctx->byte_count << 3);
- ppc_md5_transform(sctx->hash, src, 1);
+ ppc_md5_transform(sctx->hash, src, (pbits - block + 1) / 8);
+ memzero_explicit(block, sizeof(block));
dst[0] = cpu_to_le32(sctx->hash[0]);
dst[1] = cpu_to_le32(sctx->hash[1]);
dst[2] = cpu_to_le32(sctx->hash[2]);
dst[3] = cpu_to_le32(sctx->hash[3]);
-
- ppc_md5_clear_context(sctx);
- return 0;
-}
-
-static int ppc_md5_export(struct shash_desc *desc, void *out)
-{
- struct md5_state *sctx = shash_desc_ctx(desc);
-
- memcpy(out, sctx, sizeof(*sctx));
- return 0;
-}
-
-static int ppc_md5_import(struct shash_desc *desc, const void *in)
-{
- struct md5_state *sctx = shash_desc_ctx(desc);
-
- memcpy(sctx, in, sizeof(*sctx));
return 0;
}
@@ -124,15 +67,13 @@ static struct shash_alg alg = {
.digestsize = MD5_DIGEST_SIZE,
.init = ppc_md5_init,
.update = ppc_md5_update,
- .final = ppc_md5_final,
- .export = ppc_md5_export,
- .import = ppc_md5_import,
- .descsize = sizeof(struct md5_state),
- .statesize = sizeof(struct md5_state),
+ .finup = ppc_md5_finup,
+ .descsize = MD5_STATE_SIZE,
.base = {
.cra_name = "md5",
.cra_driver_name= "md5-ppc",
.cra_priority = 200,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
.cra_blocksize = MD5_HMAC_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
diff --git a/arch/powerpc/crypto/poly1305-p10-glue.c b/arch/powerpc/crypto/poly1305-p10-glue.c
deleted file mode 100644
index 95dd708573ee..000000000000
--- a/arch/powerpc/crypto/poly1305-p10-glue.c
+++ /dev/null
@@ -1,186 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Poly1305 authenticator algorithm, RFC7539.
- *
- * Copyright 2023- IBM Corp. All rights reserved.
- */
-
-#include <crypto/algapi.h>
-#include <linux/crypto.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/jump_label.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/poly1305.h>
-#include <crypto/internal/simd.h>
-#include <linux/cpufeature.h>
-#include <asm/unaligned.h>
-#include <asm/simd.h>
-#include <asm/switch_to.h>
-
-asmlinkage void poly1305_p10le_4blocks(void *h, const u8 *m, u32 mlen);
-asmlinkage void poly1305_64s(void *h, const u8 *m, u32 mlen, int highbit);
-asmlinkage void poly1305_emit_64(void *h, void *s, u8 *dst);
-
-static void vsx_begin(void)
-{
- preempt_disable();
- enable_kernel_vsx();
-}
-
-static void vsx_end(void)
-{
- disable_kernel_vsx();
- preempt_enable();
-}
-
-static int crypto_poly1305_p10_init(struct shash_desc *desc)
-{
- struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
- poly1305_core_init(&dctx->h);
- dctx->buflen = 0;
- dctx->rset = 0;
- dctx->sset = false;
-
- return 0;
-}
-
-static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
- const u8 *inp, unsigned int len)
-{
- unsigned int acc = 0;
-
- if (unlikely(!dctx->sset)) {
- if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
- struct poly1305_core_key *key = &dctx->core_r;
-
- key->key.r64[0] = get_unaligned_le64(&inp[0]);
- key->key.r64[1] = get_unaligned_le64(&inp[8]);
- inp += POLY1305_BLOCK_SIZE;
- len -= POLY1305_BLOCK_SIZE;
- acc += POLY1305_BLOCK_SIZE;
- dctx->rset = 1;
- }
- if (len >= POLY1305_BLOCK_SIZE) {
- dctx->s[0] = get_unaligned_le32(&inp[0]);
- dctx->s[1] = get_unaligned_le32(&inp[4]);
- dctx->s[2] = get_unaligned_le32(&inp[8]);
- dctx->s[3] = get_unaligned_le32(&inp[12]);
- acc += POLY1305_BLOCK_SIZE;
- dctx->sset = true;
- }
- }
- return acc;
-}
-
-static int crypto_poly1305_p10_update(struct shash_desc *desc,
- const u8 *src, unsigned int srclen)
-{
- struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
- unsigned int bytes, used;
-
- if (unlikely(dctx->buflen)) {
- bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
- memcpy(dctx->buf + dctx->buflen, src, bytes);
- src += bytes;
- srclen -= bytes;
- dctx->buflen += bytes;
-
- if (dctx->buflen == POLY1305_BLOCK_SIZE) {
- if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf,
- POLY1305_BLOCK_SIZE))) {
- vsx_begin();
- poly1305_64s(&dctx->h, dctx->buf,
- POLY1305_BLOCK_SIZE, 1);
- vsx_end();
- }
- dctx->buflen = 0;
- }
- }
-
- if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
- bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
- used = crypto_poly1305_setdctxkey(dctx, src, bytes);
- if (likely(used)) {
- srclen -= used;
- src += used;
- }
- if (crypto_simd_usable() && (srclen >= POLY1305_BLOCK_SIZE*4)) {
- vsx_begin();
- poly1305_p10le_4blocks(&dctx->h, src, srclen);
- vsx_end();
- src += srclen - (srclen % (POLY1305_BLOCK_SIZE * 4));
- srclen %= POLY1305_BLOCK_SIZE * 4;
- }
- while (srclen >= POLY1305_BLOCK_SIZE) {
- vsx_begin();
- poly1305_64s(&dctx->h, src, POLY1305_BLOCK_SIZE, 1);
- vsx_end();
- srclen -= POLY1305_BLOCK_SIZE;
- src += POLY1305_BLOCK_SIZE;
- }
- }
-
- if (unlikely(srclen)) {
- dctx->buflen = srclen;
- memcpy(dctx->buf, src, srclen);
- }
-
- return 0;
-}
-
-static int crypto_poly1305_p10_final(struct shash_desc *desc, u8 *dst)
-{
- struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
- if (unlikely(!dctx->sset))
- return -ENOKEY;
-
- if ((dctx->buflen)) {
- dctx->buf[dctx->buflen++] = 1;
- memset(dctx->buf + dctx->buflen, 0,
- POLY1305_BLOCK_SIZE - dctx->buflen);
- vsx_begin();
- poly1305_64s(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
- vsx_end();
- dctx->buflen = 0;
- }
-
- poly1305_emit_64(&dctx->h, &dctx->s, dst);
- return 0;
-}
-
-static struct shash_alg poly1305_alg = {
- .digestsize = POLY1305_DIGEST_SIZE,
- .init = crypto_poly1305_p10_init,
- .update = crypto_poly1305_p10_update,
- .final = crypto_poly1305_p10_final,
- .descsize = sizeof(struct poly1305_desc_ctx),
- .base = {
- .cra_name = "poly1305",
- .cra_driver_name = "poly1305-p10",
- .cra_priority = 300,
- .cra_blocksize = POLY1305_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- },
-};
-
-static int __init poly1305_p10_init(void)
-{
- return crypto_register_shash(&poly1305_alg);
-}
-
-static void __exit poly1305_p10_exit(void)
-{
- crypto_unregister_shash(&poly1305_alg);
-}
-
-module_cpu_feature_match(PPC_MODULE_FEATURE_P10, poly1305_p10_init);
-module_exit(poly1305_p10_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>");
-MODULE_DESCRIPTION("Optimized Poly1305 for P10");
-MODULE_ALIAS_CRYPTO("poly1305");
-MODULE_ALIAS_CRYPTO("poly1305-p10");
diff --git a/arch/powerpc/crypto/poly1305-p10le_64.S b/arch/powerpc/crypto/poly1305-p10le_64.S
deleted file mode 100644
index a3c1987f1ecd..000000000000
--- a/arch/powerpc/crypto/poly1305-p10le_64.S
+++ /dev/null
@@ -1,1075 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#
-# Accelerated poly1305 implementation for ppc64le.
-#
-# Copyright 2023- IBM Corp. All rights reserved
-#
-#===================================================================================
-# Written by Danny Tsen <dtsen@us.ibm.com>
-#
-# Poly1305 - this version mainly uses vector/VSX/scalar instructions
-# - 26-bit limbs
-# - Handles multiple 64-byte blocks.
-#
-# Block size 16 bytes
-# key = (r, s)
-# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
-# p = 2^130 - 5
-# a += m
-# a = (r + a) % p
-# a += s
-#
-# Improve performance by breaking down the polynomial into a sum of products with
-# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
-#
-# 07/22/21 - this revision is based on the above sum of products. Setup r^4, r^3, r^2, r and s3, s2, s1, s0
-# to 9 vectors for multiplications.
-#
-# setup r^4, r^3, r^2, r vectors
-# vs [r^1, r^3, r^2, r^4]
-# vs0 = [r0,.....]
-# vs1 = [r1,.....]
-# vs2 = [r2,.....]
-# vs3 = [r3,.....]
-# vs4 = [r4,.....]
-# vs5 = [r1*5,...]
-# vs6 = [r2*5,...]
-# vs7 = [r3*5,...]
-# vs8 = [r4*5,...]
-#
-# Each word in a vector consists of a member of "r/s" in [a * r/s].
-#
-# r0, r4*5, r3*5, r2*5, r1*5;
-# r1, r0, r4*5, r3*5, r2*5;
-# r2, r1, r0, r4*5, r3*5;
-# r3, r2, r1, r0, r4*5;
-# r4, r3, r2, r1, r0 ;
-#
-#
-# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
-# k = 32 bytes key
-# r3 = k (r, s)
-# r4 = mlen
-# r5 = m
-#
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/asm-compat.h>
-#include <linux/linkage.h>
-
-.machine "any"
-
-.text
-
-.macro SAVE_GPR GPR OFFSET FRAME
- std \GPR,\OFFSET(\FRAME)
-.endm
-
-.macro SAVE_VRS VRS OFFSET FRAME
- li 16, \OFFSET
- stvx \VRS, 16, \FRAME
-.endm
-
-.macro SAVE_VSX VSX OFFSET FRAME
- li 16, \OFFSET
- stxvx \VSX, 16, \FRAME
-.endm
-
-.macro RESTORE_GPR GPR OFFSET FRAME
- ld \GPR,\OFFSET(\FRAME)
-.endm
-
-.macro RESTORE_VRS VRS OFFSET FRAME
- li 16, \OFFSET
- lvx \VRS, 16, \FRAME
-.endm
-
-.macro RESTORE_VSX VSX OFFSET FRAME
- li 16, \OFFSET
- lxvx \VSX, 16, \FRAME
-.endm
-
-.macro SAVE_REGS
- mflr 0
- std 0, 16(1)
- stdu 1,-752(1)
-
- SAVE_GPR 14, 112, 1
- SAVE_GPR 15, 120, 1
- SAVE_GPR 16, 128, 1
- SAVE_GPR 17, 136, 1
- SAVE_GPR 18, 144, 1
- SAVE_GPR 19, 152, 1
- SAVE_GPR 20, 160, 1
- SAVE_GPR 21, 168, 1
- SAVE_GPR 22, 176, 1
- SAVE_GPR 23, 184, 1
- SAVE_GPR 24, 192, 1
- SAVE_GPR 25, 200, 1
- SAVE_GPR 26, 208, 1
- SAVE_GPR 27, 216, 1
- SAVE_GPR 28, 224, 1
- SAVE_GPR 29, 232, 1
- SAVE_GPR 30, 240, 1
- SAVE_GPR 31, 248, 1
-
- addi 9, 1, 256
- SAVE_VRS 20, 0, 9
- SAVE_VRS 21, 16, 9
- SAVE_VRS 22, 32, 9
- SAVE_VRS 23, 48, 9
- SAVE_VRS 24, 64, 9
- SAVE_VRS 25, 80, 9
- SAVE_VRS 26, 96, 9
- SAVE_VRS 27, 112, 9
- SAVE_VRS 28, 128, 9
- SAVE_VRS 29, 144, 9
- SAVE_VRS 30, 160, 9
- SAVE_VRS 31, 176, 9
-
- SAVE_VSX 14, 192, 9
- SAVE_VSX 15, 208, 9
- SAVE_VSX 16, 224, 9
- SAVE_VSX 17, 240, 9
- SAVE_VSX 18, 256, 9
- SAVE_VSX 19, 272, 9
- SAVE_VSX 20, 288, 9
- SAVE_VSX 21, 304, 9
- SAVE_VSX 22, 320, 9
- SAVE_VSX 23, 336, 9
- SAVE_VSX 24, 352, 9
- SAVE_VSX 25, 368, 9
- SAVE_VSX 26, 384, 9
- SAVE_VSX 27, 400, 9
- SAVE_VSX 28, 416, 9
- SAVE_VSX 29, 432, 9
- SAVE_VSX 30, 448, 9
- SAVE_VSX 31, 464, 9
-.endm # SAVE_REGS
-
-.macro RESTORE_REGS
- addi 9, 1, 256
- RESTORE_VRS 20, 0, 9
- RESTORE_VRS 21, 16, 9
- RESTORE_VRS 22, 32, 9
- RESTORE_VRS 23, 48, 9
- RESTORE_VRS 24, 64, 9
- RESTORE_VRS 25, 80, 9
- RESTORE_VRS 26, 96, 9
- RESTORE_VRS 27, 112, 9
- RESTORE_VRS 28, 128, 9
- RESTORE_VRS 29, 144, 9
- RESTORE_VRS 30, 160, 9
- RESTORE_VRS 31, 176, 9
-
- RESTORE_VSX 14, 192, 9
- RESTORE_VSX 15, 208, 9
- RESTORE_VSX 16, 224, 9
- RESTORE_VSX 17, 240, 9
- RESTORE_VSX 18, 256, 9
- RESTORE_VSX 19, 272, 9
- RESTORE_VSX 20, 288, 9
- RESTORE_VSX 21, 304, 9
- RESTORE_VSX 22, 320, 9
- RESTORE_VSX 23, 336, 9
- RESTORE_VSX 24, 352, 9
- RESTORE_VSX 25, 368, 9
- RESTORE_VSX 26, 384, 9
- RESTORE_VSX 27, 400, 9
- RESTORE_VSX 28, 416, 9
- RESTORE_VSX 29, 432, 9
- RESTORE_VSX 30, 448, 9
- RESTORE_VSX 31, 464, 9
-
- RESTORE_GPR 14, 112, 1
- RESTORE_GPR 15, 120, 1
- RESTORE_GPR 16, 128, 1
- RESTORE_GPR 17, 136, 1
- RESTORE_GPR 18, 144, 1
- RESTORE_GPR 19, 152, 1
- RESTORE_GPR 20, 160, 1
- RESTORE_GPR 21, 168, 1
- RESTORE_GPR 22, 176, 1
- RESTORE_GPR 23, 184, 1
- RESTORE_GPR 24, 192, 1
- RESTORE_GPR 25, 200, 1
- RESTORE_GPR 26, 208, 1
- RESTORE_GPR 27, 216, 1
- RESTORE_GPR 28, 224, 1
- RESTORE_GPR 29, 232, 1
- RESTORE_GPR 30, 240, 1
- RESTORE_GPR 31, 248, 1
-
- addi 1, 1, 752
- ld 0, 16(1)
- mtlr 0
-.endm # RESTORE_REGS
-
-#
-# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
-# p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5;
-# p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5;
-# p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5;
-# p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ;
-#
-# [r^2, r^3, r^1, r^4]
-# [m3, m2, m4, m1]
-#
-# multiply odd and even words
-.macro mul_odd
- vmulouw 14, 4, 26
- vmulouw 10, 5, 3
- vmulouw 11, 6, 2
- vmulouw 12, 7, 1
- vmulouw 13, 8, 0
- vmulouw 15, 4, 27
- vaddudm 14, 14, 10
- vaddudm 14, 14, 11
- vmulouw 10, 5, 26
- vmulouw 11, 6, 3
- vaddudm 14, 14, 12
- vaddudm 14, 14, 13 # x0
- vaddudm 15, 15, 10
- vaddudm 15, 15, 11
- vmulouw 12, 7, 2
- vmulouw 13, 8, 1
- vaddudm 15, 15, 12
- vaddudm 15, 15, 13 # x1
- vmulouw 16, 4, 28
- vmulouw 10, 5, 27
- vmulouw 11, 6, 26
- vaddudm 16, 16, 10
- vaddudm 16, 16, 11
- vmulouw 12, 7, 3
- vmulouw 13, 8, 2
- vaddudm 16, 16, 12
- vaddudm 16, 16, 13 # x2
- vmulouw 17, 4, 29
- vmulouw 10, 5, 28
- vmulouw 11, 6, 27
- vaddudm 17, 17, 10
- vaddudm 17, 17, 11
- vmulouw 12, 7, 26
- vmulouw 13, 8, 3
- vaddudm 17, 17, 12
- vaddudm 17, 17, 13 # x3
- vmulouw 18, 4, 30
- vmulouw 10, 5, 29
- vmulouw 11, 6, 28
- vaddudm 18, 18, 10
- vaddudm 18, 18, 11
- vmulouw 12, 7, 27
- vmulouw 13, 8, 26
- vaddudm 18, 18, 12
- vaddudm 18, 18, 13 # x4
-.endm
-
-.macro mul_even
- vmuleuw 9, 4, 26
- vmuleuw 10, 5, 3
- vmuleuw 11, 6, 2
- vmuleuw 12, 7, 1
- vmuleuw 13, 8, 0
- vaddudm 14, 14, 9
- vaddudm 14, 14, 10
- vaddudm 14, 14, 11
- vaddudm 14, 14, 12
- vaddudm 14, 14, 13 # x0
-
- vmuleuw 9, 4, 27
- vmuleuw 10, 5, 26
- vmuleuw 11, 6, 3
- vmuleuw 12, 7, 2
- vmuleuw 13, 8, 1
- vaddudm 15, 15, 9
- vaddudm 15, 15, 10
- vaddudm 15, 15, 11
- vaddudm 15, 15, 12
- vaddudm 15, 15, 13 # x1
-
- vmuleuw 9, 4, 28
- vmuleuw 10, 5, 27
- vmuleuw 11, 6, 26
- vmuleuw 12, 7, 3
- vmuleuw 13, 8, 2
- vaddudm 16, 16, 9
- vaddudm 16, 16, 10
- vaddudm 16, 16, 11
- vaddudm 16, 16, 12
- vaddudm 16, 16, 13 # x2
-
- vmuleuw 9, 4, 29
- vmuleuw 10, 5, 28
- vmuleuw 11, 6, 27
- vmuleuw 12, 7, 26
- vmuleuw 13, 8, 3
- vaddudm 17, 17, 9
- vaddudm 17, 17, 10
- vaddudm 17, 17, 11
- vaddudm 17, 17, 12
- vaddudm 17, 17, 13 # x3
-
- vmuleuw 9, 4, 30
- vmuleuw 10, 5, 29
- vmuleuw 11, 6, 28
- vmuleuw 12, 7, 27
- vmuleuw 13, 8, 26
- vaddudm 18, 18, 9
- vaddudm 18, 18, 10
- vaddudm 18, 18, 11
- vaddudm 18, 18, 12
- vaddudm 18, 18, 13 # x4
-.endm
-
-#
-# poly1305_setup_r
-#
-# setup r^4, r^3, r^2, r vectors
-# [r, r^3, r^2, r^4]
-# vs0 = [r0,...]
-# vs1 = [r1,...]
-# vs2 = [r2,...]
-# vs3 = [r3,...]
-# vs4 = [r4,...]
-# vs5 = [r4*5,...]
-# vs6 = [r3*5,...]
-# vs7 = [r2*5,...]
-# vs8 = [r1*5,...]
-#
-# r0, r4*5, r3*5, r2*5, r1*5;
-# r1, r0, r4*5, r3*5, r2*5;
-# r2, r1, r0, r4*5, r3*5;
-# r3, r2, r1, r0, r4*5;
-# r4, r3, r2, r1, r0 ;
-#
-.macro poly1305_setup_r
-
- # save r
- xxlor 26, 58, 58
- xxlor 27, 59, 59
- xxlor 28, 60, 60
- xxlor 29, 61, 61
- xxlor 30, 62, 62
-
- xxlxor 31, 31, 31
-
-# [r, r^3, r^2, r^4]
- # compute r^2
- vmr 4, 26
- vmr 5, 27
- vmr 6, 28
- vmr 7, 29
- vmr 8, 30
- bl do_mul # r^2 r^1
- xxpermdi 58, 58, 36, 0x3 # r0
- xxpermdi 59, 59, 37, 0x3 # r1
- xxpermdi 60, 60, 38, 0x3 # r2
- xxpermdi 61, 61, 39, 0x3 # r3
- xxpermdi 62, 62, 40, 0x3 # r4
- xxpermdi 36, 36, 36, 0x3
- xxpermdi 37, 37, 37, 0x3
- xxpermdi 38, 38, 38, 0x3
- xxpermdi 39, 39, 39, 0x3
- xxpermdi 40, 40, 40, 0x3
- vspltisb 13, 2
- vsld 9, 27, 13
- vsld 10, 28, 13
- vsld 11, 29, 13
- vsld 12, 30, 13
- vaddudm 0, 9, 27
- vaddudm 1, 10, 28
- vaddudm 2, 11, 29
- vaddudm 3, 12, 30
-
- bl do_mul # r^4 r^3
- vmrgow 26, 26, 4
- vmrgow 27, 27, 5
- vmrgow 28, 28, 6
- vmrgow 29, 29, 7
- vmrgow 30, 30, 8
- vspltisb 13, 2
- vsld 9, 27, 13
- vsld 10, 28, 13
- vsld 11, 29, 13
- vsld 12, 30, 13
- vaddudm 0, 9, 27
- vaddudm 1, 10, 28
- vaddudm 2, 11, 29
- vaddudm 3, 12, 30
-
- # r^2 r^4
- xxlor 0, 58, 58
- xxlor 1, 59, 59
- xxlor 2, 60, 60
- xxlor 3, 61, 61
- xxlor 4, 62, 62
- xxlor 5, 32, 32
- xxlor 6, 33, 33
- xxlor 7, 34, 34
- xxlor 8, 35, 35
-
- vspltw 9, 26, 3
- vspltw 10, 26, 2
- vmrgow 26, 10, 9
- vspltw 9, 27, 3
- vspltw 10, 27, 2
- vmrgow 27, 10, 9
- vspltw 9, 28, 3
- vspltw 10, 28, 2
- vmrgow 28, 10, 9
- vspltw 9, 29, 3
- vspltw 10, 29, 2
- vmrgow 29, 10, 9
- vspltw 9, 30, 3
- vspltw 10, 30, 2
- vmrgow 30, 10, 9
-
- vsld 9, 27, 13
- vsld 10, 28, 13
- vsld 11, 29, 13
- vsld 12, 30, 13
- vaddudm 0, 9, 27
- vaddudm 1, 10, 28
- vaddudm 2, 11, 29
- vaddudm 3, 12, 30
-.endm
-
-SYM_FUNC_START_LOCAL(do_mul)
- mul_odd
-
- # do reduction ( h %= p )
- # carry reduction
- vspltisb 9, 2
- vsrd 10, 14, 31
- vsrd 11, 17, 31
- vand 7, 17, 25
- vand 4, 14, 25
- vaddudm 18, 18, 11
- vsrd 12, 18, 31
- vaddudm 15, 15, 10
-
- vsrd 11, 15, 31
- vand 8, 18, 25
- vand 5, 15, 25
- vaddudm 4, 4, 12
- vsld 10, 12, 9
- vaddudm 6, 16, 11
-
- vsrd 13, 6, 31
- vand 6, 6, 25
- vaddudm 4, 4, 10
- vsrd 10, 4, 31
- vaddudm 7, 7, 13
-
- vsrd 11, 7, 31
- vand 7, 7, 25
- vand 4, 4, 25
- vaddudm 5, 5, 10
- vaddudm 8, 8, 11
- blr
-SYM_FUNC_END(do_mul)
-
-#
-# init key
-#
-.macro do_poly1305_init
- addis 10, 2, rmask@toc@ha
- addi 10, 10, rmask@toc@l
-
- ld 11, 0(10)
- ld 12, 8(10)
-
- li 14, 16
- li 15, 32
- addis 10, 2, cnum@toc@ha
- addi 10, 10, cnum@toc@l
- lvx 25, 0, 10 # v25 - mask
- lvx 31, 14, 10 # v31 = 1a
- lvx 19, 15, 10 # v19 = 1 << 24
- lxv 24, 48(10) # vs24
- lxv 25, 64(10) # vs25
-
- # initialize
- # load key from r3 to vectors
- ld 9, 24(3)
- ld 10, 32(3)
- and. 9, 9, 11
- and. 10, 10, 12
-
- # break 26 bits
- extrdi 14, 9, 26, 38
- extrdi 15, 9, 26, 12
- extrdi 16, 9, 12, 0
- mtvsrdd 58, 0, 14
- insrdi 16, 10, 14, 38
- mtvsrdd 59, 0, 15
- extrdi 17, 10, 26, 24
- mtvsrdd 60, 0, 16
- extrdi 18, 10, 24, 0
- mtvsrdd 61, 0, 17
- mtvsrdd 62, 0, 18
-
- # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
- li 9, 5
- mtvsrdd 36, 0, 9
- vmulouw 0, 27, 4 # v0 = rr0
- vmulouw 1, 28, 4 # v1 = rr1
- vmulouw 2, 29, 4 # v2 = rr2
- vmulouw 3, 30, 4 # v3 = rr3
-.endm
-
-#
-# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
-# k = 32 bytes key
-# r3 = k (r, s)
-# r4 = mlen
-# r5 = m
-#
-SYM_FUNC_START(poly1305_p10le_4blocks)
-.align 5
- cmpdi 5, 64
- blt Out_no_poly1305
-
- SAVE_REGS
-
- do_poly1305_init
-
- li 21, 0 # counter to message
-
- poly1305_setup_r
-
- # load previous H state
- # break/convert r6 to 26 bits
- ld 9, 0(3)
- ld 10, 8(3)
- ld 19, 16(3)
- sldi 19, 19, 24
- mtvsrdd 41, 0, 19
- extrdi 14, 9, 26, 38
- extrdi 15, 9, 26, 12
- extrdi 16, 9, 12, 0
- mtvsrdd 36, 0, 14
- insrdi 16, 10, 14, 38
- mtvsrdd 37, 0, 15
- extrdi 17, 10, 26, 24
- mtvsrdd 38, 0, 16
- extrdi 18, 10, 24, 0
- mtvsrdd 39, 0, 17
- mtvsrdd 40, 0, 18
- vor 8, 8, 9
-
- # input m1 m2
- add 20, 4, 21
- xxlor 49, 24, 24
- xxlor 50, 25, 25
- lxvw4x 43, 0, 20
- addi 17, 20, 16
- lxvw4x 44, 0, 17
- vperm 14, 11, 12, 17
- vperm 15, 11, 12, 18
- vand 9, 14, 25 # a0
- vsrd 10, 14, 31 # >> 26
- vsrd 11, 10, 31 # 12 bits left
- vand 10, 10, 25 # a1
- vspltisb 13, 12
- vand 16, 15, 25
- vsld 12, 16, 13
- vor 11, 11, 12
- vand 11, 11, 25 # a2
- vspltisb 13, 14
- vsrd 12, 15, 13 # >> 14
- vsrd 13, 12, 31 # >> 26, a4
- vand 12, 12, 25 # a3
-
- vaddudm 20, 4, 9
- vaddudm 21, 5, 10
- vaddudm 22, 6, 11
- vaddudm 23, 7, 12
- vaddudm 24, 8, 13
-
- # m3 m4
- addi 17, 17, 16
- lxvw4x 43, 0, 17
- addi 17, 17, 16
- lxvw4x 44, 0, 17
- vperm 14, 11, 12, 17
- vperm 15, 11, 12, 18
- vand 9, 14, 25 # a0
- vsrd 10, 14, 31 # >> 26
- vsrd 11, 10, 31 # 12 bits left
- vand 10, 10, 25 # a1
- vspltisb 13, 12
- vand 16, 15, 25
- vsld 12, 16, 13
- vspltisb 13, 14
- vor 11, 11, 12
- vand 11, 11, 25 # a2
- vsrd 12, 15, 13 # >> 14
- vsrd 13, 12, 31 # >> 26, a4
- vand 12, 12, 25 # a3
-
- # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
- vmrgow 4, 9, 20
- vmrgow 5, 10, 21
- vmrgow 6, 11, 22
- vmrgow 7, 12, 23
- vmrgow 8, 13, 24
- vaddudm 8, 8, 19
-
- addi 5, 5, -64 # len -= 64
- addi 21, 21, 64 # offset += 64
-
- li 9, 64
- divdu 31, 5, 9
-
- cmpdi 31, 0
- ble Skip_block_loop
-
- mtctr 31
-
-# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
-# Rewrite the polynomial sum of products as follows,
-# h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2
-# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r^4 + (h3 + m3) r^2, (h0 + m2) r^4 + (h0 + m4) r^2
-# .... Repeat
-# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 -->
-# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
-#
-loop_4blocks:
-
- # Multiply odd words and even words
- mul_odd
- mul_even
- # carry reduction
- vspltisb 9, 2
- vsrd 10, 14, 31
- vsrd 11, 17, 31
- vand 7, 17, 25
- vand 4, 14, 25
- vaddudm 18, 18, 11
- vsrd 12, 18, 31
- vaddudm 15, 15, 10
-
- vsrd 11, 15, 31
- vand 8, 18, 25
- vand 5, 15, 25
- vaddudm 4, 4, 12
- vsld 10, 12, 9
- vaddudm 6, 16, 11
-
- vsrd 13, 6, 31
- vand 6, 6, 25
- vaddudm 4, 4, 10
- vsrd 10, 4, 31
- vaddudm 7, 7, 13
-
- vsrd 11, 7, 31
- vand 7, 7, 25
- vand 4, 4, 25
- vaddudm 5, 5, 10
- vaddudm 8, 8, 11
-
- # input m1 m2 m3 m4
- add 20, 4, 21
- xxlor 49, 24, 24
- xxlor 50, 25, 25
- lxvw4x 43, 0, 20
- addi 17, 20, 16
- lxvw4x 44, 0, 17
- vperm 14, 11, 12, 17
- vperm 15, 11, 12, 18
- addi 17, 17, 16
- lxvw4x 43, 0, 17
- addi 17, 17, 16
- lxvw4x 44, 0, 17
- vperm 17, 11, 12, 17
- vperm 18, 11, 12, 18
-
- vand 20, 14, 25 # a0
- vand 9, 17, 25 # a0
- vsrd 21, 14, 31 # >> 26
- vsrd 22, 21, 31 # 12 bits left
- vsrd 10, 17, 31 # >> 26
- vsrd 11, 10, 31 # 12 bits left
-
- vand 21, 21, 25 # a1
- vand 10, 10, 25 # a1
-
- vspltisb 13, 12
- vand 16, 15, 25
- vsld 23, 16, 13
- vor 22, 22, 23
- vand 22, 22, 25 # a2
- vand 16, 18, 25
- vsld 12, 16, 13
- vor 11, 11, 12
- vand 11, 11, 25 # a2
- vspltisb 13, 14
- vsrd 23, 15, 13 # >> 14
- vsrd 24, 23, 31 # >> 26, a4
- vand 23, 23, 25 # a3
- vsrd 12, 18, 13 # >> 14
- vsrd 13, 12, 31 # >> 26, a4
- vand 12, 12, 25 # a3
-
- vaddudm 4, 4, 20
- vaddudm 5, 5, 21
- vaddudm 6, 6, 22
- vaddudm 7, 7, 23
- vaddudm 8, 8, 24
-
- # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
- vmrgow 4, 9, 4
- vmrgow 5, 10, 5
- vmrgow 6, 11, 6
- vmrgow 7, 12, 7
- vmrgow 8, 13, 8
- vaddudm 8, 8, 19
-
- addi 5, 5, -64 # len -= 64
- addi 21, 21, 64 # offset += 64
-
- bdnz loop_4blocks
-
-Skip_block_loop:
- xxlor 58, 0, 0
- xxlor 59, 1, 1
- xxlor 60, 2, 2
- xxlor 61, 3, 3
- xxlor 62, 4, 4
- xxlor 32, 5, 5
- xxlor 33, 6, 6
- xxlor 34, 7, 7
- xxlor 35, 8, 8
-
- # Multiply odd words and even words
- mul_odd
- mul_even
-
- # Sum the products.
- xxpermdi 41, 31, 46, 0
- xxpermdi 42, 31, 47, 0
- vaddudm 4, 14, 9
- xxpermdi 36, 31, 36, 3
- vaddudm 5, 15, 10
- xxpermdi 37, 31, 37, 3
- xxpermdi 43, 31, 48, 0
- vaddudm 6, 16, 11
- xxpermdi 38, 31, 38, 3
- xxpermdi 44, 31, 49, 0
- vaddudm 7, 17, 12
- xxpermdi 39, 31, 39, 3
- xxpermdi 45, 31, 50, 0
- vaddudm 8, 18, 13
- xxpermdi 40, 31, 40, 3
-
- # carry reduction
- vspltisb 9, 2
- vsrd 10, 4, 31
- vsrd 11, 7, 31
- vand 7, 7, 25
- vand 4, 4, 25
- vaddudm 8, 8, 11
- vsrd 12, 8, 31
- vaddudm 5, 5, 10
-
- vsrd 11, 5, 31
- vand 8, 8, 25
- vand 5, 5, 25
- vaddudm 4, 4, 12
- vsld 10, 12, 9
- vaddudm 6, 6, 11
-
- vsrd 13, 6, 31
- vand 6, 6, 25
- vaddudm 4, 4, 10
- vsrd 10, 4, 31
- vaddudm 7, 7, 13
-
- vsrd 11, 7, 31
- vand 7, 7, 25
- vand 4, 4, 25
- vaddudm 5, 5, 10
- vsrd 10, 5, 31
- vand 5, 5, 25
- vaddudm 6, 6, 10
- vaddudm 8, 8, 11
-
- b do_final_update
-
-do_final_update:
- # combine 26 bit limbs
- # v4, v5, v6, v7 and v8 are 26 bit vectors
- vsld 5, 5, 31
- vor 20, 4, 5
- vspltisb 11, 12
- vsrd 12, 6, 11
- vsld 6, 6, 31
- vsld 6, 6, 31
- vor 20, 20, 6
- vspltisb 11, 14
- vsld 7, 7, 11
- vor 21, 7, 12
- mfvsrld 16, 40 # save last 2 bytes
- vsld 8, 8, 11
- vsld 8, 8, 31
- vor 21, 21, 8
- mfvsrld 17, 52
- mfvsrld 19, 53
- srdi 16, 16, 24
-
- std 17, 0(3)
- std 19, 8(3)
- stw 16, 16(3)
-
-Out_loop:
- li 3, 0
-
- RESTORE_REGS
-
- blr
-
-Out_no_poly1305:
- li 3, 0
- blr
-SYM_FUNC_END(poly1305_p10le_4blocks)
-
-#
-# =======================================================================
-# The following functions implement poly1305 using 64 x 64 bit multiplications.
-#
-SYM_FUNC_START_LOCAL(Poly1305_init_64)
- # mask 0x0FFFFFFC0FFFFFFC
- # mask 0x0FFFFFFC0FFFFFFF
- addis 10, 2, rmask@toc@ha
- addi 10, 10, rmask@toc@l
- ld 11, 0(10)
- ld 12, 8(10)
-
- # initialize
- # load key from r3
- ld 9, 24(3)
- ld 10, 32(3)
-	and. 9, 9, 11		# clamp mask r0
-	and. 10, 10, 12		# clamp mask r1
-
- srdi 21, 10, 2
- add 19, 21, 10 # s1: r19 - (r1 >> 2) *5
-
- # setup r and s
- li 25, 0
- mtvsrdd 32+0, 9, 19 # r0, s1
- mtvsrdd 32+1, 10, 9 # r1, r0
- mtvsrdd 32+2, 19, 25 # s1
- mtvsrdd 32+3, 9, 25 # r0
-
- blr
-SYM_FUNC_END(Poly1305_init_64)
-
-# Poly1305_mult
-# v6 = (h0, h1), v8 = h2
-# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
-#
-# Output: v7, v10, v11
-#
-SYM_FUNC_START_LOCAL(Poly1305_mult)
- #
- # d0 = h0 * r0 + h1 * s1
- vmsumudm 7, 6, 0, 9 # h0 * r0, h1 * s1
-
- # d1 = h0 * r1 + h1 * r0 + h2 * s1
- vmsumudm 11, 6, 1, 9 # h0 * r1, h1 * r0
- vmsumudm 10, 8, 2, 11 # d1 += h2 * s1
-
- # d2 = r0
- vmsumudm 11, 8, 3, 9 # d2 = h2 * r0
- blr
-SYM_FUNC_END(Poly1305_mult)
-
-#
-# carry reduction
-# h %= p
-#
-# Input: v7, v10, v11
-# Output: r27, r28, r29
-#
-SYM_FUNC_START_LOCAL(Carry_reduction)
- mfvsrld 27, 32+7
- mfvsrld 28, 32+10
- mfvsrld 29, 32+11
- mfvsrd 20, 32+7 # h0.h
- mfvsrd 21, 32+10 # h1.h
-
- addc 28, 28, 20
- adde 29, 29, 21
- srdi 22, 29, 0x2
- sldi 23, 22, 0x2
- add 23, 23, 22 # (h2 & 3) * 5
- addc 27, 27, 23 # h0
- addze 28, 28 # h1
- andi. 29, 29, 0x3 # h2
- blr
-SYM_FUNC_END(Carry_reduction)
-
-#
-# poly1305 multiplication
-# h *= r, h %= p
-# d0 = h0 * r0 + h1 * s1
-# d1 = h0 * r1 + h1 * r0 + h2 * s1
-# d2 = h0 * r0
-#
-#
-# unsigned int poly1305_64s(unsigned char *state, const byte *src, size_t len, highbit)
-# - no highbit if final leftover block (highbit = 0)
-#
-SYM_FUNC_START(poly1305_64s)
- cmpdi 5, 0
- ble Out_no_poly1305_64
-
- mflr 0
- std 0, 16(1)
- stdu 1,-400(1)
-
- SAVE_GPR 14, 112, 1
- SAVE_GPR 15, 120, 1
- SAVE_GPR 16, 128, 1
- SAVE_GPR 17, 136, 1
- SAVE_GPR 18, 144, 1
- SAVE_GPR 19, 152, 1
- SAVE_GPR 20, 160, 1
- SAVE_GPR 21, 168, 1
- SAVE_GPR 22, 176, 1
- SAVE_GPR 23, 184, 1
- SAVE_GPR 24, 192, 1
- SAVE_GPR 25, 200, 1
- SAVE_GPR 26, 208, 1
- SAVE_GPR 27, 216, 1
- SAVE_GPR 28, 224, 1
- SAVE_GPR 29, 232, 1
- SAVE_GPR 30, 240, 1
- SAVE_GPR 31, 248, 1
-
- # Init poly1305
- bl Poly1305_init_64
-
- li 25, 0 # offset to inp and outp
-
- add 11, 25, 4
-
- # load h
- # h0, h1, h2?
- ld 27, 0(3)
- ld 28, 8(3)
- lwz 29, 16(3)
-
- li 30, 16
- divdu 31, 5, 30
-
- mtctr 31
-
- mr 24, 6 # highbit
-
-Loop_block_64:
- vxor 9, 9, 9
-
- ld 20, 0(11)
- ld 21, 8(11)
- addi 11, 11, 16
-
- addc 27, 27, 20
- adde 28, 28, 21
- adde 29, 29, 24
-
- li 22, 0
- mtvsrdd 32+6, 27, 28 # h0, h1
- mtvsrdd 32+8, 29, 22 # h2
-
- bl Poly1305_mult
-
- bl Carry_reduction
-
- bdnz Loop_block_64
-
- std 27, 0(3)
- std 28, 8(3)
- stw 29, 16(3)
-
- li 3, 0
-
- RESTORE_GPR 14, 112, 1
- RESTORE_GPR 15, 120, 1
- RESTORE_GPR 16, 128, 1
- RESTORE_GPR 17, 136, 1
- RESTORE_GPR 18, 144, 1
- RESTORE_GPR 19, 152, 1
- RESTORE_GPR 20, 160, 1
- RESTORE_GPR 21, 168, 1
- RESTORE_GPR 22, 176, 1
- RESTORE_GPR 23, 184, 1
- RESTORE_GPR 24, 192, 1
- RESTORE_GPR 25, 200, 1
- RESTORE_GPR 26, 208, 1
- RESTORE_GPR 27, 216, 1
- RESTORE_GPR 28, 224, 1
- RESTORE_GPR 29, 232, 1
- RESTORE_GPR 30, 240, 1
- RESTORE_GPR 31, 248, 1
-
- addi 1, 1, 400
- ld 0, 16(1)
- mtlr 0
-
- blr
-
-Out_no_poly1305_64:
- li 3, 0
- blr
-SYM_FUNC_END(poly1305_64s)
-
-#
-# Input: r3 = h, r4 = s, r5 = mac
-# mac = h + s
-#
-SYM_FUNC_START(poly1305_emit_64)
- ld 10, 0(3)
- ld 11, 8(3)
- ld 12, 16(3)
-
- # compare modulus
- # h + 5 + (-p)
- mr 6, 10
- mr 7, 11
- mr 8, 12
- addic. 6, 6, 5
- addze 7, 7
- addze 8, 8
- srdi 9, 8, 2 # overflow?
- cmpdi 9, 0
- beq Skip_h64
- mr 10, 6
- mr 11, 7
- mr 12, 8
-
-Skip_h64:
- ld 6, 0(4)
- ld 7, 8(4)
- addc 10, 10, 6
- adde 11, 11, 7
- addze 12, 12
-
- std 10, 0(5)
- std 11, 8(5)
- blr
-SYM_FUNC_END(poly1305_emit_64)
-
-SYM_DATA_START_LOCAL(RMASK)
-.align 5
-rmask:
-.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
-cnum:
-.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000
-.long 0x1a, 0x00, 0x1a, 0x00
-.long 0x01000000, 0x01000000, 0x01000000, 0x01000000
-.long 0x00010203, 0x04050607, 0x10111213, 0x14151617
-.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f
-SYM_DATA_END(RMASK)
diff --git a/arch/powerpc/crypto/sha1-spe-glue.c b/arch/powerpc/crypto/sha1-spe-glue.c
index 9170892a8557..04c88e173ce1 100644
--- a/arch/powerpc/crypto/sha1-spe-glue.c
+++ b/arch/powerpc/crypto/sha1-spe-glue.c
@@ -7,16 +7,13 @@
* Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
*/
+#include <asm/switch_to.h>
#include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
#include <crypto/sha1.h>
#include <crypto/sha1_base.h>
-#include <asm/byteorder.h>
-#include <asm/switch_to.h>
-#include <linux/hardirq.h>
+#include <linux/kernel.h>
+#include <linux/preempt.h>
+#include <linux/module.h>
/*
* MAX_BYTES defines the number of bytes that are allowed to be processed
@@ -30,7 +27,7 @@
*/
#define MAX_BYTES 2048
-extern void ppc_spe_sha1_transform(u32 *state, const u8 *src, u32 blocks);
+asmlinkage void ppc_spe_sha1_transform(u32 *state, const u8 *src, u32 blocks);
static void spe_begin(void)
{
@@ -46,126 +43,45 @@ static void spe_end(void)
preempt_enable();
}
-static inline void ppc_sha1_clear_context(struct sha1_state *sctx)
+static void ppc_spe_sha1_block(struct sha1_state *sctx, const u8 *src,
+ int blocks)
{
- int count = sizeof(struct sha1_state) >> 2;
- u32 *ptr = (u32 *)sctx;
-
- /* make sure we can clear the fast way */
- BUILD_BUG_ON(sizeof(struct sha1_state) % 4);
- do { *ptr++ = 0; } while (--count);
-}
-
-static int ppc_spe_sha1_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- struct sha1_state *sctx = shash_desc_ctx(desc);
- const unsigned int offset = sctx->count & 0x3f;
- const unsigned int avail = 64 - offset;
- unsigned int bytes;
- const u8 *src = data;
-
- if (avail > len) {
- sctx->count += len;
- memcpy((char *)sctx->buffer + offset, src, len);
- return 0;
- }
-
- sctx->count += len;
-
- if (offset) {
- memcpy((char *)sctx->buffer + offset, src, avail);
+ do {
+ int unit = min(blocks, MAX_BYTES / SHA1_BLOCK_SIZE);
spe_begin();
- ppc_spe_sha1_transform(sctx->state, (const u8 *)sctx->buffer, 1);
+ ppc_spe_sha1_transform(sctx->state, src, unit);
spe_end();
- len -= avail;
- src += avail;
- }
-
- while (len > 63) {
- bytes = (len > MAX_BYTES) ? MAX_BYTES : len;
- bytes = bytes & ~0x3f;
-
- spe_begin();
- ppc_spe_sha1_transform(sctx->state, src, bytes >> 6);
- spe_end();
-
- src += bytes;
- len -= bytes;
- }
-
- memcpy((char *)sctx->buffer, src, len);
- return 0;
-}
-
-static int ppc_spe_sha1_final(struct shash_desc *desc, u8 *out)
-{
- struct sha1_state *sctx = shash_desc_ctx(desc);
- const unsigned int offset = sctx->count & 0x3f;
- char *p = (char *)sctx->buffer + offset;
- int padlen;
- __be64 *pbits = (__be64 *)(((char *)&sctx->buffer) + 56);
- __be32 *dst = (__be32 *)out;
-
- padlen = 55 - offset;
- *p++ = 0x80;
-
- spe_begin();
-
- if (padlen < 0) {
- memset(p, 0x00, padlen + sizeof (u64));
- ppc_spe_sha1_transform(sctx->state, sctx->buffer, 1);
- p = (char *)sctx->buffer;
- padlen = 56;
- }
-
- memset(p, 0, padlen);
- *pbits = cpu_to_be64(sctx->count << 3);
- ppc_spe_sha1_transform(sctx->state, sctx->buffer, 1);
-
- spe_end();
-
- dst[0] = cpu_to_be32(sctx->state[0]);
- dst[1] = cpu_to_be32(sctx->state[1]);
- dst[2] = cpu_to_be32(sctx->state[2]);
- dst[3] = cpu_to_be32(sctx->state[3]);
- dst[4] = cpu_to_be32(sctx->state[4]);
-
- ppc_sha1_clear_context(sctx);
- return 0;
+ src += unit * SHA1_BLOCK_SIZE;
+ blocks -= unit;
+ } while (blocks);
}
-static int ppc_spe_sha1_export(struct shash_desc *desc, void *out)
+static int ppc_spe_sha1_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
{
- struct sha1_state *sctx = shash_desc_ctx(desc);
-
- memcpy(out, sctx, sizeof(*sctx));
- return 0;
+ return sha1_base_do_update_blocks(desc, data, len, ppc_spe_sha1_block);
}
-static int ppc_spe_sha1_import(struct shash_desc *desc, const void *in)
+static int ppc_spe_sha1_finup(struct shash_desc *desc, const u8 *src,
+ unsigned int len, u8 *out)
{
- struct sha1_state *sctx = shash_desc_ctx(desc);
-
- memcpy(sctx, in, sizeof(*sctx));
- return 0;
+ sha1_base_do_finup(desc, src, len, ppc_spe_sha1_block);
+ return sha1_base_finish(desc, out);
}
static struct shash_alg alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_base_init,
.update = ppc_spe_sha1_update,
- .final = ppc_spe_sha1_final,
- .export = ppc_spe_sha1_export,
- .import = ppc_spe_sha1_import,
- .descsize = sizeof(struct sha1_state),
- .statesize = sizeof(struct sha1_state),
+ .finup = ppc_spe_sha1_finup,
+ .descsize = SHA1_STATE_SIZE,
.base = {
.cra_name = "sha1",
.cra_driver_name= "sha1-ppc-spe",
.cra_priority = 300,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
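With CRYPTO_AHASH_ALG_BLOCK_ONLY, partial-block buffering moves out of the driver: the helper hands the transform whole 64-byte blocks and the crypto API keeps any tail for the next call. A rough model of that contract (toy code, not the real sha1_base_do_update_blocks):

    /* Toy model: process whole blocks, report the unprocessed tail. */
    static int toy_do_update_blocks(const unsigned char *data, unsigned int len,
                                    void (*block_fn)(const unsigned char *, int))
    {
        unsigned int blocks = len / 64;

        if (blocks)
            block_fn(data, blocks);
        return len % 64;   /* tail bytes left for the API to buffer */
    }
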
diff --git a/arch/powerpc/crypto/sha1.c b/arch/powerpc/crypto/sha1.c
index f283bbd3f121..4593946aa9b3 100644
--- a/arch/powerpc/crypto/sha1.c
+++ b/arch/powerpc/crypto/sha1.c
@@ -13,107 +13,46 @@
* Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
*/
#include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
#include <crypto/sha1.h>
#include <crypto/sha1_base.h>
-#include <asm/byteorder.h>
-
-void powerpc_sha_transform(u32 *state, const u8 *src);
-
-static int powerpc_sha1_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- struct sha1_state *sctx = shash_desc_ctx(desc);
- unsigned int partial, done;
- const u8 *src;
-
- partial = sctx->count & 0x3f;
- sctx->count += len;
- done = 0;
- src = data;
-
- if ((partial + len) > 63) {
-
- if (partial) {
- done = -partial;
- memcpy(sctx->buffer + partial, data, done + 64);
- src = sctx->buffer;
- }
-
- do {
- powerpc_sha_transform(sctx->state, src);
- done += 64;
- src = data + done;
- } while (done + 63 < len);
-
- partial = 0;
- }
- memcpy(sctx->buffer + partial, src, len - done);
-
- return 0;
-}
+#include <linux/kernel.h>
+#include <linux/module.h>
+asmlinkage void powerpc_sha_transform(u32 *state, const u8 *src);
-/* Add padding and return the message digest. */
-static int powerpc_sha1_final(struct shash_desc *desc, u8 *out)
+static void powerpc_sha_block(struct sha1_state *sctx, const u8 *data,
+ int blocks)
{
- struct sha1_state *sctx = shash_desc_ctx(desc);
- __be32 *dst = (__be32 *)out;
- u32 i, index, padlen;
- __be64 bits;
- static const u8 padding[64] = { 0x80, };
-
- bits = cpu_to_be64(sctx->count << 3);
-
- /* Pad out to 56 mod 64 */
- index = sctx->count & 0x3f;
- padlen = (index < 56) ? (56 - index) : ((64+56) - index);
- powerpc_sha1_update(desc, padding, padlen);
-
- /* Append length */
- powerpc_sha1_update(desc, (const u8 *)&bits, sizeof(bits));
-
- /* Store state in digest */
- for (i = 0; i < 5; i++)
- dst[i] = cpu_to_be32(sctx->state[i]);
-
- /* Wipe context */
- memset(sctx, 0, sizeof *sctx);
-
- return 0;
+ do {
+ powerpc_sha_transform(sctx->state, data);
+ data += 64;
+ } while (--blocks);
}
-static int powerpc_sha1_export(struct shash_desc *desc, void *out)
+static int powerpc_sha1_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
{
- struct sha1_state *sctx = shash_desc_ctx(desc);
-
- memcpy(out, sctx, sizeof(*sctx));
- return 0;
+ return sha1_base_do_update_blocks(desc, data, len, powerpc_sha_block);
}
-static int powerpc_sha1_import(struct shash_desc *desc, const void *in)
+/* Add padding and return the message digest. */
+static int powerpc_sha1_finup(struct shash_desc *desc, const u8 *src,
+ unsigned int len, u8 *out)
{
- struct sha1_state *sctx = shash_desc_ctx(desc);
-
- memcpy(sctx, in, sizeof(*sctx));
- return 0;
+ sha1_base_do_finup(desc, src, len, powerpc_sha_block);
+ return sha1_base_finish(desc, out);
}
static struct shash_alg alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_base_init,
.update = powerpc_sha1_update,
- .final = powerpc_sha1_final,
- .export = powerpc_sha1_export,
- .import = powerpc_sha1_import,
- .descsize = sizeof(struct sha1_state),
- .statesize = sizeof(struct sha1_state),
+ .finup = powerpc_sha1_finup,
+ .descsize = SHA1_STATE_SIZE,
.base = {
.cra_name = "sha1",
.cra_driver_name= "sha1-powerpc",
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
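Callers do not use the driver directly; they go through the synchronous hash API, which selects the highest-priority registered "sha1" implementation. A minimal usage sketch with standard crypto API calls (the function name is illustrative):

    #include <crypto/hash.h>

    static int sha1_digest_example(const u8 *data, unsigned int len, u8 *out)
    {
        struct crypto_shash *tfm = crypto_alloc_shash("sha1", 0, 0);
        int err;

        if (IS_ERR(tfm))
            return PTR_ERR(tfm);

        {
            SHASH_DESC_ON_STACK(desc, tfm);

            desc->tfm = tfm;
            err = crypto_shash_digest(desc, data, len, out);
        }
        crypto_free_shash(tfm);
        return err;
    }
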
diff --git a/arch/powerpc/crypto/sha256-spe-asm.S b/arch/powerpc/crypto/sha256-spe-asm.S
deleted file mode 100644
index cd99d71dae34..000000000000
--- a/arch/powerpc/crypto/sha256-spe-asm.S
+++ /dev/null
@@ -1,318 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Fast SHA-256 implementation for SPE instruction set (PPC)
- *
- * This code makes use of the SPE SIMD instruction set as defined in
- * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
- * Implementation is based on optimization guide notes from
- * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
- *
- * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
- */
-
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-
-#define rHP r3 /* pointer to hash values in memory */
-#define rKP r24 /* pointer to round constants */
-#define rWP r4 /* pointer to input data */
-
-#define rH0 r5 /* 8 32 bit hash values in 8 registers */
-#define rH1 r6
-#define rH2 r7
-#define rH3 r8
-#define rH4 r9
-#define rH5 r10
-#define rH6 r11
-#define rH7 r12
-
-#define rW0 r14 /* 64 bit registers. 16 words in 8 registers */
-#define rW1 r15
-#define rW2 r16
-#define rW3 r17
-#define rW4 r18
-#define rW5 r19
-#define rW6 r20
-#define rW7 r21
-
-#define rT0 r22 /* 64 bit temporaries */
-#define rT1 r23
-#define rT2 r0 /* 32 bit temporaries */
-#define rT3 r25
-
-#define CMP_KN_LOOP
-#define CMP_KC_LOOP \
- cmpwi rT1,0;
-
-#define INITIALIZE \
- stwu r1,-128(r1); /* create stack frame */ \
- evstdw r14,8(r1); /* We must save non volatile */ \
- evstdw r15,16(r1); /* registers. Take the chance */ \
- evstdw r16,24(r1); /* and save the SPE part too */ \
- evstdw r17,32(r1); \
- evstdw r18,40(r1); \
- evstdw r19,48(r1); \
- evstdw r20,56(r1); \
- evstdw r21,64(r1); \
- evstdw r22,72(r1); \
- evstdw r23,80(r1); \
- stw r24,88(r1); /* save normal registers */ \
- stw r25,92(r1);
-
-
-#define FINALIZE \
- evldw r14,8(r1); /* restore SPE registers */ \
- evldw r15,16(r1); \
- evldw r16,24(r1); \
- evldw r17,32(r1); \
- evldw r18,40(r1); \
- evldw r19,48(r1); \
- evldw r20,56(r1); \
- evldw r21,64(r1); \
- evldw r22,72(r1); \
- evldw r23,80(r1); \
- lwz r24,88(r1); /* restore normal registers */ \
- lwz r25,92(r1); \
- xor r0,r0,r0; \
- stw r0,8(r1); /* Delete sensitive data */ \
- stw r0,16(r1); /* that we might have pushed */ \
- stw r0,24(r1); /* from other context that runs */ \
- stw r0,32(r1); /* the same code. Assume that */ \
- stw r0,40(r1); /* the lower part of the GPRs */ \
- stw r0,48(r1); /* was already overwritten on */ \
- stw r0,56(r1); /* the way down to here */ \
- stw r0,64(r1); \
- stw r0,72(r1); \
- stw r0,80(r1); \
- addi r1,r1,128; /* cleanup stack frame */
-
-#ifdef __BIG_ENDIAN__
-#define LOAD_DATA(reg, off) \
- lwz reg,off(rWP); /* load data */
-#define NEXT_BLOCK \
- addi rWP,rWP,64; /* increment per block */
-#else
-#define LOAD_DATA(reg, off) \
- lwbrx reg,0,rWP; /* load data */ \
- addi rWP,rWP,4; /* increment per word */
-#define NEXT_BLOCK /* nothing to do */
-#endif
-
-#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
- LOAD_DATA(w, off) /* 1: W */ \
- rotrwi rT0,e,6; /* 1: S1 = e rotr 6 */ \
- rotrwi rT1,e,11; /* 1: S1' = e rotr 11 */ \
- rotrwi rT2,e,25; /* 1: S1" = e rotr 25 */ \
- xor rT0,rT0,rT1; /* 1: S1 = S1 xor S1' */ \
- and rT3,e,f; /* 1: ch = e and f */ \
- xor rT0,rT0,rT2; /* 1: S1 = S1 xor S1" */ \
- andc rT1,g,e; /* 1: ch' = ~e and g */ \
- lwz rT2,off(rKP); /* 1: K */ \
- xor rT3,rT3,rT1; /* 1: ch = ch xor ch' */ \
- add h,h,rT0; /* 1: temp1 = h + S1 */ \
- add rT3,rT3,w; /* 1: temp1' = ch + w */ \
- rotrwi rT0,a,2; /* 1: S0 = a rotr 2 */ \
- add h,h,rT3; /* 1: temp1 = temp1 + temp1' */ \
- rotrwi rT1,a,13; /* 1: S0' = a rotr 13 */ \
- add h,h,rT2; /* 1: temp1 = temp1 + K */ \
- rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \
- xor rT0,rT0,rT1; /* 1: S0 = S0 xor S0' */ \
- add d,d,h; /* 1: d = d + temp1 */ \
- xor rT3,rT0,rT3; /* 1: S0 = S0 xor S0" */ \
- evmergelo w,w,w; /* shift W */ \
- or rT2,a,b; /* 1: maj = a or b */ \
- and rT1,a,b; /* 1: maj' = a and b */ \
- and rT2,rT2,c; /* 1: maj = maj and c */ \
- LOAD_DATA(w, off+4) /* 2: W */ \
- or rT2,rT1,rT2; /* 1: maj = maj or maj' */ \
- rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \
- add rT3,rT3,rT2; /* 1: temp2 = S0 + maj */ \
- rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \
- add h,h,rT3; /* 1: h = temp1 + temp2 */ \
- rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \
- xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \
- and rT3,d,e; /* 2: ch = e and f */ \
- xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \
- andc rT1,f,d; /* 2: ch' = ~e and g */ \
- lwz rT2,off+4(rKP); /* 2: K */ \
- xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \
- add g,g,rT0; /* 2: temp1 = h + S1 */ \
- add rT3,rT3,w; /* 2: temp1' = ch + w */ \
- rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \
- add g,g,rT3; /* 2: temp1 = temp1 + temp1' */ \
- rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \
- add g,g,rT2; /* 2: temp1 = temp1 + K */ \
- rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \
- xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \
- or rT2,h,a; /* 2: maj = a or b */ \
- xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \
- and rT1,h,a; /* 2: maj' = a and b */ \
- and rT2,rT2,b; /* 2: maj = maj and c */ \
- add c,c,g; /* 2: d = d + temp1 */ \
- or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \
- add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \
- add g,g,rT3 /* 2: h = temp1 + temp2 */
-
-#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
- rotrwi rT2,e,6; /* 1: S1 = e rotr 6 */ \
- evmergelohi rT0,w0,w1; /* w[-15] */ \
- rotrwi rT3,e,11; /* 1: S1' = e rotr 11 */ \
- evsrwiu rT1,rT0,3; /* s0 = w[-15] >> 3 */ \
- xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \
- evrlwi rT0,rT0,25; /* s0' = w[-15] rotr 7 */ \
- rotrwi rT3,e,25; /* 1: S1' = e rotr 25 */ \
- evxor rT1,rT1,rT0; /* s0 = s0 xor s0' */ \
- xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \
- evrlwi rT0,rT0,21; /* s0' = w[-15] rotr 18 */ \
- add h,h,rT2; /* 1: temp1 = h + S1 */ \
- evxor rT0,rT0,rT1; /* s0 = s0 xor s0' */ \
- and rT2,e,f; /* 1: ch = e and f */ \
- evaddw w0,w0,rT0; /* w = w[-16] + s0 */ \
- andc rT3,g,e; /* 1: ch' = ~e and g */ \
- evsrwiu rT0,w7,10; /* s1 = w[-2] >> 10 */ \
- xor rT2,rT2,rT3; /* 1: ch = ch xor ch' */ \
- evrlwi rT1,w7,15; /* s1' = w[-2] rotr 17 */ \
- add h,h,rT2; /* 1: temp1 = temp1 + ch */ \
- evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \
- rotrwi rT2,a,2; /* 1: S0 = a rotr 2 */ \
- evrlwi rT1,w7,13; /* s1' = w[-2] rotr 19 */ \
- rotrwi rT3,a,13; /* 1: S0' = a rotr 13 */ \
- evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \
- xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \
- evldw rT1,off(rKP); /* k */ \
- rotrwi rT3,a,22; /* 1: S0' = a rotr 22 */ \
- evaddw w0,w0,rT0; /* w = w + s1 */ \
- xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \
- evmergelohi rT0,w4,w5; /* w[-7] */ \
- and rT3,a,b; /* 1: maj = a and b */ \
- evaddw w0,w0,rT0; /* w = w + w[-7] */ \
- CMP_K##k##_LOOP \
- add rT2,rT2,rT3; /* 1: temp2 = S0 + maj */ \
- evaddw rT1,rT1,w0; /* wk = w + k */ \
- xor rT3,a,b; /* 1: maj = a xor b */ \
- evmergehi rT0,rT1,rT1; /* wk1/wk2 */ \
- and rT3,rT3,c; /* 1: maj = maj and c */ \
- add h,h,rT0; /* 1: temp1 = temp1 + wk */ \
- add rT2,rT2,rT3; /* 1: temp2 = temp2 + maj */ \
- add g,g,rT1; /* 2: temp1 = temp1 + wk */ \
- add d,d,h; /* 1: d = d + temp1 */ \
- rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \
- add h,h,rT2; /* 1: h = temp1 + temp2 */ \
- rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \
-	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25	*/ \
- xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \
- and rT3,d,e; /* 2: ch = e and f */ \
- xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \
- andc rT1,f,d; /* 2: ch' = ~e and g */ \
- add g,g,rT0; /* 2: temp1 = h + S1 */ \
- xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \
- rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \
- add g,g,rT3; /* 2: temp1 = temp1 + ch */ \
- rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \
- rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \
- xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \
- or rT2,h,a; /* 2: maj = a or b */ \
- and rT1,h,a; /* 2: maj' = a and b */ \
- and rT2,rT2,b; /* 2: maj = maj and c */ \
- xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \
- or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \
- add c,c,g; /* 2: d = d + temp1 */ \
- add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \
- add g,g,rT3 /* 2: h = temp1 + temp2 */
-
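Each R_LOAD_W/R_CALC_W invocation interleaves two iterations of the standard SHA-256 round, and R_CALC_W additionally extends the message schedule with the s0/s1 terms annotated above. Stripped of the SPE pairing tricks, one round and one schedule step look like this plain C reference (following FIPS 180-4; names match the macro comments):

    #include <stdint.h>

    #define ROTR(x, n) (((x) >> (n)) | ((x) << (32 - (n))))

    /* One SHA-256 round; v = {a,b,c,d,e,f,g,h}, k = round constant. */
    static void sha256_round(uint32_t v[8], uint32_t k, uint32_t w)
    {
        uint32_t a = v[0], b = v[1], c = v[2], e = v[4], f = v[5], g = v[6];
        uint32_t S1 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25);
        uint32_t ch = (e & f) ^ (~e & g);
        uint32_t temp1 = v[7] + S1 + ch + k + w;
        uint32_t S0 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22);
        uint32_t maj = (a & b) | ((a | b) & c);  /* == (a&b)^(a&c)^(b&c) */
        uint32_t temp2 = S0 + maj;

        v[7] = g; v[6] = f; v[5] = v[4]; v[4] = v[3] + temp1;
        v[3] = c; v[2] = b; v[1] = a; v[0] = temp1 + temp2;
    }

    /* Schedule step for round i >= 16, over a 16-word circular buffer. */
    static uint32_t sha256_schedule(const uint32_t w[16], int i)
    {
        uint32_t w15 = w[(i - 15) & 15], w2 = w[(i - 2) & 15];
        uint32_t s0 = ROTR(w15, 7) ^ ROTR(w15, 18) ^ (w15 >> 3);
        uint32_t s1 = ROTR(w2, 17) ^ ROTR(w2, 19) ^ (w2 >> 10);

        return w[i & 15] + s0 + w[(i - 7) & 15] + s1;
    }
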
-_GLOBAL(ppc_spe_sha256_transform)
- INITIALIZE
-
- mtctr r5
- lwz rH0,0(rHP)
- lwz rH1,4(rHP)
- lwz rH2,8(rHP)
- lwz rH3,12(rHP)
- lwz rH4,16(rHP)
- lwz rH5,20(rHP)
- lwz rH6,24(rHP)
- lwz rH7,28(rHP)
-
-ppc_spe_sha256_main:
- lis rKP,PPC_SPE_SHA256_K@ha
- addi rKP,rKP,PPC_SPE_SHA256_K@l
-
- R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
- R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
- R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
- R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
- R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
- R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
- R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
- R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)
-ppc_spe_sha256_16_rounds:
- addi rKP,rKP,64
- R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
- rW0, rW1, rW4, rW5, rW7, N, 0)
- R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
- rW1, rW2, rW5, rW6, rW0, N, 8)
- R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
- rW2, rW3, rW6, rW7, rW1, N, 16)
- R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
- rW3, rW4, rW7, rW0, rW2, N, 24)
- R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
- rW4, rW5, rW0, rW1, rW3, N, 32)
- R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
- rW5, rW6, rW1, rW2, rW4, N, 40)
- R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
- rW6, rW7, rW2, rW3, rW5, N, 48)
- R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
- rW7, rW0, rW3, rW4, rW6, C, 56)
- bt gt,ppc_spe_sha256_16_rounds
-
- lwz rW0,0(rHP)
- NEXT_BLOCK
- lwz rW1,4(rHP)
- lwz rW2,8(rHP)
- lwz rW3,12(rHP)
- lwz rW4,16(rHP)
- lwz rW5,20(rHP)
- lwz rW6,24(rHP)
- lwz rW7,28(rHP)
-
- add rH0,rH0,rW0
- stw rH0,0(rHP)
- add rH1,rH1,rW1
- stw rH1,4(rHP)
- add rH2,rH2,rW2
- stw rH2,8(rHP)
- add rH3,rH3,rW3
- stw rH3,12(rHP)
- add rH4,rH4,rW4
- stw rH4,16(rHP)
- add rH5,rH5,rW5
- stw rH5,20(rHP)
- add rH6,rH6,rW6
- stw rH6,24(rHP)
- add rH7,rH7,rW7
- stw rH7,28(rHP)
-
- bdnz ppc_spe_sha256_main
-
- FINALIZE
- blr
-
-.data
-.align 5
-PPC_SPE_SHA256_K:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
diff --git a/arch/powerpc/crypto/sha256-spe-glue.c b/arch/powerpc/crypto/sha256-spe-glue.c
deleted file mode 100644
index 2997d13236e0..000000000000
--- a/arch/powerpc/crypto/sha256-spe-glue.c
+++ /dev/null
@@ -1,235 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Glue code for SHA-256 implementation for SPE instructions (PPC)
- *
- * Based on the generic implementation. The assembler module takes care
- * of the SPE registers so it can run from interrupt context.
- *
- * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
- */
-
-#include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-#include <asm/byteorder.h>
-#include <asm/switch_to.h>
-#include <linux/hardirq.h>
-
-/*
- * MAX_BYTES defines the number of bytes that are allowed to be processed
- * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000
- * operations per 64 bytes. e500 cores can issue two arithmetic instructions
- * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
- * Thus 1KB of input data will need an estimated maximum of 18,000 cycles.
- * Headroom for cache misses included. Even with the low end model clocked
- * at 667 MHz this corresponds to a critical time window of less than 27us.
- */
-#define MAX_BYTES 1024
-
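Spelled out, the budget in the comment works out as follows (back-of-envelope, rounding as the author did):

    1024 bytes / 64 bytes per block  = 16 blocks
    16 blocks * ~2000 operations     = ~32000 operations
    ~32000 ops / 2 issued per cycle  = ~16000 cycles (~18000 with headroom)
    ~18000 cycles / 667 MHz          = ~27 us with preemption disabled
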
-extern void ppc_spe_sha256_transform(u32 *state, const u8 *src, u32 blocks);
-
-static void spe_begin(void)
-{
- /* We just start SPE operations and will save SPE registers later. */
- preempt_disable();
- enable_kernel_spe();
-}
-
-static void spe_end(void)
-{
- disable_kernel_spe();
- /* reenable preemption */
- preempt_enable();
-}
-
-static inline void ppc_sha256_clear_context(struct sha256_state *sctx)
-{
- int count = sizeof(struct sha256_state) >> 2;
- u32 *ptr = (u32 *)sctx;
-
- /* make sure we can clear the fast way */
- BUILD_BUG_ON(sizeof(struct sha256_state) % 4);
- do { *ptr++ = 0; } while (--count);
-}
-
-static int ppc_spe_sha256_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- struct sha256_state *sctx = shash_desc_ctx(desc);
- const unsigned int offset = sctx->count & 0x3f;
- const unsigned int avail = 64 - offset;
- unsigned int bytes;
- const u8 *src = data;
-
- if (avail > len) {
- sctx->count += len;
- memcpy((char *)sctx->buf + offset, src, len);
- return 0;
- }
-
- sctx->count += len;
-
- if (offset) {
- memcpy((char *)sctx->buf + offset, src, avail);
-
- spe_begin();
- ppc_spe_sha256_transform(sctx->state, (const u8 *)sctx->buf, 1);
- spe_end();
-
- len -= avail;
- src += avail;
- }
-
- while (len > 63) {
- /* cut input data into smaller blocks */
- bytes = (len > MAX_BYTES) ? MAX_BYTES : len;
- bytes = bytes & ~0x3f;
-
- spe_begin();
- ppc_spe_sha256_transform(sctx->state, src, bytes >> 6);
- spe_end();
-
- src += bytes;
- len -= bytes;
- }
-
- memcpy((char *)sctx->buf, src, len);
- return 0;
-}
-
-static int ppc_spe_sha256_final(struct shash_desc *desc, u8 *out)
-{
- struct sha256_state *sctx = shash_desc_ctx(desc);
- const unsigned int offset = sctx->count & 0x3f;
- char *p = (char *)sctx->buf + offset;
- int padlen;
- __be64 *pbits = (__be64 *)(((char *)&sctx->buf) + 56);
- __be32 *dst = (__be32 *)out;
-
- padlen = 55 - offset;
- *p++ = 0x80;
-
- spe_begin();
-
- if (padlen < 0) {
- memset(p, 0x00, padlen + sizeof (u64));
- ppc_spe_sha256_transform(sctx->state, sctx->buf, 1);
- p = (char *)sctx->buf;
- padlen = 56;
- }
-
- memset(p, 0, padlen);
- *pbits = cpu_to_be64(sctx->count << 3);
- ppc_spe_sha256_transform(sctx->state, sctx->buf, 1);
-
- spe_end();
-
- dst[0] = cpu_to_be32(sctx->state[0]);
- dst[1] = cpu_to_be32(sctx->state[1]);
- dst[2] = cpu_to_be32(sctx->state[2]);
- dst[3] = cpu_to_be32(sctx->state[3]);
- dst[4] = cpu_to_be32(sctx->state[4]);
- dst[5] = cpu_to_be32(sctx->state[5]);
- dst[6] = cpu_to_be32(sctx->state[6]);
- dst[7] = cpu_to_be32(sctx->state[7]);
-
- ppc_sha256_clear_context(sctx);
- return 0;
-}
-
-static int ppc_spe_sha224_final(struct shash_desc *desc, u8 *out)
-{
- __be32 D[SHA256_DIGEST_SIZE >> 2];
- __be32 *dst = (__be32 *)out;
-
- ppc_spe_sha256_final(desc, (u8 *)D);
-
- /* avoid bytewise memcpy */
- dst[0] = D[0];
- dst[1] = D[1];
- dst[2] = D[2];
- dst[3] = D[3];
- dst[4] = D[4];
- dst[5] = D[5];
- dst[6] = D[6];
-
- /* clear sensitive data */
- memzero_explicit(D, SHA256_DIGEST_SIZE);
- return 0;
-}
-
-static int ppc_spe_sha256_export(struct shash_desc *desc, void *out)
-{
- struct sha256_state *sctx = shash_desc_ctx(desc);
-
- memcpy(out, sctx, sizeof(*sctx));
- return 0;
-}
-
-static int ppc_spe_sha256_import(struct shash_desc *desc, const void *in)
-{
- struct sha256_state *sctx = shash_desc_ctx(desc);
-
- memcpy(sctx, in, sizeof(*sctx));
- return 0;
-}
-
-static struct shash_alg algs[2] = { {
- .digestsize = SHA256_DIGEST_SIZE,
- .init = sha256_base_init,
- .update = ppc_spe_sha256_update,
- .final = ppc_spe_sha256_final,
- .export = ppc_spe_sha256_export,
- .import = ppc_spe_sha256_import,
- .descsize = sizeof(struct sha256_state),
- .statesize = sizeof(struct sha256_state),
- .base = {
- .cra_name = "sha256",
- .cra_driver_name= "sha256-ppc-spe",
- .cra_priority = 300,
- .cra_blocksize = SHA256_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA224_DIGEST_SIZE,
- .init = sha224_base_init,
- .update = ppc_spe_sha256_update,
- .final = ppc_spe_sha224_final,
- .export = ppc_spe_sha256_export,
- .import = ppc_spe_sha256_import,
- .descsize = sizeof(struct sha256_state),
- .statesize = sizeof(struct sha256_state),
- .base = {
- .cra_name = "sha224",
- .cra_driver_name= "sha224-ppc-spe",
- .cra_priority = 300,
- .cra_blocksize = SHA224_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int __init ppc_spe_sha256_mod_init(void)
-{
- return crypto_register_shashes(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit ppc_spe_sha256_mod_fini(void)
-{
- crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-}
-
-module_init(ppc_spe_sha256_mod_init);
-module_exit(ppc_spe_sha256_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-224 and SHA-256 Secure Hash Algorithm, SPE optimized");
-
-MODULE_ALIAS_CRYPTO("sha224");
-MODULE_ALIAS_CRYPTO("sha224-ppc-spe");
-MODULE_ALIAS_CRYPTO("sha256");
-MODULE_ALIAS_CRYPTO("sha256-ppc-spe");
diff --git a/arch/powerpc/crypto/vmx.c b/arch/powerpc/crypto/vmx.c
index 7eb713cc87c8..0b725e826388 100644
--- a/arch/powerpc/crypto/vmx.c
+++ b/arch/powerpc/crypto/vmx.c
@@ -74,4 +74,4 @@ MODULE_DESCRIPTION("IBM VMX cryptographic acceleration instructions "
"support on Power 8");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0.0");
-MODULE_IMPORT_NS(CRYPTO_INTERNAL);
+MODULE_IMPORT_NS("CRYPTO_INTERNAL");