aboutsummaryrefslogtreecommitdiffstats
path: root/arch/s390/crypto
diff options
context:
space:
mode:
Diffstat (limited to 'arch/s390/crypto')
-rw-r--r--arch/s390/crypto/Makefile3
-rw-r--r--arch/s390/crypto/aes_s390.c113
-rw-r--r--arch/s390/crypto/crc32-vx.c310
-rw-r--r--arch/s390/crypto/crc32be-vx.S207
-rw-r--r--arch/s390/crypto/crc32le-vx.S268
5 files changed, 848 insertions, 53 deletions
diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile
index 7f0b7cda6259..d1033de4c4ee 100644
--- a/arch/s390/crypto/Makefile
+++ b/arch/s390/crypto/Makefile
@@ -9,3 +9,6 @@ obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o
obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o
obj-$(CONFIG_S390_PRNG) += prng.o
obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o
+obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o
+
+crc32-vx_s390-y := crc32-vx.o crc32le-vx.o crc32be-vx.o
diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c
index 7554a8bb2adc..2ea18b050309 100644
--- a/arch/s390/crypto/aes_s390.c
+++ b/arch/s390/crypto/aes_s390.c
@@ -22,6 +22,7 @@
#include <crypto/aes.h>
#include <crypto/algapi.h>
+#include <crypto/internal/skcipher.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
@@ -44,7 +45,7 @@ struct s390_aes_ctx {
long dec;
int key_len;
union {
- struct crypto_blkcipher *blk;
+ struct crypto_skcipher *blk;
struct crypto_cipher *cip;
} fallback;
};
@@ -63,7 +64,7 @@ struct s390_xts_ctx {
long enc;
long dec;
int key_len;
- struct crypto_blkcipher *fallback;
+ struct crypto_skcipher *fallback;
};
/*
@@ -237,16 +238,16 @@ static int setkey_fallback_blk(struct crypto_tfm *tfm, const u8 *key,
struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
unsigned int ret;
- sctx->fallback.blk->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK;
- sctx->fallback.blk->base.crt_flags |= (tfm->crt_flags &
- CRYPTO_TFM_REQ_MASK);
+ crypto_skcipher_clear_flags(sctx->fallback.blk, CRYPTO_TFM_REQ_MASK);
+ crypto_skcipher_set_flags(sctx->fallback.blk, tfm->crt_flags &
+ CRYPTO_TFM_REQ_MASK);
+
+ ret = crypto_skcipher_setkey(sctx->fallback.blk, key, len);
+
+ tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
+ tfm->crt_flags |= crypto_skcipher_get_flags(sctx->fallback.blk) &
+ CRYPTO_TFM_RES_MASK;
- ret = crypto_blkcipher_setkey(sctx->fallback.blk, key, len);
- if (ret) {
- tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
- tfm->crt_flags |= (sctx->fallback.blk->base.crt_flags &
- CRYPTO_TFM_RES_MASK);
- }
return ret;
}
@@ -255,15 +256,17 @@ static int fallback_blk_dec(struct blkcipher_desc *desc,
unsigned int nbytes)
{
unsigned int ret;
- struct crypto_blkcipher *tfm;
- struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm);
+ struct crypto_blkcipher *tfm = desc->tfm;
+ struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(tfm);
+ SKCIPHER_REQUEST_ON_STACK(req, sctx->fallback.blk);
- tfm = desc->tfm;
- desc->tfm = sctx->fallback.blk;
+ skcipher_request_set_tfm(req, sctx->fallback.blk);
+ skcipher_request_set_callback(req, desc->flags, NULL, NULL);
+ skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
- ret = crypto_blkcipher_decrypt_iv(desc, dst, src, nbytes);
+ ret = crypto_skcipher_decrypt(req);
- desc->tfm = tfm;
+ skcipher_request_zero(req);
return ret;
}
@@ -272,15 +275,15 @@ static int fallback_blk_enc(struct blkcipher_desc *desc,
unsigned int nbytes)
{
unsigned int ret;
- struct crypto_blkcipher *tfm;
- struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm);
+ struct crypto_blkcipher *tfm = desc->tfm;
+ struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(tfm);
+ SKCIPHER_REQUEST_ON_STACK(req, sctx->fallback.blk);
- tfm = desc->tfm;
- desc->tfm = sctx->fallback.blk;
+ skcipher_request_set_tfm(req, sctx->fallback.blk);
+ skcipher_request_set_callback(req, desc->flags, NULL, NULL);
+ skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
- ret = crypto_blkcipher_encrypt_iv(desc, dst, src, nbytes);
-
- desc->tfm = tfm;
+ ret = crypto_skcipher_encrypt(req);
return ret;
}
@@ -370,8 +373,9 @@ static int fallback_init_blk(struct crypto_tfm *tfm)
const char *name = tfm->__crt_alg->cra_name;
struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
- sctx->fallback.blk = crypto_alloc_blkcipher(name, 0,
- CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
+ sctx->fallback.blk = crypto_alloc_skcipher(name, 0,
+ CRYPTO_ALG_ASYNC |
+ CRYPTO_ALG_NEED_FALLBACK);
if (IS_ERR(sctx->fallback.blk)) {
pr_err("Allocating AES fallback algorithm %s failed\n",
@@ -386,8 +390,7 @@ static void fallback_exit_blk(struct crypto_tfm *tfm)
{
struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
- crypto_free_blkcipher(sctx->fallback.blk);
- sctx->fallback.blk = NULL;
+ crypto_free_skcipher(sctx->fallback.blk);
}
static struct crypto_alg ecb_aes_alg = {
@@ -536,16 +539,16 @@ static int xts_fallback_setkey(struct crypto_tfm *tfm, const u8 *key,
struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm);
unsigned int ret;
- xts_ctx->fallback->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK;
- xts_ctx->fallback->base.crt_flags |= (tfm->crt_flags &
- CRYPTO_TFM_REQ_MASK);
+ crypto_skcipher_clear_flags(xts_ctx->fallback, CRYPTO_TFM_REQ_MASK);
+ crypto_skcipher_set_flags(xts_ctx->fallback, tfm->crt_flags &
+ CRYPTO_TFM_REQ_MASK);
+
+ ret = crypto_skcipher_setkey(xts_ctx->fallback, key, len);
+
+ tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
+ tfm->crt_flags |= crypto_skcipher_get_flags(xts_ctx->fallback) &
+ CRYPTO_TFM_RES_MASK;
- ret = crypto_blkcipher_setkey(xts_ctx->fallback, key, len);
- if (ret) {
- tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
- tfm->crt_flags |= (xts_ctx->fallback->base.crt_flags &
- CRYPTO_TFM_RES_MASK);
- }
return ret;
}
@@ -553,16 +556,18 @@ static int xts_fallback_decrypt(struct blkcipher_desc *desc,
struct scatterlist *dst, struct scatterlist *src,
unsigned int nbytes)
{
- struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm);
- struct crypto_blkcipher *tfm;
+ struct crypto_blkcipher *tfm = desc->tfm;
+ struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(tfm);
+ SKCIPHER_REQUEST_ON_STACK(req, xts_ctx->fallback);
unsigned int ret;
- tfm = desc->tfm;
- desc->tfm = xts_ctx->fallback;
+ skcipher_request_set_tfm(req, xts_ctx->fallback);
+ skcipher_request_set_callback(req, desc->flags, NULL, NULL);
+ skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
- ret = crypto_blkcipher_decrypt_iv(desc, dst, src, nbytes);
+ ret = crypto_skcipher_decrypt(req);
- desc->tfm = tfm;
+ skcipher_request_zero(req);
return ret;
}
@@ -570,16 +575,18 @@ static int xts_fallback_encrypt(struct blkcipher_desc *desc,
struct scatterlist *dst, struct scatterlist *src,
unsigned int nbytes)
{
- struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm);
- struct crypto_blkcipher *tfm;
+ struct crypto_blkcipher *tfm = desc->tfm;
+ struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(tfm);
+ SKCIPHER_REQUEST_ON_STACK(req, xts_ctx->fallback);
unsigned int ret;
- tfm = desc->tfm;
- desc->tfm = xts_ctx->fallback;
+ skcipher_request_set_tfm(req, xts_ctx->fallback);
+ skcipher_request_set_callback(req, desc->flags, NULL, NULL);
+ skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
- ret = crypto_blkcipher_encrypt_iv(desc, dst, src, nbytes);
+ ret = crypto_skcipher_encrypt(req);
- desc->tfm = tfm;
+ skcipher_request_zero(req);
return ret;
}
@@ -700,8 +707,9 @@ static int xts_fallback_init(struct crypto_tfm *tfm)
const char *name = tfm->__crt_alg->cra_name;
struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm);
- xts_ctx->fallback = crypto_alloc_blkcipher(name, 0,
- CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
+ xts_ctx->fallback = crypto_alloc_skcipher(name, 0,
+ CRYPTO_ALG_ASYNC |
+ CRYPTO_ALG_NEED_FALLBACK);
if (IS_ERR(xts_ctx->fallback)) {
pr_err("Allocating XTS fallback algorithm %s failed\n",
@@ -715,8 +723,7 @@ static void xts_fallback_exit(struct crypto_tfm *tfm)
{
struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm);
- crypto_free_blkcipher(xts_ctx->fallback);
- xts_ctx->fallback = NULL;
+ crypto_free_skcipher(xts_ctx->fallback);
}
static struct crypto_alg xts_aes_alg = {
diff --git a/arch/s390/crypto/crc32-vx.c b/arch/s390/crypto/crc32-vx.c
new file mode 100644
index 000000000000..577ae1d4ae89
--- /dev/null
+++ b/arch/s390/crypto/crc32-vx.c
@@ -0,0 +1,310 @@
+/*
+ * Crypto-API module for CRC-32 algorithms implemented with the
+ * z/Architecture Vector Extension Facility.
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+#define KMSG_COMPONENT "crc32-vx"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+#include <linux/crc32.h>
+#include <crypto/internal/hash.h>
+#include <asm/fpu/api.h>
+
+
+#define CRC32_BLOCK_SIZE 1
+#define CRC32_DIGEST_SIZE 4
+
+#define VX_MIN_LEN 64
+#define VX_ALIGNMENT 16L
+#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
+
+struct crc_ctx {
+ u32 key;
+};
+
+struct crc_desc_ctx {
+ u32 crc;
+};
+
+/* Prototypes for functions in assembly files */
+u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
+u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
+u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
+
+/*
+ * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension
+ *
+ * Creates a function to perform a particular CRC-32 computation. Depending
+ * on the message buffer, the hardware-accelerated or software implementation
+ * is used. Note that the message buffer is aligned to improve fetch
+ * operations of VECTOR LOAD MULTIPLE instructions.
+ *
+ */
+#define DEFINE_CRC32_VX(___fname, ___crc32_vx, ___crc32_sw) \
+ static u32 __pure ___fname(u32 crc, \
+ unsigned char const *data, size_t datalen) \
+ { \
+ struct kernel_fpu vxstate; \
+ unsigned long prealign, aligned, remaining; \
+ \
+ if ((unsigned long)data & VX_ALIGN_MASK) { \
+ prealign = VX_ALIGNMENT - \
+ ((unsigned long)data & VX_ALIGN_MASK); \
+ datalen -= prealign; \
+ crc = ___crc32_sw(crc, data, prealign); \
+ data = (void *)((unsigned long)data + prealign); \
+ } \
+ \
+ if (datalen < VX_MIN_LEN) \
+ return ___crc32_sw(crc, data, datalen); \
+ \
+ aligned = datalen & ~VX_ALIGN_MASK; \
+ remaining = datalen & VX_ALIGN_MASK; \
+ \
+ kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW); \
+ crc = ___crc32_vx(crc, data, aligned); \
+ kernel_fpu_end(&vxstate); \
+ \
+ if (remaining) \
+ crc = ___crc32_sw(crc, data + aligned, remaining); \
+ \
+ return crc; \
+ }
+
+DEFINE_CRC32_VX(crc32_le_vx, crc32_le_vgfm_16, crc32_le)
+DEFINE_CRC32_VX(crc32_be_vx, crc32_be_vgfm_16, crc32_be)
+DEFINE_CRC32_VX(crc32c_le_vx, crc32c_le_vgfm_16, __crc32c_le)
+
+
+static int crc32_vx_cra_init_zero(struct crypto_tfm *tfm)
+{
+ struct crc_ctx *mctx = crypto_tfm_ctx(tfm);
+
+ mctx->key = 0;
+ return 0;
+}
+
+static int crc32_vx_cra_init_invert(struct crypto_tfm *tfm)
+{
+ struct crc_ctx *mctx = crypto_tfm_ctx(tfm);
+
+ mctx->key = ~0;
+ return 0;
+}
+
+static int crc32_vx_init(struct shash_desc *desc)
+{
+ struct crc_ctx *mctx = crypto_shash_ctx(desc->tfm);
+ struct crc_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ ctx->crc = mctx->key;
+ return 0;
+}
+
+static int crc32_vx_setkey(struct crypto_shash *tfm, const u8 *newkey,
+ unsigned int newkeylen)
+{
+ struct crc_ctx *mctx = crypto_shash_ctx(tfm);
+
+ if (newkeylen != sizeof(mctx->key)) {
+ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ mctx->key = le32_to_cpu(*(__le32 *)newkey);
+ return 0;
+}
+
+static int crc32be_vx_setkey(struct crypto_shash *tfm, const u8 *newkey,
+ unsigned int newkeylen)
+{
+ struct crc_ctx *mctx = crypto_shash_ctx(tfm);
+
+ if (newkeylen != sizeof(mctx->key)) {
+ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ mctx->key = be32_to_cpu(*(__be32 *)newkey);
+ return 0;
+}
+
+static int crc32le_vx_final(struct shash_desc *desc, u8 *out)
+{
+ struct crc_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ *(__le32 *)out = cpu_to_le32p(&ctx->crc);
+ return 0;
+}
+
+static int crc32be_vx_final(struct shash_desc *desc, u8 *out)
+{
+ struct crc_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ *(__be32 *)out = cpu_to_be32p(&ctx->crc);
+ return 0;
+}
+
+static int crc32c_vx_final(struct shash_desc *desc, u8 *out)
+{
+ struct crc_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ /*
+ * Perform a final XOR with 0xFFFFFFFF to be in sync
+ * with the generic crc32c shash implementation.
+ */
+ *(__le32 *)out = ~cpu_to_le32p(&ctx->crc);
+ return 0;
+}
+
+static int __crc32le_vx_finup(u32 *crc, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ *(__le32 *)out = cpu_to_le32(crc32_le_vx(*crc, data, len));
+ return 0;
+}
+
+static int __crc32be_vx_finup(u32 *crc, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ *(__be32 *)out = cpu_to_be32(crc32_be_vx(*crc, data, len));
+ return 0;
+}
+
+static int __crc32c_vx_finup(u32 *crc, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ /*
+ * Perform a final XOR with 0xFFFFFFFF to be in sync
+ * with the generic crc32c shash implementation.
+ */
+ *(__le32 *)out = ~cpu_to_le32(crc32c_le_vx(*crc, data, len));
+ return 0;
+}
+
+
+#define CRC32_VX_FINUP(alg, func) \
+ static int alg ## _vx_finup(struct shash_desc *desc, const u8 *data, \
+ unsigned int datalen, u8 *out) \
+ { \
+ return __ ## alg ## _vx_finup(shash_desc_ctx(desc), \
+ data, datalen, out); \
+ }
+
+CRC32_VX_FINUP(crc32le, crc32_le_vx)
+CRC32_VX_FINUP(crc32be, crc32_be_vx)
+CRC32_VX_FINUP(crc32c, crc32c_le_vx)
+
+#define CRC32_VX_DIGEST(alg, func) \
+ static int alg ## _vx_digest(struct shash_desc *desc, const u8 *data, \
+ unsigned int len, u8 *out) \
+ { \
+ return __ ## alg ## _vx_finup(crypto_shash_ctx(desc->tfm), \
+ data, len, out); \
+ }
+
+CRC32_VX_DIGEST(crc32le, crc32_le_vx)
+CRC32_VX_DIGEST(crc32be, crc32_be_vx)
+CRC32_VX_DIGEST(crc32c, crc32c_le_vx)
+
+#define CRC32_VX_UPDATE(alg, func) \
+ static int alg ## _vx_update(struct shash_desc *desc, const u8 *data, \
+ unsigned int datalen) \
+ { \
+ struct crc_desc_ctx *ctx = shash_desc_ctx(desc); \
+ ctx->crc = func(ctx->crc, data, datalen); \
+ return 0; \
+ }
+
+CRC32_VX_UPDATE(crc32le, crc32_le_vx)
+CRC32_VX_UPDATE(crc32be, crc32_be_vx)
+CRC32_VX_UPDATE(crc32c, crc32c_le_vx)
+
+
+static struct shash_alg crc32_vx_algs[] = {
+ /* CRC-32 LE */
+ {
+ .init = crc32_vx_init,
+ .setkey = crc32_vx_setkey,
+ .update = crc32le_vx_update,
+ .final = crc32le_vx_final,
+ .finup = crc32le_vx_finup,
+ .digest = crc32le_vx_digest,
+ .descsize = sizeof(struct crc_desc_ctx),
+ .digestsize = CRC32_DIGEST_SIZE,
+ .base = {
+ .cra_name = "crc32",
+ .cra_driver_name = "crc32-vx",
+ .cra_priority = 200,
+ .cra_blocksize = CRC32_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crc_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_init = crc32_vx_cra_init_zero,
+ },
+ },
+ /* CRC-32 BE */
+ {
+ .init = crc32_vx_init,
+ .setkey = crc32be_vx_setkey,
+ .update = crc32be_vx_update,
+ .final = crc32be_vx_final,
+ .finup = crc32be_vx_finup,
+ .digest = crc32be_vx_digest,
+ .descsize = sizeof(struct crc_desc_ctx),
+ .digestsize = CRC32_DIGEST_SIZE,
+ .base = {
+ .cra_name = "crc32be",
+ .cra_driver_name = "crc32be-vx",
+ .cra_priority = 200,
+ .cra_blocksize = CRC32_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crc_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_init = crc32_vx_cra_init_zero,
+ },
+ },
+ /* CRC-32C LE */
+ {
+ .init = crc32_vx_init,
+ .setkey = crc32_vx_setkey,
+ .update = crc32c_vx_update,
+ .final = crc32c_vx_final,
+ .finup = crc32c_vx_finup,
+ .digest = crc32c_vx_digest,
+ .descsize = sizeof(struct crc_desc_ctx),
+ .digestsize = CRC32_DIGEST_SIZE,
+ .base = {
+ .cra_name = "crc32c",
+ .cra_driver_name = "crc32c-vx",
+ .cra_priority = 200,
+ .cra_blocksize = CRC32_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crc_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_init = crc32_vx_cra_init_invert,
+ },
+ },
+};
+
+
+static int __init crc_vx_mod_init(void)
+{
+ return crypto_register_shashes(crc32_vx_algs,
+ ARRAY_SIZE(crc32_vx_algs));
+}
+
+static void __exit crc_vx_mod_exit(void)
+{
+ crypto_unregister_shashes(crc32_vx_algs, ARRAY_SIZE(crc32_vx_algs));
+}
+
+module_cpu_feature_match(VXRS, crc_vx_mod_init);
+module_exit(crc_vx_mod_exit);
+
+MODULE_AUTHOR("Hendrik Brueckner <brueckner@linux.vnet.ibm.com>");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS_CRYPTO("crc32");
+MODULE_ALIAS_CRYPTO("crc32-vx");
+MODULE_ALIAS_CRYPTO("crc32c");
+MODULE_ALIAS_CRYPTO("crc32c-vx");
diff --git a/arch/s390/crypto/crc32be-vx.S b/arch/s390/crypto/crc32be-vx.S
new file mode 100644
index 000000000000..8013989cd2e5
--- /dev/null
+++ b/arch/s390/crypto/crc32be-vx.S
@@ -0,0 +1,207 @@
+/*
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
+ *
+ * Use the z/Architecture Vector Extension Facility to accelerate the
+ * computing of CRC-32 checksums.
+ *
+ * This CRC-32 implementation algorithm processes the most-significant
+ * bit first (BE).
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/vx-insn.h>
+
+/* Vector register range containing CRC-32 constants */
+#define CONST_R1R2 %v9
+#define CONST_R3R4 %v10
+#define CONST_R5 %v11
+#define CONST_R6 %v12
+#define CONST_RU_POLY %v13
+#define CONST_CRC_POLY %v14
+
+.data
+.align 8
+
+/*
+ * The CRC-32 constant block contains reduction constants to fold and
+ * process particular chunks of the input data stream in parallel.
+ *
+ * For the CRC-32 variants, the constants are precomputed according to
+ * these defintions:
+ *
+ * R1 = x4*128+64 mod P(x)
+ * R2 = x4*128 mod P(x)
+ * R3 = x128+64 mod P(x)
+ * R4 = x128 mod P(x)
+ * R5 = x96 mod P(x)
+ * R6 = x64 mod P(x)
+ *
+ * Barret reduction constant, u, is defined as floor(x**64 / P(x)).
+ *
+ * where P(x) is the polynomial in the normal domain and the P'(x) is the
+ * polynomial in the reversed (bitreflected) domain.
+ *
+ * Note that the constant definitions below are extended in order to compute
+ * intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
+ * The righmost doubleword can be 0 to prevent contribution to the result or
+ * can be multiplied by 1 to perform an XOR without the need for a separate
+ * VECTOR EXCLUSIVE OR instruction.
+ *
+ * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
+ *
+ * P(x) = 0x04C11DB7
+ * P'(x) = 0xEDB88320
+ */
+
+.Lconstants_CRC_32_BE:
+ .quad 0x08833794c, 0x0e6228b11 # R1, R2
+ .quad 0x0c5b9cd4c, 0x0e8a45605 # R3, R4
+ .quad 0x0f200aa66, 1 << 32 # R5, x32
+ .quad 0x0490d678d, 1 # R6, 1
+ .quad 0x104d101df, 0 # u
+ .quad 0x104C11DB7, 0 # P(x)
+
+.previous
+
+.text
+/*
+ * The CRC-32 function(s) use these calling conventions:
+ *
+ * Parameters:
+ *
+ * %r2: Initial CRC value, typically ~0; and final CRC (return) value.
+ * %r3: Input buffer pointer, performance might be improved if the
+ * buffer is on a doubleword boundary.
+ * %r4: Length of the buffer, must be 64 bytes or greater.
+ *
+ * Register usage:
+ *
+ * %r5: CRC-32 constant pool base pointer.
+ * V0: Initial CRC value and intermediate constants and results.
+ * V1..V4: Data for CRC computation.
+ * V5..V8: Next data chunks that are fetched from the input buffer.
+ *
+ * V9..V14: CRC-32 constants.
+ */
+ENTRY(crc32_be_vgfm_16)
+ /* Load CRC-32 constants */
+ larl %r5,.Lconstants_CRC_32_BE
+ VLM CONST_R1R2,CONST_CRC_POLY,0,%r5
+
+ /* Load the initial CRC value into the leftmost word of V0. */
+ VZERO %v0
+ VLVGF %v0,%r2,0
+
+ /* Load a 64-byte data chunk and XOR with CRC */
+ VLM %v1,%v4,0,%r3 /* 64-bytes into V1..V4 */
+ VX %v1,%v0,%v1 /* V1 ^= CRC */
+ aghi %r3,64 /* BUF = BUF + 64 */
+ aghi %r4,-64 /* LEN = LEN - 64 */
+
+ /* Check remaining buffer size and jump to proper folding method */
+ cghi %r4,64
+ jl .Lless_than_64bytes
+
+.Lfold_64bytes_loop:
+ /* Load the next 64-byte data chunk into V5 to V8 */
+ VLM %v5,%v8,0,%r3
+
+ /*
+ * Perform a GF(2) multiplication of the doublewords in V1 with
+ * the reduction constants in V0. The intermediate result is
+ * then folded (accumulated) with the next data chunk in V5 and
+ * stored in V1. Repeat this step for the register contents
+ * in V2, V3, and V4 respectively.
+ */
+ VGFMAG %v1,CONST_R1R2,%v1,%v5
+ VGFMAG %v2,CONST_R1R2,%v2,%v6
+ VGFMAG %v3,CONST_R1R2,%v3,%v7
+ VGFMAG %v4,CONST_R1R2,%v4,%v8
+
+ /* Adjust buffer pointer and length for next loop */
+ aghi %r3,64 /* BUF = BUF + 64 */
+ aghi %r4,-64 /* LEN = LEN - 64 */
+
+ cghi %r4,64
+ jnl .Lfold_64bytes_loop
+
+.Lless_than_64bytes:
+ /* Fold V1 to V4 into a single 128-bit value in V1 */
+ VGFMAG %v1,CONST_R3R4,%v1,%v2
+ VGFMAG %v1,CONST_R3R4,%v1,%v3
+ VGFMAG %v1,CONST_R3R4,%v1,%v4
+
+ /* Check whether to continue with 64-bit folding */
+ cghi %r4,16
+ jl .Lfinal_fold
+
+.Lfold_16bytes_loop:
+
+ VL %v2,0,,%r3 /* Load next data chunk */
+ VGFMAG %v1,CONST_R3R4,%v1,%v2 /* Fold next data chunk */
+
+ /* Adjust buffer pointer and size for folding next data chunk */
+ aghi %r3,16
+ aghi %r4,-16
+
+ /* Process remaining data chunks */
+ cghi %r4,16
+ jnl .Lfold_16bytes_loop
+
+.Lfinal_fold:
+ /*
+ * The R5 constant is used to fold a 128-bit value into an 96-bit value
+ * that is XORed with the next 96-bit input data chunk. To use a single
+ * VGFMG instruction, multiply the rightmost 64-bit with x^32 (1<<32) to
+ * form an intermediate 96-bit value (with appended zeros) which is then
+ * XORed with the intermediate reduction result.
+ */
+ VGFMG %v1,CONST_R5,%v1
+
+ /*
+ * Further reduce the remaining 96-bit value to a 64-bit value using a
+ * single VGFMG, the rightmost doubleword is multiplied with 0x1. The
+ * intermediate result is then XORed with the product of the leftmost
+ * doubleword with R6. The result is a 64-bit value and is subject to
+ * the Barret reduction.
+ */
+ VGFMG %v1,CONST_R6,%v1
+
+ /*
+ * The input values to the Barret reduction are the degree-63 polynomial
+ * in V1 (R(x)), degree-32 generator polynomial, and the reduction
+ * constant u. The Barret reduction result is the CRC value of R(x) mod
+ * P(x).
+ *
+ * The Barret reduction algorithm is defined as:
+ *
+ * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+ * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+ * 3. C(x) = R(x) XOR T2(x) mod x^32
+ *
+ * Note: To compensate the division by x^32, use the vector unpack
+ * instruction to move the leftmost word into the leftmost doubleword
+ * of the vector register. The rightmost doubleword is multiplied
+ * with zero to not contribute to the intermedate results.
+ */
+
+ /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
+ VUPLLF %v2,%v1
+ VGFMG %v2,CONST_RU_POLY,%v2
+
+ /*
+ * Compute the GF(2) product of the CRC polynomial in VO with T1(x) in
+ * V2 and XOR the intermediate result, T2(x), with the value in V1.
+ * The final result is in the rightmost word of V2.
+ */
+ VUPLLF %v2,%v2
+ VGFMAG %v2,CONST_CRC_POLY,%v2,%v1
+
+.Ldone:
+ VLGVF %r2,%v2,3
+ br %r14
+
+.previous
diff --git a/arch/s390/crypto/crc32le-vx.S b/arch/s390/crypto/crc32le-vx.S
new file mode 100644
index 000000000000..17f2504c2633
--- /dev/null
+++ b/arch/s390/crypto/crc32le-vx.S
@@ -0,0 +1,268 @@
+/*
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
+ *
+ * Use the z/Architecture Vector Extension Facility to accelerate the
+ * computing of bitreflected CRC-32 checksums for IEEE 802.3 Ethernet
+ * and Castagnoli.
+ *
+ * This CRC-32 implementation algorithm is bitreflected and processes
+ * the least-significant bit first (Little-Endian).
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/vx-insn.h>
+
+/* Vector register range containing CRC-32 constants */
+#define CONST_PERM_LE2BE %v9
+#define CONST_R2R1 %v10
+#define CONST_R4R3 %v11
+#define CONST_R5 %v12
+#define CONST_RU_POLY %v13
+#define CONST_CRC_POLY %v14
+
+.data
+.align 8
+
+/*
+ * The CRC-32 constant block contains reduction constants to fold and
+ * process particular chunks of the input data stream in parallel.
+ *
+ * For the CRC-32 variants, the constants are precomputed according to
+ * these definitions:
+ *
+ * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
+ * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
+ * R3 = [(x128+32 mod P'(x) << 32)]' << 1
+ * R4 = [(x128-32 mod P'(x) << 32)]' << 1
+ * R5 = [(x64 mod P'(x) << 32)]' << 1
+ * R6 = [(x32 mod P'(x) << 32)]' << 1
+ *
+ * The bitreflected Barret reduction constant, u', is defined as
+ * the bit reversal of floor(x**64 / P(x)).
+ *
+ * where P(x) is the polynomial in the normal domain and the P'(x) is the
+ * polynomial in the reversed (bitreflected) domain.
+ *
+ * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
+ *
+ * P(x) = 0x04C11DB7
+ * P'(x) = 0xEDB88320
+ *
+ * CRC-32C (Castagnoli) polynomials:
+ *
+ * P(x) = 0x1EDC6F41
+ * P'(x) = 0x82F63B78
+ */
+
+.Lconstants_CRC_32_LE:
+ .octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask
+ .quad 0x1c6e41596, 0x154442bd4 # R2, R1
+ .quad 0x0ccaa009e, 0x1751997d0 # R4, R3
+ .octa 0x163cd6124 # R5
+ .octa 0x1F7011641 # u'
+ .octa 0x1DB710641 # P'(x) << 1
+
+.Lconstants_CRC_32C_LE:
+ .octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask
+ .quad 0x09e4addf8, 0x740eef02 # R2, R1
+ .quad 0x14cd00bd6, 0xf20c0dfe # R4, R3
+ .octa 0x0dd45aab8 # R5
+ .octa 0x0dea713f1 # u'
+ .octa 0x105ec76f0 # P'(x) << 1
+
+.previous
+
+
+.text
+
+/*
+ * The CRC-32 functions use these calling conventions:
+ *
+ * Parameters:
+ *
+ * %r2: Initial CRC value, typically ~0; and final CRC (return) value.
+ * %r3: Input buffer pointer, performance might be improved if the
+ * buffer is on a doubleword boundary.
+ * %r4: Length of the buffer, must be 64 bytes or greater.
+ *
+ * Register usage:
+ *
+ * %r5: CRC-32 constant pool base pointer.
+ * V0: Initial CRC value and intermediate constants and results.
+ * V1..V4: Data for CRC computation.
+ * V5..V8: Next data chunks that are fetched from the input buffer.
+ * V9: Constant for BE->LE conversion and shift operations
+ *
+ * V10..V14: CRC-32 constants.
+ */
+
+ENTRY(crc32_le_vgfm_16)
+ larl %r5,.Lconstants_CRC_32_LE
+ j crc32_le_vgfm_generic
+
+ENTRY(crc32c_le_vgfm_16)
+ larl %r5,.Lconstants_CRC_32C_LE
+ j crc32_le_vgfm_generic
+
+
+crc32_le_vgfm_generic:
+ /* Load CRC-32 constants */
+ VLM CONST_PERM_LE2BE,CONST_CRC_POLY,0,%r5
+
+ /*
+ * Load the initial CRC value.
+ *
+ * The CRC value is loaded into the rightmost word of the
+ * vector register and is later XORed with the LSB portion
+ * of the loaded input data.
+ */
+ VZERO %v0 /* Clear V0 */
+ VLVGF %v0,%r2,3 /* Load CRC into rightmost word */
+
+ /* Load a 64-byte data chunk and XOR with CRC */
+ VLM %v1,%v4,0,%r3 /* 64-bytes into V1..V4 */
+ VPERM %v1,%v1,%v1,CONST_PERM_LE2BE
+ VPERM %v2,%v2,%v2,CONST_PERM_LE2BE
+ VPERM %v3,%v3,%v3,CONST_PERM_LE2BE
+ VPERM %v4,%v4,%v4,CONST_PERM_LE2BE
+
+ VX %v1,%v0,%v1 /* V1 ^= CRC */
+ aghi %r3,64 /* BUF = BUF + 64 */
+ aghi %r4,-64 /* LEN = LEN - 64 */
+
+ cghi %r4,64
+ jl .Lless_than_64bytes
+
+.Lfold_64bytes_loop:
+ /* Load the next 64-byte data chunk into V5 to V8 */
+ VLM %v5,%v8,0,%r3
+ VPERM %v5,%v5,%v5,CONST_PERM_LE2BE
+ VPERM %v6,%v6,%v6,CONST_PERM_LE2BE
+ VPERM %v7,%v7,%v7,CONST_PERM_LE2BE
+ VPERM %v8,%v8,%v8,CONST_PERM_LE2BE
+
+ /*
+ * Perform a GF(2) multiplication of the doublewords in V1 with
+ * the R1 and R2 reduction constants in V0. The intermediate result
+ * is then folded (accumulated) with the next data chunk in V5 and
+ * stored in V1. Repeat this step for the register contents
+ * in V2, V3, and V4 respectively.
+ */
+ VGFMAG %v1,CONST_R2R1,%v1,%v5
+ VGFMAG %v2,CONST_R2R1,%v2,%v6
+ VGFMAG %v3,CONST_R2R1,%v3,%v7
+ VGFMAG %v4,CONST_R2R1,%v4,%v8
+
+ aghi %r3,64 /* BUF = BUF + 64 */
+ aghi %r4,-64 /* LEN = LEN - 64 */
+
+ cghi %r4,64
+ jnl .Lfold_64bytes_loop
+
+.Lless_than_64bytes:
+ /*
+ * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
+ * and R4 and accumulating the next 128-bit chunk until a single 128-bit
+ * value remains.
+ */
+ VGFMAG %v1,CONST_R4R3,%v1,%v2
+ VGFMAG %v1,CONST_R4R3,%v1,%v3
+ VGFMAG %v1,CONST_R4R3,%v1,%v4
+
+ cghi %r4,16
+ jl .Lfinal_fold
+
+.Lfold_16bytes_loop:
+
+ VL %v2,0,,%r3 /* Load next data chunk */
+ VPERM %v2,%v2,%v2,CONST_PERM_LE2BE
+ VGFMAG %v1,CONST_R4R3,%v1,%v2 /* Fold next data chunk */
+
+ aghi %r3,16
+ aghi %r4,-16
+
+ cghi %r4,16
+ jnl .Lfold_16bytes_loop
+
+.Lfinal_fold:
+ /*
+ * Set up a vector register for byte shifts. The shift value must
+ * be loaded in bits 1-4 in byte element 7 of a vector register.
+ * Shift by 8 bytes: 0x40
+ * Shift by 4 bytes: 0x20
+ */
+ VLEIB %v9,0x40,7
+
+ /*
+ * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
+ * to move R4 into the rightmost doubleword and set the leftmost
+ * doubleword to 0x1.
+ */
+ VSRLB %v0,CONST_R4R3,%v9
+ VLEIG %v0,1,0
+
+ /*
+ * Compute GF(2) product of V1 and V0. The rightmost doubleword
+ * of V1 is multiplied with R4. The leftmost doubleword of V1 is
+ * multiplied by 0x1 and is then XORed with rightmost product.
+ * Implicitly, the intermediate leftmost product becomes padded
+ */
+ VGFMG %v1,%v0,%v1
+
+ /*
+ * Now do the final 32-bit fold by multiplying the rightmost word
+ * in V1 with R5 and XOR the result with the remaining bits in V1.
+ *
+ * To achieve this by a single VGFMAG, right shift V1 by a word
+ * and store the result in V2 which is then accumulated. Use the
+ * vector unpack instruction to load the rightmost half of the
+ * doubleword into the rightmost doubleword element of V1; the other
+ * half is loaded in the leftmost doubleword.
+ * The vector register with CONST_R5 contains the R5 constant in the
+ * rightmost doubleword and the leftmost doubleword is zero to ignore
+ * the leftmost product of V1.
+ */
+ VLEIB %v9,0x20,7 /* Shift by words */
+ VSRLB %v2,%v1,%v9 /* Store remaining bits in V2 */
+ VUPLLF %v1,%v1 /* Split rightmost doubleword */
+ VGFMAG %v1,CONST_R5,%v1,%v2 /* V1 = (V1 * R5) XOR V2 */
+
+ /*
+ * Apply a Barret reduction to compute the final 32-bit CRC value.
+ *
+ * The input values to the Barret reduction are the degree-63 polynomial
+ * in V1 (R(x)), degree-32 generator polynomial, and the reduction
+ * constant u. The Barret reduction result is the CRC value of R(x) mod
+ * P(x).
+ *
+ * The Barret reduction algorithm is defined as:
+ *
+ * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+ * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+ * 3. C(x) = R(x) XOR T2(x) mod x^32
+ *
+ * Note: The leftmost doubleword of vector register containing
+ * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
+ * is zero and does not contribute to the final result.
+ */
+
+ /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
+ VUPLLF %v2,%v1
+ VGFMG %v2,CONST_RU_POLY,%v2
+
+ /*
+ * Compute the GF(2) product of the CRC polynomial with T1(x) in
+ * V2 and XOR the intermediate result, T2(x), with the value in V1.
+ * The final result is stored in word element 2 of V2.
+ */
+ VUPLLF %v2,%v2
+ VGFMAG %v2,CONST_CRC_POLY,%v2,%v1
+
+.Ldone:
+ VLGVF %r2,%v2,2
+ br %r14
+
+.previous