aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto/sha1_ssse3_glue.c
diff options
context:
space:
mode:
authorMathias Krause <minipli@googlemail.com>2011-08-04 20:19:25 +0200
committerHerbert Xu <herbert@gondor.apana.org.au>2011-08-10 19:00:29 +0800
commit66be895158886a6cd816aa1eaa18965a5c522d8f (patch)
tree2eee685d2249cd5973e15303b8101df7c956e607 /arch/x86/crypto/sha1_ssse3_glue.c
parentcrypto: sha1 - export sha1_update for reuse (diff)
downloadlinux-dev-66be895158886a6cd816aa1eaa18965a5c522d8f.tar.xz
linux-dev-66be895158886a6cd816aa1eaa18965a5c522d8f.zip
crypto: sha1 - SSSE3 based SHA1 implementation for x86-64
This is an assembler implementation of the SHA1 algorithm using the Supplemental SSE3 (SSSE3) instructions or, when available, the Advanced Vector Extensions (AVX). Testing with the tcrypt module shows the raw hash performance is up to 2.3 times faster than the C implementation, using 8k data blocks on a Core 2 Duo T5500. For the smalest data set (16 byte) it is still 25% faster. Since this implementation uses SSE/YMM registers it cannot safely be used in every situation, e.g. while an IRQ interrupts a kernel thread. The implementation falls back to the generic SHA1 variant, if using the SSE/YMM registers is not possible. With this algorithm I was able to increase the throughput of a single IPsec link from 344 Mbit/s to 464 Mbit/s on a Core 2 Quad CPU using the SSSE3 variant -- a speedup of +34.8%. Saving and restoring SSE/YMM state might make the actual throughput fluctuate when there are FPU intensive userland applications running. For example, meassuring the performance using iperf2 directly on the machine under test gives wobbling numbers because iperf2 uses the FPU for each packet to check if the reporting interval has expired (in the above test I got min/max/avg: 402/484/464 MBit/s). Using this algorithm on a IPsec gateway gives much more reasonable and stable numbers, albeit not as high as in the directly connected case. Here is the result from an RFC 2544 test run with a EXFO Packet Blazer FTB-8510: frame size sha1-generic sha1-ssse3 delta 64 byte 37.5 MBit/s 37.5 MBit/s 0.0% 128 byte 56.3 MBit/s 62.5 MBit/s +11.0% 256 byte 87.5 MBit/s 100.0 MBit/s +14.3% 512 byte 131.3 MBit/s 150.0 MBit/s +14.2% 1024 byte 162.5 MBit/s 193.8 MBit/s +19.3% 1280 byte 175.0 MBit/s 212.5 MBit/s +21.4% 1420 byte 175.0 MBit/s 218.7 MBit/s +25.0% 1518 byte 150.0 MBit/s 181.2 MBit/s +20.8% The throughput for the largest frame size is lower than for the previous size because the IP packets need to be fragmented in this case to make there way through the IPsec tunnel. Signed-off-by: Mathias Krause <minipli@googlemail.com> Cc: Maxim Locktyukhin <maxim.locktyukhin@intel.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/sha1_ssse3_glue.c')
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c240
1 files changed, 240 insertions, 0 deletions
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
new file mode 100644
index 000000000000..f916499d0abe
--- /dev/null
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -0,0 +1,240 @@
+/*
+ * Cryptographic API.
+ *
+ * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
+ * Supplemental SSE3 instructions.
+ *
+ * This file is based on sha1_generic.c
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ * Copyright (c) Mathias Krause <minipli@googlemail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+
+
+asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
+ unsigned int rounds);
+#ifdef SHA1_ENABLE_AVX_SUPPORT
+asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
+ unsigned int rounds);
+#endif
+
+static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
+
+
+static int sha1_ssse3_init(struct shash_desc *desc)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ *sctx = (struct sha1_state){
+ .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
+ };
+
+ return 0;
+}
+
+static int __sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, unsigned int partial)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+ unsigned int done = 0;
+
+ sctx->count += len;
+
+ if (partial) {
+ done = SHA1_BLOCK_SIZE - partial;
+ memcpy(sctx->buffer + partial, data, done);
+ sha1_transform_asm(sctx->state, sctx->buffer, 1);
+ }
+
+ if (len - done >= SHA1_BLOCK_SIZE) {
+ const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
+
+ sha1_transform_asm(sctx->state, data + done, rounds);
+ done += rounds * SHA1_BLOCK_SIZE;
+ }
+
+ memcpy(sctx->buffer, data + done, len - done);
+
+ return 0;
+}
+
+static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
+ int res;
+
+ /* Handle the fast case right here */
+ if (partial + len < SHA1_BLOCK_SIZE) {
+ sctx->count += len;
+ memcpy(sctx->buffer + partial, data, len);
+
+ return 0;
+ }
+
+ if (!irq_fpu_usable()) {
+ res = crypto_sha1_update(desc, data, len);
+ } else {
+ kernel_fpu_begin();
+ res = __sha1_ssse3_update(desc, data, len, partial);
+ kernel_fpu_end();
+ }
+
+ return res;
+}
+
+
+/* Add padding and return the message digest. */
+static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+ unsigned int i, index, padlen;
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
+
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64 and append length */
+ index = sctx->count % SHA1_BLOCK_SIZE;
+ padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index);
+ if (!irq_fpu_usable()) {
+ crypto_sha1_update(desc, padding, padlen);
+ crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits));
+ } else {
+ kernel_fpu_begin();
+ /* We need to fill a whole block for __sha1_ssse3_update() */
+ if (padlen <= 56) {
+ sctx->count += padlen;
+ memcpy(sctx->buffer + index, padding, padlen);
+ } else {
+ __sha1_ssse3_update(desc, padding, padlen, index);
+ }
+ __sha1_ssse3_update(desc, (const u8 *)&bits, sizeof(bits), 56);
+ kernel_fpu_end();
+ }
+
+ /* Store state in digest */
+ for (i = 0; i < 5; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Wipe context */
+ memset(sctx, 0, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha1_ssse3_export(struct shash_desc *desc, void *out)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha1_ssse3_import(struct shash_desc *desc, const void *in)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(sctx, in, sizeof(*sctx));
+
+ return 0;
+}
+
+static struct shash_alg alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_ssse3_init,
+ .update = sha1_ssse3_update,
+ .final = sha1_ssse3_final,
+ .export = sha1_ssse3_export,
+ .import = sha1_ssse3_import,
+ .descsize = sizeof(struct sha1_state),
+ .statesize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name= "sha1-ssse3",
+ .cra_priority = 150,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+#ifdef SHA1_ENABLE_AVX_SUPPORT
+static bool __init avx_usable(void)
+{
+ u64 xcr0;
+
+ if (!cpu_has_avx || !cpu_has_osxsave)
+ return false;
+
+ xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+ pr_info("AVX detected but unusable.\n");
+
+ return false;
+ }
+
+ return true;
+}
+#endif
+
+static int __init sha1_ssse3_mod_init(void)
+{
+ /* test for SSSE3 first */
+ if (cpu_has_ssse3)
+ sha1_transform_asm = sha1_transform_ssse3;
+
+#ifdef SHA1_ENABLE_AVX_SUPPORT
+ /* allow AVX to override SSSE3, it's a little faster */
+ if (avx_usable())
+ sha1_transform_asm = sha1_transform_avx;
+#endif
+
+ if (sha1_transform_asm) {
+ pr_info("Using %s optimized SHA-1 implementation\n",
+ sha1_transform_asm == sha1_transform_ssse3 ? "SSSE3"
+ : "AVX");
+ return crypto_register_shash(&alg);
+ }
+ pr_info("Neither AVX nor SSSE3 is available/usable.\n");
+
+ return -ENODEV;
+}
+
+static void __exit sha1_ssse3_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(sha1_ssse3_mod_init);
+module_exit(sha1_ssse3_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated");
+
+MODULE_ALIAS("sha1");