about | summary | refs | log | tree | commit | diff | stats
path: root/main.c
diff options
context:
space:
mode:
Diffstat (limited to 'main.c')
-rw-r--r-- main.c 80
1 files changed, 69 insertions, 11 deletions
diff --git a/main.c b/main.c
index d538f48..ad8de4c 100644
--- a/main.c
+++ b/main.c
@@ -6,37 +6,95 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/delay.h>
-#include "function.h"
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/fpu/api.h>
+#include <asm/simd.h>
+/* Caller-supplied tag (module parameter, permission 0) printed at the start of each result line. */
static unsigned long stamp = 0;
module_param(stamp, ulong, 0);
+/* Global sink for the accumulated benchmark return values, so the compiler cannot discard the calls. */
int dummy;
+/* Number of blocks handed to the compression routine in each timed call. */
+enum { BLOCKS_PER_CALL = 16 };
+
+/* State buffer passed to every blake2s_compress_* variant. */
+static u8 state[128];
+/* Input buffer: 64 bytes for each of the BLOCKS_PER_CALL blocks. */
+static u8 input[64 * BLOCKS_PER_CALL];
+
+/*
+ * declare_it(name) - declare one benchmark candidate.
+ *
+ * Emits a prototype for blake2s_compress_<name>(), which is defined
+ * elsewhere (presumably in assembly, given asmlinkage - confirm), plus an
+ * always-inline wrapper called <name>() that runs it on the file-scope
+ * state and input buffers for BLOCKS_PER_CALL blocks with inc == 0.
+ * The wrapper returns input[0] so that callers have a value to accumulate.
+ */
+#define declare_it(name) \
+asmlinkage void blake2s_compress_ ## name(u8 state[128], const u8 *block, const size_t nblocks, const u32 inc); \
+static __always_inline u8 name(void) \
+{ \
+ blake2s_compress_ ## name(state, input, BLOCKS_PER_CALL, 0); \
+ return input[0]; \
+}
+
+/*
+ * do_it(name) - time the <name>() wrapper and store the median sample in
+ * median_<name>.
+ *
+ * The expansion site must provide: i (int loop counter), ret (accumulator),
+ * trial_times (cycles_t array with room for TRIALS + 1 entries), the
+ * WARMUP and TRIALS constants, and a cycles_t variable median_<name>.
+ *
+ * Sequence: WARMUP untimed calls, a cpuid instruction, then TRIALS + 1
+ * timestamps taken around consecutive calls. Adjacent timestamps are
+ * differenced into TRIALS per-call deltas, which are sorted so the middle
+ * one can be taken as the median.
+ */
+#define do_it(name) do { \
+	u32 eax = 0, ebx = 0, ecx = 0, edx = 0; \
+	for (i = 0; i < WARMUP; ++i) \
+		ret |= name(); \
+	/* cpuid is a serializing instruction: drain warm-up work before timing. */ \
+	asm volatile("cpuid" : "+a" (eax), "=b" (ebx), "=d" (edx), "+c" (ecx)); \
+	/* TRIALS + 1 timestamps bracket TRIALS calls. */ \
+	for (i = 0; i <= TRIALS; ++i) { \
+		trial_times[i] = get_cycles(); \
+		ret |= name(); \
+	} \
+	for (i = 0; i < TRIALS; ++i) \
+		trial_times[i] = trial_times[i + 1] - trial_times[i]; \
+	/* Only the first TRIALS slots hold deltas; slot TRIALS is still a raw \
+	 * timestamp and must not take part in the sort. */ \
+	sort(trial_times, TRIALS, sizeof(*trial_times), compare_cycles, NULL); \
+	median_ ## name = trial_times[TRIALS / 2]; \
+} while (0)
+
+/*
+ * report_it(name) - log the result for one candidate.
+ *
+ * Divides median_<name> by BLOCKS_PER_CALL and prints the quotient,
+ * together with the stamp module parameter and the candidate's name.
+ */
+#define report_it(name) do { \
+	const cycles_t per_block = median_ ## name / BLOCKS_PER_CALL; \
+	pr_err("%lu: %12s: %6llu cycles per block\n", stamp, #name, per_block); \
+} while (0)
+
+
+/* Instantiate the prototype and inline wrapper for each implementation that mod_init() times. */
+declare_it(avx)
+declare_it(avx512_ymm)
+declare_it(avx512_zmm)
+
+/*
+ * compare_cycles - sort() comparison callback for cycles_t values.
+ * @a: pointer to the first cycles_t
+ * @b: pointer to the second cycles_t
+ *
+ * Returns a negative value, zero, or a positive value when *a is less
+ * than, equal to, or greater than *b.
+ *
+ * The values are compared rather than subtracted: narrowing the difference
+ * of two wide unsigned counters to int can flip its sign or collapse a
+ * non-zero difference to 0, which would give sort() an inconsistent order.
+ */
+static int compare_cycles(const void *a, const void *b)
+{
+	const cycles_t lhs = *(const cycles_t *)a;
+	const cycles_t rhs = *(const cycles_t *)b;
+
+	return (lhs > rhs) - (lhs < rhs);
+}
+
static int __init mod_init(void)
{
+ enum { WARMUP = 6000, TRIALS = 5000, IDLE = 1 * 1000 };
int ret = 0, i;
- cycles_t start, end;
+ cycles_t *trial_times;
+ cycles_t median_avx = 0;
+ cycles_t median_avx512_ymm = 0;
+ cycles_t median_avx512_zmm = 0;
unsigned long flags;
DEFINE_SPINLOCK(lock);
-
+
+ trial_times = kcalloc(TRIALS + 1, sizeof(cycles_t), GFP_KERNEL);
+ if (!trial_times)
+ return -ENOMEM;
+
msleep(IDLE);
spin_lock_irqsave(&lock, flags);
-
- for (i = 0; i < WARMUP; ++i)
- ret |= function();
- start = get_cycles();
- for (i = 0; i < TRIALS; ++i)
- ret |= function();
- end = get_cycles();
+ kernel_fpu_begin();
+
+ do_it(avx);
+ do_it(avx512_ymm);
+ do_it(avx512_zmm);
+
+ kernel_fpu_end();
spin_unlock_irqrestore(&lock, flags);
- pr_err("%lu: %llu cycles per call\n", stamp, (end - start) / TRIALS);
+ report_it(avx);
+ report_it(avx512_ymm);
+ report_it(avx512_zmm);
/* Don't let compiler be too clever. */
dummy = ret;
+ kfree(trial_times);
/* We should never actually agree to insert the module. Choosing
* -0x1000 here is an amazing hack. It causes the kernel to not