From adcaee95cce63db00e0d338b4cc4c8d13ef121c4 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Tue, 16 Jan 2018 13:42:15 +0100
Subject: Initial scaffolding

---
 Makefile     |  19 ++++++++++
 README.md    |  26 ++++++++++++++
 blake2s.c    |   9 +++++
 crc32loop.c  |  62 ++++++++++++++++++++++++++++++++
 dblblake2s.c |  10 ++++++
 k12f1600.c   |  54 ++++++++++++++++++++++++++++
 k12f800.c    |  54 ++++++++++++++++++++++++++++
 main.c       | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 original.c   |  65 +++++++++++++++++++++++++++++++++
 run.sh       |  35 ++++++++++++++++++
 tinylfsr.c   |  38 ++++++++++++++++++++
 widelfsr.c   |  30 ++++++++++++++++
 12 files changed, 517 insertions(+)
 create mode 100644 Makefile
 create mode 100644 README.md
 create mode 100644 blake2s.c
 create mode 100644 crc32loop.c
 create mode 100644 dblblake2s.c
 create mode 100644 k12f1600.c
 create mode 100644 k12f800.c
 create mode 100644 main.c
 create mode 100644 original.c
 create mode 100755 run.sh
 create mode 100644 tinylfsr.c
 create mode 100644 widelfsr.c

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7df6fb2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,19 @@
+ifneq ($(KERNELRELEASE),)
+kbench9000-y := main.o original.o crc32loop.o tinylfsr.o widelfsr.o blake2s.o dblblake2s.o k12f800.o k12f1600.o
+obj-m := kbench9000.o
+ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
+else
+KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+default: build
+
+run: build
+	sudo ./run.sh
+build:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD)
+clean:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD) clean
+.PHONY: default run build clean
+endif
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3fd3180
--- /dev/null
+++ b/README.md
@@ -0,0 +1,26 @@
+# kBench9000 &ndash; simple kernel land cycle counter
+### by [Jason A. Donenfeld](mailto:jason@zx2c4.com)
+
+This is a very simple kernel land cycle counter. To use, simply edit `function.h`,
+add any other `.c` files and mention them in the `kbench9000-y +=` line of the
+`Makefile`, and then type:
+
+```
+$ make run
+```
+
+![Expected kBench9000 output](https://data.zx2c4.com/kbench9000-screenshot.png)
+
+### Kernel Toolchain
+
+You'll need to have a working kernel toolchain, usually achievable by:
+
+```
+$ sudo apt install linux-headers-$(uname -r) build-essential
+```
+
+or
+
+```
+$ sudo dnf install kernel-devel @development-tools
+```
diff --git a/blake2s.c b/blake2s.c
new file mode 100644
index 0000000..6d5500a
--- /dev/null
+++ b/blake2s.c
@@ -0,0 +1,9 @@
+#include <linux/kernel.h>
+#include <crypto/blake2s.h>
+
+static struct blake2s_state hash = { .outlen = 32 /* TODO: other constants */ };
+
+void mix_blake2s(const void *in, int nbytes)
+{
+	blake2s_update(&hash, in, nbytes);
+}
diff --git a/crc32loop.c b/crc32loop.c
new file mode 100644
index 0000000..371bc96
--- /dev/null
+++ b/crc32loop.c
@@ -0,0 +1,62 @@
+#include <linux/kernel.h>
+
+enum poolinfo {
+	POOL_WORDS = 128,
+	POOL_WORDMASK = POOL_WORDS - 1,
+	POOL_BITS = POOL_WORDS * sizeof(u32) * 8,
+
+	/* x^128 + x^104 + x^76 + x^51 +x^25 + x + 1 */
+	POOL_TAP1 = 104,
+	POOL_TAP2 = 76,
+	POOL_TAP3 = 51,
+	POOL_TAP4 = 25,
+	POOL_TAP5 = 1,
+};
+
+static u32 input_pool_data[POOL_WORDS];
+
+static struct {
+	u16 add_ptr;
+	u16 input_rotate;
+} input_pool = { 0 };
+
+void mix_crc32loop(const void *in, int nbytes)
+{
+	unsigned long i, j;
+	int input_rotate;
+	const u8 *bytes = in;
+	u32 w;
+
+	input_rotate = input_pool.input_rotate;
+	i = input_pool.add_ptr;
+
+	/* mix one byte at a time to simplify size handling and churn faster */
+	while (nbytes--) {
+		w = rol32(*bytes++, input_rotate);
+		i = (i - 1) & POOL_WORDMASK;
+
+		/* XOR in the various taps */
+		w ^= input_pool_data[i];
+		w ^= input_pool_data[(i + POOL_TAP1) & POOL_WORDMASK];
+		w ^= input_pool_data[(i + POOL_TAP2) & POOL_WORDMASK];
+		w ^= input_pool_data[(i + POOL_TAP3) & POOL_WORDMASK];
+		w ^= input_pool_data[(i + POOL_TAP4) & POOL_WORDMASK];
+		w ^= input_pool_data[(i + POOL_TAP5) & POOL_WORDMASK];
+
+		/* Mix the result back in with a twist */
+		for (j = 0; j < 3; ++j)
+			w = (w >> 1) ^ (0xedb88320 & -(w & 1));
+		input_pool_data[i] = w;
+
+		/*
+		 * Normally, we add 7 bits of rotation to the pool.
+		 * At the beginning of the pool, add an extra 7 bits
+		 * rotation, so that successive passes spread the
+		 * input bits across the pool evenly.
+		 */
+		input_rotate = (input_rotate + (i ? 7 : 14)) & 31;
+	}
+
+	input_pool.input_rotate = input_rotate;
+	input_pool.add_ptr = i;
+}
diff --git a/dblblake2s.c b/dblblake2s.c
new file mode 100644
index 0000000..54f5515
--- /dev/null
+++ b/dblblake2s.c
@@ -0,0 +1,10 @@
+#include <linux/kernel.h>
+#include <crypto/blake2s.h>
+
+static struct blake2s_state hash[2] = { {.outlen = 32 /* TODO: other constants */ }, {.outlen = 32 /* TODO: other constants */ } };
+
+void mix_dblblake2s(const void *in, int nbytes)
+{
+	blake2s_update(&hash[0], in, nbytes);
+	blake2s_update(&hash[1], in, nbytes);
+}
diff --git a/k12f1600.c b/k12f1600.c
new file mode 100644
index 0000000..26d974b
--- /dev/null
+++ b/k12f1600.c
@@ -0,0 +1,54 @@
+#include <linux/kernel.h>
+#include <crypto/algapi.h>
+
+enum poolinfo {
+	POOL_BITS = 512,
+	POOL_BYTES = POOL_BITS / 8
+};
+
+static const u64 keccak_round_constants[12] = {
+	0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, 0x8000000080008000ULL,
+	0x000000000000808bULL, 0x0000000080000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL,
+	0x000000000000008aULL, 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL
+};
+
+static const int keccak_rotations[5][5] = {
+	{ 0,  1,  62, 28, 27 }, { 36, 44, 6,  55, 20 }, { 3,  10, 43, 25, 39 },
+	{ 41, 45, 15, 21, 8  }, { 18, 2,  61, 56, 14 }
+};
+
+static void keccak(u64 A[5][5])
+{
+	u64 B[5][5], C[5];
+	int r, x, y;
+
+#define N5(N, expr) N = 0, expr; N = 1, expr; N = 2, expr; N = 3, expr; N = 4, expr;
+
+	for (r = 0; r < 12; ++r) {
+		N5(x, C[x] = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x]);
+		N5(x, N5(y, A[y][x] ^= C[(x + 4) % 5] ^ rol64(C[(x + 1) % 5], 1)));
+		N5(x, N5(y, B[(2 * x + 3 * y) % 5][y] = rol64(A[y][x], keccak_rotations[y][x])));
+		N5(x, N5(y, A[y][x] = B[y][x] ^ (~B[y][(x + 1) % 5] & B[y][(x + 2) % 5])));
+		A[0][0] ^= keccak_round_constants[r];
+	}
+}
+
+static u64 state[5][5];
+static int idx;
+
+void mix_k12f1600(const void *in, int nbytes)
+{
+	const u8 *src = in;
+	u8 *dst = (u8 *)state;
+
+	while (nbytes) {
+		size_t l = min_t(size_t, nbytes, sizeof(state) - POOL_BYTES - idx);
+		if (!l) {
+			keccak(state);
+			idx = 0;
+			continue;
+		}
+		crypto_xor(&dst[idx], src, l);
+		nbytes -= l, src += l, idx += l;
+	}
+}
diff --git a/k12f800.c b/k12f800.c
new file mode 100644
index 0000000..67cc2b3
--- /dev/null
+++ b/k12f800.c
@@ -0,0 +1,54 @@
+#include <linux/kernel.h>
+#include <crypto/algapi.h>
+
+enum poolinfo {
+	POOL_BITS = 512,
+	POOL_BYTES = POOL_BITS / 8
+};
+
+static const u32 keccak_round_constants[12] = {
+	0x00000001, 0x00008082, 0x0000808a, 0x80008000,
+	0x0000808b, 0x80000001, 0x80008081, 0x00008009,
+	0x0000008a, 0x00000088, 0x80008009, 0x8000000a
+};
+
+static const int keccak_rotations[5][5] = {
+	{ 0,  1,  62, 28, 27 }, { 36, 44, 6,  55, 20 }, { 3,  10, 43, 25, 39 },
+	{ 41, 45, 15, 21, 8  }, { 18, 2,  61, 56, 14 }
+};
+
+static void keccak(u32 A[5][5])
+{
+	u32 B[5][5], C[5];
+	int r, x, y;
+
+#define N5(N, expr) N = 0, expr; N = 1, expr; N = 2, expr; N = 3, expr; N = 4, expr;
+
+	for (r = 0; r < 12; ++r) {
+		N5(x, C[x] = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x]);
+		N5(x, N5(y, A[y][x] ^= C[(x + 4) % 5] ^ rol64(C[(x + 1) % 5], 1)));
+		N5(x, N5(y, B[(2 * x + 3 * y) % 5][y] = rol64(A[y][x], keccak_rotations[y][x])));
+		N5(x, N5(y, A[y][x] = B[y][x] ^ (~B[y][(x + 1) % 5] & B[y][(x + 2) % 5])));
+		A[0][0] ^= keccak_round_constants[r];
+	}
+}
+
+static u32 state[5][5];
+static int idx;
+
+void mix_k12f800(const void *in, int nbytes)
+{
+	const u8 *src = in;
+	u8 *dst = (u8 *)state;
+
+	while (nbytes) {
+		size_t l = min_t(size_t, nbytes, sizeof(state) - POOL_BYTES - idx);
+		if (!l) {
+			keccak(state);
+			idx = 0;
+			continue;
+		}
+		crypto_xor(&dst[idx], src, l);
+		nbytes -= l, src += l, idx += l;
+	}
+}
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..7b74e68
--- /dev/null
+++ b/main.c
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/random.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/fpu/api.h>
+#include <asm/simd.h>
+
+static unsigned long stamp = 0;
+module_param(stamp, ulong, 0);
+
+#define declare_it(name) bool mix_ ##name(void *input, size_t len);
+
+#define do_it(name, input, len) do { \
+	u32 eax = 0, ebx = 0, ecx = 0, edx = 0; \
+	for (i = 0; i < WARMUP; ++i) \
+		ret |= mix_ ##name(input, len); \
+	asm volatile("cpuid" : "+a" (eax), "=b" (ebx), "=d" (edx), "+c" (ecx)); \
+	for (i = 0; i <= TRIALS; ++i) { \
+		trial_times[i] = get_cycles(); \
+		ret |= mix_ ##name(input, len); \
+	} \
+	for (i = 0; i < TRIALS; ++i) \
+		trial_times[i] = trial_times[i + 1] - trial_times[i]; \
+	sort(trial_times, TRIALS + 1, sizeof(cycles_t), compare_cycles, NULL); \
+	median_ ## name = trial_times[TRIALS / 2]; \
+} while (0)
+
+#define report_it(name) do { \
+	pr_err("%lu: %12s: %6llu cycles per call\n", stamp, #name, median_ ## name); \
+} while (0)
+
+
+declare_it(original)
+declare_it(crc32loop)
+declare_it(tinylfsr)
+declare_it(widelfsr)
+declare_it(blake2s)
+declare_it(dblblake2s)
+declare_it(k12f800)
+declare_it(k12f1600)
+
+static int compare_cycles(const void *a, const void *b)
+{
+	return *((cycles_t *)a) - *((cycles_t *)b);
+}
+
+static int __init mod_init(void)
+{
+	enum { WARMUP = 6000, TRIALS = 10000, IDLE = 1 * 1000, LEN = 4096 };
+	static u8 buffer[LEN];
+	int ret = 0, i;
+	cycles_t *trial_times;
+	cycles_t median_original = 0;
+	cycles_t median_crc32loop = 0;
+	cycles_t median_tinylfsr = 0;
+	cycles_t median_widelfsr = 0;
+	cycles_t median_blake2s = 0;
+	cycles_t median_dblblake2s = 0;
+	cycles_t median_k12f800 = 0;
+	cycles_t median_k12f1600 = 0;
+	unsigned long flags;
+	DEFINE_SPINLOCK(lock);
+
+	get_random_bytes(buffer, sizeof(buffer));
+
+	trial_times = kcalloc(TRIALS + 1, sizeof(cycles_t), GFP_KERNEL);
+	if (!trial_times)
+		return -ENOMEM;
+
+	msleep(IDLE);
+
+	spin_lock_irqsave(&lock, flags);
+
+	do_it(original, buffer, sizeof(buffer));
+	do_it(crc32loop, buffer, sizeof(buffer));
+	do_it(tinylfsr, buffer, sizeof(buffer));
+	do_it(widelfsr, buffer, sizeof(buffer));
+	do_it(blake2s, buffer, sizeof(buffer));
+	do_it(dblblake2s, buffer, sizeof(buffer));
+	do_it(k12f800, buffer, sizeof(buffer));
+	do_it(k12f1600, buffer, sizeof(buffer));
+
+	spin_unlock_irqrestore(&lock, flags);
+	
+	report_it(original);
+	report_it(crc32loop);
+	report_it(tinylfsr);
+	report_it(widelfsr);
+	report_it(blake2s);
+	report_it(dblblake2s);
+	report_it(k12f800);
+	report_it(k12f1600);
+
+	kfree(trial_times);
+	
+	/* We should never actually agree to insert the module. Choosing
+	 * -0x1000 here is an amazing hack. It causes the kernel to not
+	 * actually load the module, while the standard userspace tools
+	 * don't return an error, because it's too big. */
+	return -0x1000;
+}
+
+module_init(mod_init);
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("kBench9000 Cycle Counter");
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/original.c b/original.c
new file mode 100644
index 0000000..32be348
--- /dev/null
+++ b/original.c
@@ -0,0 +1,65 @@
+#include <linux/kernel.h>
+
+enum poolinfo {
+	POOL_WORDS = 128,
+	POOL_WORDMASK = POOL_WORDS - 1,
+	POOL_BITS = POOL_WORDS * sizeof(u32) * 8,
+
+	/* x^128 + x^104 + x^76 + x^51 +x^25 + x + 1 */
+	POOL_TAP1 = 104,
+	POOL_TAP2 = 76,
+	POOL_TAP3 = 51,
+	POOL_TAP4 = 25,
+	POOL_TAP5 = 1,
+};
+
+static u32 input_pool_data[POOL_WORDS];
+
+static struct {
+	u16 add_ptr;
+	u16 input_rotate;
+} input_pool = { 0 };
+
+static const u32 twist_table[8] = {
+	0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
+	0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278
+};
+
+void mix_original(const void *in, int nbytes)
+{
+	unsigned long i;
+	int input_rotate;
+	const u8 *bytes = in;
+	u32 w;
+
+	input_rotate = input_pool.input_rotate;
+	i = input_pool.add_ptr;
+
+	/* mix one byte at a time to simplify size handling and churn faster */
+	while (nbytes--) {
+		w = rol32(*bytes++, input_rotate);
+		i = (i - 1) & POOL_WORDMASK;
+
+		/* XOR in the various taps */
+		w ^= input_pool_data[i];
+		w ^= input_pool_data[(i + POOL_TAP1) & POOL_WORDMASK];
+		w ^= input_pool_data[(i + POOL_TAP2) & POOL_WORDMASK];
+		w ^= input_pool_data[(i + POOL_TAP3) & POOL_WORDMASK];
+		w ^= input_pool_data[(i + POOL_TAP4) & POOL_WORDMASK];
+		w ^= input_pool_data[(i + POOL_TAP5) & POOL_WORDMASK];
+
+		/* Mix the result back in with a twist */
+		input_pool_data[i] = (w >> 3) ^ twist_table[w & 7];
+
+		/*
+		 * Normally, we add 7 bits of rotation to the pool.
+		 * At the beginning of the pool, add an extra 7 bits
+		 * rotation, so that successive passes spread the
+		 * input bits across the pool evenly.
+		 */
+		input_rotate = (input_rotate + (i ? 7 : 14)) & 31;
+	}
+
+	input_pool.input_rotate = input_rotate;
+	input_pool.add_ptr = i;
+}
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..39c4719
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+set -e
+
+nob_cpus() {
+	echo "[+] Setting non-boot CPUs to status $1"
+	for i in /sys/devices/system/cpu/*/online; do
+		echo "$1" > "$i"
+	done
+}
+
+noturbo() {
+	echo "[+] Setting no-turbo to status $1"
+	if [[ -e /sys/devices/system/cpu/intel_pstate/no_turbo ]]; then
+		echo "$1" > /sys/devices/system/cpu/intel_pstate/no_turbo
+	else
+		local val
+		[[ $1 == 0 ]] && val=0x850089
+		[[ $1 == 1 ]] && val=0x4000850089
+		[[ -n $val ]] || return 0
+		wrmsr -a 0x1a0 $val
+	fi
+}
+
+[[ -e kbench9000.ko ]]
+
+trap "nob_cpus 1; noturbo 0;" INT TERM EXIT
+noturbo 1
+nob_cpus 0
+
+echo "[+] Inserting module to run tests"
+stamp="$(date +%s)"
+insmod kbench9000.ko stamp="$stamp"
+
+echo "[+] Gathering results"
+dmesg | sed -n "s/.*kbench9000: $stamp: \\(.*\\)/\\x1b[37m\\x1b[44m\\x1b[1m\\1\\x1b[0m/p"
diff --git a/tinylfsr.c b/tinylfsr.c
new file mode 100644
index 0000000..fcea1a0
--- /dev/null
+++ b/tinylfsr.c
@@ -0,0 +1,38 @@
+#include <linux/kernel.h>
+
+enum poolinfo {
+	POOL_WORDS = 128,
+	POOL_WORDMASK = POOL_WORDS - 1,
+	POOL_BITS = POOL_WORDS * sizeof(u32) * 8,
+};
+
+static u32 input_pool_data[POOL_WORDS];
+
+static struct {
+	u16 add_ptr;
+} input_pool = { 0 };
+
+void mix_tinylfsr(const void *in, int nbytes)
+{
+	unsigned long i = input_pool.add_ptr, next;
+	const u8 *src = in;
+
+	/* Mix one byte at a time to simplify size handling and churn faster. */
+	while (nbytes--) {
+		/* The simplest possible max period LFSR.
+		 * This is morally equivalent to evaluating
+		 * the input as a polynomial at x^32.
+		 *
+		 * It can be evaluated in parallel, since i+0
+		 * only depends on i+0 and i+1. Thus i+0 depends
+		 * on i+0,i+1, i+1 depends on i+1 and i+2,
+		 * so we can absorb up to 126 bytes in parallel
+		 * if necessary.
+		 */
+		next = (i + 1) & POOL_WORDMASK;
+		input_pool_data[i] = *src++ ^ rol32(input_pool_data[i], 9) ^
+				(input_pool_data[next] << 15) ^ (input_pool_data[next] >> 24);
+		i = next;
+	}
+	input_pool.add_ptr = i;
+}
diff --git a/widelfsr.c b/widelfsr.c
new file mode 100644
index 0000000..088ec7e
--- /dev/null
+++ b/widelfsr.c
@@ -0,0 +1,30 @@
+#include <linux/kernel.h>
+
+enum poolinfo {
+	POOL_WORDS = 128,
+	POOL_WORDMASK = POOL_WORDS - 1,
+	POOL_BITS = POOL_WORDS * sizeof(u32) * 8,
+};
+
+static u32 input_pool_data[POOL_WORDS];
+
+static struct {
+	u16 add_ptr;
+} input_pool = { 0 };
+
+void mix_widelfsr(const void *in, int nbytes)
+{
+	unsigned long i = input_pool.add_ptr;
+	const u8 *src = in;
+
+	/* Mix one byte at a time to simplify size handling and churn faster. */
+	while (nbytes--) {
+		input_pool_data[(i+0)&127] = rol32(input_pool_data[(i+0)&127], 4) ^
+					     (input_pool_data[(i+126)&127] << 7) ^
+					     rol32(input_pool_data[(i+127)&127],12) ^
+					     rol32(input_pool_data[(i+127)&127],21) ^
+					     *src++;
+		i = (i+1)&127;
+	}
+	input_pool.add_ptr = i;
+}
-- 
cgit v1.2.3-59-g8ed1b