Initial scaffoldingjd/mix-pool-bytes-comparison

author: Jason A. Donenfeld <Jason@zx2c4.com> 2018-01-16 13:42:15 +0100
committer: Jason A. Donenfeld <Jason@zx2c4.com> 2022-01-24 16:59:37 +0100
commit: adcaee95cce63db00e0d338b4cc4c8d13ef121c4 (patch)
tree: 41e115318bc5f2afdbd23388dfb2b6c6ce62f06e
download: kbench9000-jd/mix-pool-bytes-comparison.tar.xz
kbench9000-jd/mix-pool-bytes-comparison.zip
12 files changed, 517 insertions, 0 deletions
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7df6fb2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,19 @@
+ifneq ($(KERNELRELEASE),)
+kbench9000-y := main.o original.o crc32loop.o tinylfsr.o widelfsr.o blake2s.o dblblake2s.o k12f800.o k12f1600.o
+obj-m := kbench9000.o
+ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
+else
+KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+default: build
+
+run: build
+	sudo ./run.sh
+build:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD)
+clean:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD) clean
+.PHONY: default run build clean
+endif
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3fd3180
--- /dev/null
+++ b/README.md
@@ -0,0 +1,26 @@
+# kBench9000 &ndash; simple kernel land cycle counter
+### by [Jason A. Donenfeld](mailto:jason@zx2c4.com)
+
+This is a very simple kernel land cycle counter. To use, simply edit `function.h`,
+add any other `.c` files and mention them in the `kbench9000-y +=` line of the
+`Makefile`, and then type:
+
+```
+$ make run
+```
+
+![Expected kBench9000 output](https://data.zx2c4.com/kbench9000-screenshot.png)
+
+### Kernel Toolchain
+
+You'll need to have a working kernel toolchain, usually achievable by:
+
+```
+$ sudo apt install linux-headers-$(uname -r) build-essential
+```
+
+or
+
+```
+$ sudo dnf install kernel-devel @development-tools
+```
diff --git a/blake2s.c b/blake2s.c
new file mode 100644
index 0000000..6d5500a
--- /dev/null
+++ b/blake2s.c
@@ -0,0 +1,9 @@
+#include <linux/kernel.h>
+#include <crypto/blake2s.h>
+
+static struct blake2s_state hash = { .outlen = 32 /* TODO: other constants */ };
+
+void mix_blake2s(const void *in, int nbytes)
+{
+	blake2s_update(&hash, in, nbytes);
+}
diff --git a/crc32loop.c b/crc32loop.c
new file mode 100644
index 0000000..371bc96
--- /dev/null
+++ b/crc32loop.c
@@ -0,0 +1,62 @@
+#include <linux/kernel.h>
+
+enum poolinfo {
+	POOL_WORDS = 128,
+	POOL_WORDMASK = POOL_WORDS - 1,
+	POOL_BITS = POOL_WORDS * sizeof(u32) * 8,
+
+	/* x^128 + x^104 + x^76 + x^51 +x^25 + x + 1 */
+	POOL_TAP1 = 104,
+	POOL_TAP2 = 76,
+	POOL_TAP3 = 51,
+	POOL_TAP4 = 25,
+	POOL_TAP5 = 1,
+};
+
+static u32 input_pool_data[POOL_WORDS];
+
+static struct {
+	u16 add_ptr;
+	u16 input_rotate;
+} input_pool = { 0 };
+
+void mix_crc32loop(const void *in, int nbytes)
+{
+	unsigned long i, j;
+	int input_rotate;
+	const u8 *bytes = in;
+	u32 w;
+
+	input_rotate = input_pool.input_rotate;
+	i = input_pool.add_ptr;
+
+	/* mix one byte at a time to simplify size handling and churn faster */
+	while (nbytes--) {
+		w = rol32(*bytes++, input_rotate);
+		i = (i - 1) & POOL_WORDMASK;
+
+		/* XOR in the various taps */
+		w ^= input_pool_data[i];
+		w ^= input_pool_data[(i + POOL_TAP1) & POOL_WORDMASK];
+		w ^= input_pool_data[(i + POOL_TAP2) & POOL_WORDMASK];
+		w ^= input_pool_data[(i + POOL_TAP3) & POOL_WORDMASK];
+		w ^= input_pool_data[(i + POOL_TAP4) & POOL_WORDMASK];
+		w ^= input_pool_data[(i + POOL_TAP5) & POOL_WORDMASK];
+
+		/* Mix the result back in with a twist */
+		for (j = 0; j < 3; ++j)
+			w = (w >> 1) ^ (0xedb88320 & -(w & 1));
+		input_pool_data[i] = w;
+
+		/*
+		 * Normally, we add 7 bits of rotation to the pool.
+		 * At the beginning of the pool, add an extra 7 bits
+		 * rotation, so that successive passes spread the
+		 * input bits across the pool evenly.
+		 */
+		input_rotate = (input_rotate + (i ? 7 : 14)) & 31;
+	}
+
+	input_pool.input_rotate = input_rotate;
+	input_pool.add_ptr = i;
+}
diff --git a/dblblake2s.c b/dblblake2s.c
new file mode 100644
index 0000000..54f5515
--- /dev/null
+++ b/dblblake2s.c
@@ -0,0 +1,10 @@
+#include <linux/kernel.h>
+#include <crypto/blake2s.h>
+
+static struct blake2s_state hash[2] = { {.outlen = 32 /* TODO: other constants */ }, {.outlen = 32 /* TODO: other constants */ } };
+
+void mix_dblblake2s(const void *in, int nbytes)
+{
+	blake2s_update(&hash[0], in, nbytes);
+	blake2s_update(&hash[1], in, nbytes);
+}
diff --git a/k12f1600.c b/k12f1600.c
new file mode 100644
index 0000000..26d974b
--- /dev/null
+++ b/k12f1600.c
@@ -0,0 +1,54 @@
+#include <linux/kernel.h>
+#include <crypto/algapi.h>
+
+enum poolinfo {
+	POOL_BITS = 512,
+	POOL_BYTES = POOL_BITS / 8
+};
+
+static const u64 keccak_round_constants[12] = {
+	0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, 0x8000000080008000ULL,
+	0x000000000000808bULL, 0x0000000080000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL,
+	0x000000000000008aULL, 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL
+};
+
+static const int keccak_rotations[5][5] = {
+	{ 0,  1,  62, 28, 27 }, { 36, 44, 6,  55, 20 }, { 3,  10, 43, 25, 39 },
+	{ 41, 45, 15, 21, 8  }, { 18, 2,  61, 56, 14 }
+};
+
+static void keccak(u64 A[5][5])
+{
+	u64 B[5][5], C[5];
+	int r, x, y;
+
+#define N5(N, expr) N = 0, expr; N = 1, expr; N = 2, expr; N = 3, expr; N = 4, expr;
+
+	for (r = 0; r < 12; ++r) {
+		N5(x, C[x] = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x]);
+		N5(x, N5(y, A[y][x] ^= C[(x + 4) % 5] ^ rol64(C[(x + 1) % 5], 1)));
+		N5(x, N5(y, B[(2 * x + 3 * y) % 5][y] = rol64(A[y][x], keccak_rotations[y][x])));
+		N5(x, N5(y, A[y][x] = B[y][x] ^ (~B[y][(x + 1) % 5] & B[y][(x + 2) % 5])));
+		A[0][0] ^= keccak_round_constants[r];
+	}
+}
+
+static u64 state[5][5];
+static int idx;
+
+void mix_k12f1600(const void *in, int nbytes)
+{
+	const u8 *src = in;
+	u8 *dst = (u8 *)state;
+
+	while (nbytes) {
+		size_t l = min_t(size_t, nbytes, sizeof(state) - POOL_BYTES - idx);
+		if (!l) {
+			keccak(state);
+			idx = 0;
+			continue;
+		}
+		crypto_xor(&dst[idx], src, l);
+		nbytes -= l, src += l, idx += l;
+	}
+}
diff --git a/k12f800.c b/k12f800.c
new file mode 100644
index 0000000..67cc2b3
--- /dev/null
+++ b/k12f800.c
@@ -0,0 +1,54 @@
+#include <linux/kernel.h>
+#include <crypto/algapi.h>
+
+enum poolinfo {
+	POOL_BITS = 512,
+	POOL_BYTES = POOL_BITS / 8
+};
+
+static const u32 keccak_round_constants[12] = {
+	0x00000001, 0x00008082, 0x0000808a, 0x80008000,
+	0x0000808b, 0x80000001, 0x80008081, 0x00008009,
+	0x0000008a, 0x00000088, 0x80008009, 0x8000000a
+};
+
+static const int keccak_rotations[5][5] = {
+	{ 0,  1,  62, 28, 27 }, { 36, 44, 6,  55, 20 }, { 3,  10, 43, 25, 39 },
+	{ 41, 45, 15, 21, 8  }, { 18, 2,  61, 56, 14 }
+};
+
+static void keccak(u32 A[5][5])
+{
+	u32 B[5][5], C[5];
+	int r, x, y;
+
+#define N5(N, expr) N = 0, expr; N = 1, expr; N = 2, expr; N = 3, expr; N = 4, expr;
+
+	for (r = 0; r < 12; ++r) {
+		N5(x, C[x] = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x]);
+		N5(x, N5(y, A[y][x] ^= C[(x + 4) % 5] ^ rol64(C[(x + 1) % 5], 1)));
+		N5(x, N5(y, B[(2 * x + 3 * y) % 5][y] = rol64(A[y][x], keccak_rotations[y][x])));
+		N5(x, N5(y, A[y][x] = B[y][x] ^ (~B[y][(x + 1) % 5] & B[y][(x + 2) % 5])));
+		A[0][0] ^= keccak_round_constants[r];
+	}
+}
+
+static u32 state[5][5];
+static int idx;
+
+void mix_k12f800(const void *in, int nbytes)
+{
+	const u8 *src = in;
+	u8 *dst = (u8 *)state;
+
+	while (nbytes) {
+		size_t l = min_t(size_t, nbytes, sizeof(state) - POOL_BYTES - idx);
+		if (!l) {
+			keccak(state);
+			idx = 0;
+			continue;
+		}
+		crypto_xor(&dst[idx], src, l);
+		nbytes -= l, src += l, idx += l;
+	}
+}
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..7b74e68
--- /dev/null
+++ b/main.c
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/random.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/fpu/api.h>
+#include <asm/simd.h>
+
+static unsigned long stamp = 0;
+module_param(stamp, ulong, 0);
+
+#define declare_it(name) bool mix_ ##name(void *input, size_t len);
+
+#define do_it(name, input, len) do { \
+	u32 eax = 0, ebx = 0, ecx = 0, edx = 0; \
+	for (i = 0; i < WARMUP; ++i) \
+		ret |= mix_ ##name(input, len); \
+	asm volatile("cpuid" : "+a" (eax), "=b" (ebx), "=d" (edx), "+c" (ecx)); \
+	for (i = 0; i <= TRIALS; ++i) { \
+		trial_times[i] = get_cycles(); \
+		ret |= mix_ ##name(input, len); \
+	} \
+	for (i = 0; i < TRIALS; ++i) \
+		trial_times[i] = trial_times[i + 1] - trial_times[i]; \
+	sort(trial_times, TRIALS + 1, sizeof(cycles_t), compare_cycles, NULL); \
+	median_ ## name = trial_times[TRIALS / 2]; \
+} while (0)
+
+#define report_it(name) do { \
+	pr_err("%lu: %12s: %6llu cycles per call\n", stamp, #name, median_ ## name); \
+} while (0)
+
+
+declare_it(original)
+declare_it(crc32loop)
+declare_it(tinylfsr)
+declare_it(widelfsr)
+declare_it(blake2s)
+declare_it(dblblake2s)
+declare_it(k12f800)
+declare_it(k12f1600)
+
+static int compare_cycles(const void *a, const void *b)
+{
+	return *((cycles_t *)a) - *((cycles_t *)b);
+}
+
+static int __init mod_init(void)
+{
+	enum { WARMUP = 6000, TRIALS = 10000, IDLE = 1 * 1000, LEN = 4096 };
+	static u8 buffer[LEN];
+	int ret = 0, i;
+	cycles_t *trial_times;
+	cycles_t median_original = 0;
+	cycles_t median_crc32loop = 0;
+	cycles_t median_tinylfsr = 0;
+	cycles_t median_widelfsr = 0;
+	cycles_t median_blake2s = 0;
+	cycles_t median_dblblake2s = 0;
+	cycles_t median_k12f800 = 0;
+	cycles_t median_k12f1600 = 0;
+	unsigned long flags;
+	DEFINE_SPINLOCK(lock);
+
+	get_random_bytes(buffer, sizeof(buffer));
+
+	trial_times = kcalloc(TRIALS + 1, sizeof(cycles_t), GFP_KERNEL);
+	if (!trial_times)
+		return -ENOMEM;
+
+	msleep(IDLE);
+
+	spin_lock_irqsave(&lock, flags);
+
+	do_it(original, buffer, sizeof(buffer));
+	do_it(crc32loop, buffer, sizeof(buffer));
+	do_it(tinylfsr, buffer, sizeof(buffer));
+	do_it(widelfsr, buffer, sizeof(buffer));
+	do_it(blake2s, buffer, sizeof(buffer));
+	do_it(dblblake2s, buffer, sizeof(buffer));
+	do_it(k12f800, buffer, sizeof(buffer));
+	do_it(k12f1600, buffer, sizeof(buffer));
+
+	spin_unlock_irqrestore(&lock, flags);
+	
+	report_it(original);
+	report_it(crc32loop);
+	report_it(tinylfsr);
+	report_it(widelfsr);
+	report_it(blake2s);
+	report_it(dblblake2s);
+	report_it(k12f800);
+	report_it(k12f1600);
+
+	kfree(trial_times);
+	
+	/* We should never actually agree to insert the module. Choosing
+	 * -0x1000 here is an amazing hack. It causes the kernel to not
+	 * actually load the module, while the standard userspace tools
+	 * don't return an error, because it's too big. */
+	return -0x1000;
+}
+
+module_init(mod_init);
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("kBench9000 Cycle Counter");
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/original.c b/original.c
new file mode 100644
index 0000000..32be348
--- /dev/null
+++ b/original.c
@@ -0,0 +1,65 @@
+#include <linux/kernel.h>
+
+enum poolinfo {
+	POOL_WORDS = 128,
+	POOL_WORDMASK = POOL_WORDS - 1,
+	POOL_BITS = POOL_WORDS * sizeof(u32) * 8,
+
+	/* x^128 + x^104 + x^76 + x^51 +x^25 + x + 1 */
+	POOL_TAP1 = 104,
+	POOL_TAP2 = 76,
+	POOL_TAP3 = 51,
+	POOL_TAP4 = 25,
+	POOL_TAP5 = 1,
+};
+
+static u32 input_pool_data[POOL_WORDS];
+
+static struct {
+	u16 add_ptr;
+	u16 input_rotate;
+} input_pool = { 0 };
+
+static const u32 twist_table[8] = {
+	0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
+	0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278
+};
+
+void mix_original(const void *in, int nbytes)
+{
+	unsigned long i;
+	int input_rotate;
+	const u8 *bytes = in;
+	u32 w;
+
+	input_rotate = input_pool.input_rotate;
+	i = input_pool.add_ptr;
+
+	/* mix one byte at a time to simplify size handling and churn faster */
+	while (nbytes--) {
+		w = rol32(*bytes++, input_rotate);
+		i = (i - 1) & POOL_WORDMASK;
+
+		/* XOR in the various taps */
+		w ^= input_pool_data[i];
+		w ^= input_pool_data[(i + POOL_TAP1) & POOL_WORDMASK];
+		w ^= input_pool_data[(i + POOL_TAP2) & POOL_WORDMASK];
+		w ^= input_pool_data[(i + POOL_TAP3) & POOL_WORDMASK];
+		w ^= input_pool_data[(i + POOL_TAP4) & POOL_WORDMASK];
+		w ^= input_pool_data[(i + POOL_TAP5) & POOL_WORDMASK];
+
+		/* Mix the result back in with a twist */
+		input_pool_data[i] = (w >> 3) ^ twist_table[w & 7];
+
+		/*
+		 * Normally, we add 7 bits of rotation to the pool.
+		 * At the beginning of the pool, add an extra 7 bits
+		 * rotation, so that successive passes spread the
+		 * input bits across the pool evenly.
+		 */
+		input_rotate = (input_rotate + (i ? 7 : 14)) & 31;
+	}
+
+	input_pool.input_rotate = input_rotate;
+	input_pool.add_ptr = i;
+}
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..39c4719
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+set -e
+
+nob_cpus() {
+	echo "[+] Setting non-boot CPUs to status $1"
+	for i in /sys/devices/system/cpu/*/online; do
+		echo "$1" > "$i"
+	done
+}
+
+noturbo() {
+	echo "[+] Setting no-turbo to status $1"
+	if [[ -e /sys/devices/system/cpu/intel_pstate/no_turbo ]]; then
+		echo "$1" > /sys/devices/system/cpu/intel_pstate/no_turbo
+	else
+		local val
+		[[ $1 == 0 ]] && val=0x850089
+		[[ $1 == 1 ]] && val=0x4000850089
+		[[ -n $val ]] || return 0
+		wrmsr -a 0x1a0 $val
+	fi
+}
+
+[[ -e kbench9000.ko ]]
+
+trap "nob_cpus 1; noturbo 0;" INT TERM EXIT
+noturbo 1
+nob_cpus 0
+
+echo "[+] Inserting module to run tests"
+stamp="$(date +%s)"
+insmod kbench9000.ko stamp="$stamp"
+
+echo "[+] Gathering results"
+dmesg | sed -n "s/.*kbench9000: $stamp: \\(.*\\)/\\x1b[37m\\x1b[44m\\x1b[1m\\1\\x1b[0m/p"
diff --git a/tinylfsr.c b/tinylfsr.c
new file mode 100644
index 0000000..fcea1a0
--- /dev/null
+++ b/tinylfsr.c
@@ -0,0 +1,38 @@
+#include <linux/kernel.h>
+
+enum poolinfo {
+	POOL_WORDS = 128,
+	POOL_WORDMASK = POOL_WORDS - 1,
+	POOL_BITS = POOL_WORDS * sizeof(u32) * 8,
+};
+
+static u32 input_pool_data[POOL_WORDS];
+
+static struct {
+	u16 add_ptr;
+} input_pool = { 0 };
+
+void mix_tinylfsr(const void *in, int nbytes)
+{
+	unsigned long i = input_pool.add_ptr, next;
+	const u8 *src = in;
+
+	/* Mix one byte at a time to simplify size handling and churn faster. */
+	while (nbytes--) {
+		/* The simplest possible max period LFSR.
+		 * This is morally equivalent to evaluating
+		 * the input as a polynomial at x^32.
+		 *
+		 * It can be evaluated in parallel, since i+0
+		 * only depends on i+0 and i+1. Thus i+0 depends
+		 * on i+0,i+1, i+1 depends on i+1 and i+2,
+		 * so we can absorb up to 126 bytes in parallel
+		 * if necessary.
+		 */
+		next = (i + 1) & POOL_WORDMASK;
+		input_pool_data[i] = *src++ ^ rol32(input_pool_data[i], 9) ^
+				(input_pool_data[next] << 15) ^ (input_pool_data[next] >> 24);
+		i = next;
+	}
+	input_pool.add_ptr = i;
+}
diff --git a/widelfsr.c b/widelfsr.c
new file mode 100644
index 0000000..088ec7e
--- /dev/null
+++ b/widelfsr.c
@@ -0,0 +1,30 @@
+#include <linux/kernel.h>
+
+enum poolinfo {
+	POOL_WORDS = 128,
+	POOL_WORDMASK = POOL_WORDS - 1,
+	POOL_BITS = POOL_WORDS * sizeof(u32) * 8,
+};
+
+static u32 input_pool_data[POOL_WORDS];
+
+static struct {
+	u16 add_ptr;
+} input_pool = { 0 };
+
+void mix_widelfsr(const void *in, int nbytes)
+{
+	unsigned long i = input_pool.add_ptr;
+	const u8 *src = in;
+
+	/* Mix one byte at a time to simplify size handling and churn faster. */
+	while (nbytes--) {
+		input_pool_data[(i+0)&127] = rol32(input_pool_data[(i+0)&127], 4) ^
+					     (input_pool_data[(i+126)&127] << 7) ^
+					     rol32(input_pool_data[(i+127)&127],12) ^
+					     rol32(input_pool_data[(i+127)&127],21) ^
+					     *src++;
+		i = (i+1)&127;
+	}
+	input_pool.add_ptr = i;
+}
author	Jason A. Donenfeld <Jason@zx2c4.com>	2018-01-16 13:42:15 +0100
committer	Jason A. Donenfeld <Jason@zx2c4.com>	2022-01-24 16:59:37 +0100
commit	adcaee95cce63db00e0d338b4cc4c8d13ef121c4 (patch)
tree	41e115318bc5f2afdbd23388dfb2b6c6ce62f06e
download	kbench9000-jd/mix-pool-bytes-comparison.tar.xz kbench9000-jd/mix-pool-bytes-comparison.zip