aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJason A. Donenfeld <Jason@zx2c4.com>2018-01-16 13:42:15 +0100
committerJason A. Donenfeld <Jason@zx2c4.com>2022-01-24 16:59:37 +0100
commitadcaee95cce63db00e0d338b4cc4c8d13ef121c4 (patch)
tree41e115318bc5f2afdbd23388dfb2b6c6ce62f06e
downloadkbench9000-jd/mix-pool-bytes-comparison.tar.xz
kbench9000-jd/mix-pool-bytes-comparison.zip
Initial scaffoldingjd/mix-pool-bytes-comparison
-rw-r--r--Makefile19
-rw-r--r--README.md26
-rw-r--r--blake2s.c9
-rw-r--r--crc32loop.c62
-rw-r--r--dblblake2s.c10
-rw-r--r--k12f1600.c54
-rw-r--r--k12f800.c54
-rw-r--r--main.c115
-rw-r--r--original.c65
-rwxr-xr-xrun.sh35
-rw-r--r--tinylfsr.c38
-rw-r--r--widelfsr.c30
12 files changed, 517 insertions, 0 deletions
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7df6fb2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,19 @@
+ifneq ($(KERNELRELEASE),)
+kbench9000-y := main.o original.o crc32loop.o tinylfsr.o widelfsr.o blake2s.o dblblake2s.o k12f800.o k12f1600.o
+obj-m := kbench9000.o
+ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
+else
+KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+default: build
+
+run: build
+ sudo ./run.sh
+build:
+ $(MAKE) -C $(KERNELDIR) M=$(PWD)
+clean:
+ $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
+.PHONY: default run build clean
+endif
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3fd3180
--- /dev/null
+++ b/README.md
@@ -0,0 +1,26 @@
+# kBench9000 &ndash; simple kernel land cycle counter
+### by [Jason A. Donenfeld](mailto:jason@zx2c4.com)
+
+This is a very simple kernel land cycle counter. To use, simply edit `function.h`,
+add any other `.c` files and mention them in the `kbench9000-y +=` line of the
+`Makefile`, and then type:
+
+```
+$ make run
+```
+
+![Expected kBench9000 output](https://data.zx2c4.com/kbench9000-screenshot.png)
+
+### Kernel Toolchain
+
+You'll need to have a working kernel toolchain, usually achievable by:
+
+```
+$ sudo apt install linux-headers-$(uname -r) build-essential
+```
+
+or
+
+```
+$ sudo dnf install kernel-devel @development-tools
+```
diff --git a/blake2s.c b/blake2s.c
new file mode 100644
index 0000000..6d5500a
--- /dev/null
+++ b/blake2s.c
@@ -0,0 +1,9 @@
+#include <linux/kernel.h>
+#include <crypto/blake2s.h>
+
+static struct blake2s_state hash = { .outlen = 32 /* TODO: other constants */ };
+
+void mix_blake2s(const void *in, int nbytes)
+{
+ blake2s_update(&hash, in, nbytes);
+}
diff --git a/crc32loop.c b/crc32loop.c
new file mode 100644
index 0000000..371bc96
--- /dev/null
+++ b/crc32loop.c
@@ -0,0 +1,62 @@
+#include <linux/kernel.h>
+
+enum poolinfo {
+ POOL_WORDS = 128,
+ POOL_WORDMASK = POOL_WORDS - 1,
+ POOL_BITS = POOL_WORDS * sizeof(u32) * 8,
+
+ /* x^128 + x^104 + x^76 + x^51 +x^25 + x + 1 */
+ POOL_TAP1 = 104,
+ POOL_TAP2 = 76,
+ POOL_TAP3 = 51,
+ POOL_TAP4 = 25,
+ POOL_TAP5 = 1,
+};
+
+static u32 input_pool_data[POOL_WORDS];
+
+static struct {
+ u16 add_ptr;
+ u16 input_rotate;
+} input_pool = { 0 };
+
+void mix_crc32loop(const void *in, int nbytes)
+{
+ unsigned long i, j;
+ int input_rotate;
+ const u8 *bytes = in;
+ u32 w;
+
+ input_rotate = input_pool.input_rotate;
+ i = input_pool.add_ptr;
+
+ /* mix one byte at a time to simplify size handling and churn faster */
+ while (nbytes--) {
+ w = rol32(*bytes++, input_rotate);
+ i = (i - 1) & POOL_WORDMASK;
+
+ /* XOR in the various taps */
+ w ^= input_pool_data[i];
+ w ^= input_pool_data[(i + POOL_TAP1) & POOL_WORDMASK];
+ w ^= input_pool_data[(i + POOL_TAP2) & POOL_WORDMASK];
+ w ^= input_pool_data[(i + POOL_TAP3) & POOL_WORDMASK];
+ w ^= input_pool_data[(i + POOL_TAP4) & POOL_WORDMASK];
+ w ^= input_pool_data[(i + POOL_TAP5) & POOL_WORDMASK];
+
+ /* Mix the result back in with a twist */
+ for (j = 0; j < 3; ++j)
+ w = (w >> 1) ^ (0xedb88320 & -(w & 1));
+ input_pool_data[i] = w;
+
+ /*
+ * Normally, we add 7 bits of rotation to the pool.
+ * At the beginning of the pool, add an extra 7 bits
+ * rotation, so that successive passes spread the
+ * input bits across the pool evenly.
+ */
+ input_rotate = (input_rotate + (i ? 7 : 14)) & 31;
+ }
+
+ input_pool.input_rotate = input_rotate;
+ input_pool.add_ptr = i;
+}
diff --git a/dblblake2s.c b/dblblake2s.c
new file mode 100644
index 0000000..54f5515
--- /dev/null
+++ b/dblblake2s.c
@@ -0,0 +1,10 @@
+#include <linux/kernel.h>
+#include <crypto/blake2s.h>
+
+static struct blake2s_state hash[2] = { {.outlen = 32 /* TODO: other constants */ }, {.outlen = 32 /* TODO: other constants */ } };
+
+void mix_dblblake2s(const void *in, int nbytes)
+{
+ blake2s_update(&hash[0], in, nbytes);
+ blake2s_update(&hash[1], in, nbytes);
+}
diff --git a/k12f1600.c b/k12f1600.c
new file mode 100644
index 0000000..26d974b
--- /dev/null
+++ b/k12f1600.c
@@ -0,0 +1,54 @@
+#include <linux/kernel.h>
+#include <crypto/algapi.h>
+
+enum poolinfo {
+ POOL_BITS = 512,
+ POOL_BYTES = POOL_BITS / 8
+};
+
+static const u64 keccak_round_constants[12] = {
+ 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, 0x8000000080008000ULL,
+ 0x000000000000808bULL, 0x0000000080000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL,
+ 0x000000000000008aULL, 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL
+};
+
+static const int keccak_rotations[5][5] = {
+ { 0, 1, 62, 28, 27 }, { 36, 44, 6, 55, 20 }, { 3, 10, 43, 25, 39 },
+ { 41, 45, 15, 21, 8 }, { 18, 2, 61, 56, 14 }
+};
+
+static void keccak(u64 A[5][5])
+{
+ u64 B[5][5], C[5];
+ int r, x, y;
+
+#define N5(N, expr) N = 0, expr; N = 1, expr; N = 2, expr; N = 3, expr; N = 4, expr;
+
+ for (r = 0; r < 12; ++r) {
+ N5(x, C[x] = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x]);
+ N5(x, N5(y, A[y][x] ^= C[(x + 4) % 5] ^ rol64(C[(x + 1) % 5], 1)));
+ N5(x, N5(y, B[(2 * x + 3 * y) % 5][y] = rol64(A[y][x], keccak_rotations[y][x])));
+ N5(x, N5(y, A[y][x] = B[y][x] ^ (~B[y][(x + 1) % 5] & B[y][(x + 2) % 5])));
+ A[0][0] ^= keccak_round_constants[r];
+ }
+}
+
+static u64 state[5][5];
+static int idx;
+
+void mix_k12f1600(const void *in, int nbytes)
+{
+ const u8 *src = in;
+ u8 *dst = (u8 *)state;
+
+ while (nbytes) {
+ size_t l = min_t(size_t, nbytes, sizeof(state) - POOL_BYTES - idx);
+ if (!l) {
+ keccak(state);
+ idx = 0;
+ continue;
+ }
+ crypto_xor(&dst[idx], src, l);
+ nbytes -= l, src += l, idx += l;
+ }
+}
diff --git a/k12f800.c b/k12f800.c
new file mode 100644
index 0000000..67cc2b3
--- /dev/null
+++ b/k12f800.c
@@ -0,0 +1,54 @@
+#include <linux/kernel.h>
+#include <crypto/algapi.h>
+
+enum poolinfo {
+ POOL_BITS = 512,
+ POOL_BYTES = POOL_BITS / 8
+};
+
+static const u32 keccak_round_constants[12] = {
+ 0x00000001, 0x00008082, 0x0000808a, 0x80008000,
+ 0x0000808b, 0x80000001, 0x80008081, 0x00008009,
+ 0x0000008a, 0x00000088, 0x80008009, 0x8000000a
+};
+
+static const int keccak_rotations[5][5] = {
+ { 0, 1, 62, 28, 27 }, { 36, 44, 6, 55, 20 }, { 3, 10, 43, 25, 39 },
+ { 41, 45, 15, 21, 8 }, { 18, 2, 61, 56, 14 }
+};
+
+static void keccak(u32 A[5][5])
+{
+ u32 B[5][5], C[5];
+ int r, x, y;
+
+#define N5(N, expr) N = 0, expr; N = 1, expr; N = 2, expr; N = 3, expr; N = 4, expr;
+
+ for (r = 0; r < 12; ++r) {
+ N5(x, C[x] = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x]);
+ N5(x, N5(y, A[y][x] ^= C[(x + 4) % 5] ^ rol64(C[(x + 1) % 5], 1)));
+ N5(x, N5(y, B[(2 * x + 3 * y) % 5][y] = rol64(A[y][x], keccak_rotations[y][x])));
+ N5(x, N5(y, A[y][x] = B[y][x] ^ (~B[y][(x + 1) % 5] & B[y][(x + 2) % 5])));
+ A[0][0] ^= keccak_round_constants[r];
+ }
+}
+
+static u32 state[5][5];
+static int idx;
+
+void mix_k12f800(const void *in, int nbytes)
+{
+ const u8 *src = in;
+ u8 *dst = (u8 *)state;
+
+ while (nbytes) {
+ size_t l = min_t(size_t, nbytes, sizeof(state) - POOL_BYTES - idx);
+ if (!l) {
+ keccak(state);
+ idx = 0;
+ continue;
+ }
+ crypto_xor(&dst[idx], src, l);
+ nbytes -= l, src += l, idx += l;
+ }
+}
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..7b74e68
--- /dev/null
+++ b/main.c
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/random.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/fpu/api.h>
+#include <asm/simd.h>
+
+static unsigned long stamp = 0;
+module_param(stamp, ulong, 0);
+
+#define declare_it(name) bool mix_ ##name(void *input, size_t len);
+
+#define do_it(name, input, len) do { \
+ u32 eax = 0, ebx = 0, ecx = 0, edx = 0; \
+ for (i = 0; i < WARMUP; ++i) \
+ ret |= mix_ ##name(input, len); \
+ asm volatile("cpuid" : "+a" (eax), "=b" (ebx), "=d" (edx), "+c" (ecx)); \
+ for (i = 0; i <= TRIALS; ++i) { \
+ trial_times[i] = get_cycles(); \
+ ret |= mix_ ##name(input, len); \
+ } \
+ for (i = 0; i < TRIALS; ++i) \
+ trial_times[i] = trial_times[i + 1] - trial_times[i]; \
+ sort(trial_times, TRIALS + 1, sizeof(cycles_t), compare_cycles, NULL); \
+ median_ ## name = trial_times[TRIALS / 2]; \
+} while (0)
+
+#define report_it(name) do { \
+ pr_err("%lu: %12s: %6llu cycles per call\n", stamp, #name, median_ ## name); \
+} while (0)
+
+
+declare_it(original)
+declare_it(crc32loop)
+declare_it(tinylfsr)
+declare_it(widelfsr)
+declare_it(blake2s)
+declare_it(dblblake2s)
+declare_it(k12f800)
+declare_it(k12f1600)
+
+static int compare_cycles(const void *a, const void *b)
+{
+ return *((cycles_t *)a) - *((cycles_t *)b);
+}
+
+static int __init mod_init(void)
+{
+ enum { WARMUP = 6000, TRIALS = 10000, IDLE = 1 * 1000, LEN = 4096 };
+ static u8 buffer[LEN];
+ int ret = 0, i;
+ cycles_t *trial_times;
+ cycles_t median_original = 0;
+ cycles_t median_crc32loop = 0;
+ cycles_t median_tinylfsr = 0;
+ cycles_t median_widelfsr = 0;
+ cycles_t median_blake2s = 0;
+ cycles_t median_dblblake2s = 0;
+ cycles_t median_k12f800 = 0;
+ cycles_t median_k12f1600 = 0;
+ unsigned long flags;
+ DEFINE_SPINLOCK(lock);
+
+ get_random_bytes(buffer, sizeof(buffer));
+
+ trial_times = kcalloc(TRIALS + 1, sizeof(cycles_t), GFP_KERNEL);
+ if (!trial_times)
+ return -ENOMEM;
+
+ msleep(IDLE);
+
+ spin_lock_irqsave(&lock, flags);
+
+ do_it(original, buffer, sizeof(buffer));
+ do_it(crc32loop, buffer, sizeof(buffer));
+ do_it(tinylfsr, buffer, sizeof(buffer));
+ do_it(widelfsr, buffer, sizeof(buffer));
+ do_it(blake2s, buffer, sizeof(buffer));
+ do_it(dblblake2s, buffer, sizeof(buffer));
+ do_it(k12f800, buffer, sizeof(buffer));
+ do_it(k12f1600, buffer, sizeof(buffer));
+
+ spin_unlock_irqrestore(&lock, flags);
+
+ report_it(original);
+ report_it(crc32loop);
+ report_it(tinylfsr);
+ report_it(widelfsr);
+ report_it(blake2s);
+ report_it(dblblake2s);
+ report_it(k12f800);
+ report_it(k12f1600);
+
+ kfree(trial_times);
+
+ /* We should never actually agree to insert the module. Choosing
+ * -0x1000 here is an amazing hack. It causes the kernel to not
+ * actually load the module, while the standard userspace tools
+ * don't return an error, because it's too big. */
+ return -0x1000;
+}
+
+module_init(mod_init);
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("kBench9000 Cycle Counter");
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/original.c b/original.c
new file mode 100644
index 0000000..32be348
--- /dev/null
+++ b/original.c
@@ -0,0 +1,65 @@
+#include <linux/kernel.h>
+
+enum poolinfo {
+ POOL_WORDS = 128,
+ POOL_WORDMASK = POOL_WORDS - 1,
+ POOL_BITS = POOL_WORDS * sizeof(u32) * 8,
+
+ /* x^128 + x^104 + x^76 + x^51 +x^25 + x + 1 */
+ POOL_TAP1 = 104,
+ POOL_TAP2 = 76,
+ POOL_TAP3 = 51,
+ POOL_TAP4 = 25,
+ POOL_TAP5 = 1,
+};
+
+static u32 input_pool_data[POOL_WORDS];
+
+static struct {
+ u16 add_ptr;
+ u16 input_rotate;
+} input_pool = { 0 };
+
+static const u32 twist_table[8] = {
+ 0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
+ 0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278
+};
+
+void mix_original(const void *in, int nbytes)
+{
+ unsigned long i;
+ int input_rotate;
+ const u8 *bytes = in;
+ u32 w;
+
+ input_rotate = input_pool.input_rotate;
+ i = input_pool.add_ptr;
+
+ /* mix one byte at a time to simplify size handling and churn faster */
+ while (nbytes--) {
+ w = rol32(*bytes++, input_rotate);
+ i = (i - 1) & POOL_WORDMASK;
+
+ /* XOR in the various taps */
+ w ^= input_pool_data[i];
+ w ^= input_pool_data[(i + POOL_TAP1) & POOL_WORDMASK];
+ w ^= input_pool_data[(i + POOL_TAP2) & POOL_WORDMASK];
+ w ^= input_pool_data[(i + POOL_TAP3) & POOL_WORDMASK];
+ w ^= input_pool_data[(i + POOL_TAP4) & POOL_WORDMASK];
+ w ^= input_pool_data[(i + POOL_TAP5) & POOL_WORDMASK];
+
+ /* Mix the result back in with a twist */
+ input_pool_data[i] = (w >> 3) ^ twist_table[w & 7];
+
+ /*
+ * Normally, we add 7 bits of rotation to the pool.
+ * At the beginning of the pool, add an extra 7 bits
+ * rotation, so that successive passes spread the
+ * input bits across the pool evenly.
+ */
+ input_rotate = (input_rotate + (i ? 7 : 14)) & 31;
+ }
+
+ input_pool.input_rotate = input_rotate;
+ input_pool.add_ptr = i;
+}
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..39c4719
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+set -e
+
+nob_cpus() {
+ echo "[+] Setting non-boot CPUs to status $1"
+ for i in /sys/devices/system/cpu/*/online; do
+ echo "$1" > "$i"
+ done
+}
+
+noturbo() {
+ echo "[+] Setting no-turbo to status $1"
+ if [[ -e /sys/devices/system/cpu/intel_pstate/no_turbo ]]; then
+ echo "$1" > /sys/devices/system/cpu/intel_pstate/no_turbo
+ else
+ local val
+ [[ $1 == 0 ]] && val=0x850089
+ [[ $1 == 1 ]] && val=0x4000850089
+ [[ -n $val ]] || return 0
+ wrmsr -a 0x1a0 $val
+ fi
+}
+
+[[ -e kbench9000.ko ]]
+
+trap "nob_cpus 1; noturbo 0;" INT TERM EXIT
+noturbo 1
+nob_cpus 0
+
+echo "[+] Inserting module to run tests"
+stamp="$(date +%s)"
+insmod kbench9000.ko stamp="$stamp"
+
+echo "[+] Gathering results"
+dmesg | sed -n "s/.*kbench9000: $stamp: \\(.*\\)/\\x1b[37m\\x1b[44m\\x1b[1m\\1\\x1b[0m/p"
diff --git a/tinylfsr.c b/tinylfsr.c
new file mode 100644
index 0000000..fcea1a0
--- /dev/null
+++ b/tinylfsr.c
@@ -0,0 +1,38 @@
+#include <linux/kernel.h>
+
+enum poolinfo {
+ POOL_WORDS = 128,
+ POOL_WORDMASK = POOL_WORDS - 1,
+ POOL_BITS = POOL_WORDS * sizeof(u32) * 8,
+};
+
+static u32 input_pool_data[POOL_WORDS];
+
+static struct {
+ u16 add_ptr;
+} input_pool = { 0 };
+
+void mix_tinylfsr(const void *in, int nbytes)
+{
+ unsigned long i = input_pool.add_ptr, next;
+ const u8 *src = in;
+
+ /* Mix one byte at a time to simplify size handling and churn faster. */
+ while (nbytes--) {
+ /* The simplest possible max period LFSR.
+ * This is morally equivalent to evaluating
+ * the input as a polynomial at x^32.
+ *
+ * It can be evaluated in parallel, since i+0
+ * only depends on i+0 and i+1. Thus i+0 depends
+ * on i+0,i+1, i+1 depends on i+1 and i+2,
+ * so we can absorb up to 126 bytes in parallel
+ * if necessary.
+ */
+ next = (i + 1) & POOL_WORDMASK;
+ input_pool_data[i] = *src++ ^ rol32(input_pool_data[i], 9) ^
+ (input_pool_data[next] << 15) ^ (input_pool_data[next] >> 24);
+ i = next;
+ }
+ input_pool.add_ptr = i;
+}
diff --git a/widelfsr.c b/widelfsr.c
new file mode 100644
index 0000000..088ec7e
--- /dev/null
+++ b/widelfsr.c
@@ -0,0 +1,30 @@
+#include <linux/kernel.h>
+
+enum poolinfo {
+ POOL_WORDS = 128,
+ POOL_WORDMASK = POOL_WORDS - 1,
+ POOL_BITS = POOL_WORDS * sizeof(u32) * 8,
+};
+
+static u32 input_pool_data[POOL_WORDS];
+
+static struct {
+ u16 add_ptr;
+} input_pool = { 0 };
+
+void mix_widelfsr(const void *in, int nbytes)
+{
+ unsigned long i = input_pool.add_ptr;
+ const u8 *src = in;
+
+ /* Mix one byte at a time to simplify size handling and churn faster. */
+ while (nbytes--) {
+ input_pool_data[(i+0)&127] = rol32(input_pool_data[(i+0)&127], 4) ^
+ (input_pool_data[(i+126)&127] << 7) ^
+ rol32(input_pool_data[(i+127)&127],12) ^
+ rol32(input_pool_data[(i+127)&127],21) ^
+ *src++;
+ i = (i+1)&127;
+ }
+ input_pool.add_ptr = i;
+}