diff options
author | 2018-01-16 13:42:15 +0100 | |
---|---|---|
committer | 2022-01-24 16:59:37 +0100 | |
commit | adcaee95cce63db00e0d338b4cc4c8d13ef121c4 (patch) | |
tree | 41e115318bc5f2afdbd23388dfb2b6c6ce62f06e | |
download | kbench9000-adcaee95cce63db00e0d338b4cc4c8d13ef121c4.tar.xz kbench9000-adcaee95cce63db00e0d338b4cc4c8d13ef121c4.zip |
Initial scaffoldingjd/mix-pool-bytes-comparison
-rw-r--r-- | Makefile | 19 | ||||
-rw-r--r-- | README.md | 26 | ||||
-rw-r--r-- | blake2s.c | 9 | ||||
-rw-r--r-- | crc32loop.c | 62 | ||||
-rw-r--r-- | dblblake2s.c | 10 | ||||
-rw-r--r-- | k12f1600.c | 54 | ||||
-rw-r--r-- | k12f800.c | 54 | ||||
-rw-r--r-- | main.c | 115 | ||||
-rw-r--r-- | original.c | 65 | ||||
-rwxr-xr-x | run.sh | 35 | ||||
-rw-r--r-- | tinylfsr.c | 38 | ||||
-rw-r--r-- | widelfsr.c | 30 |
12 files changed, 517 insertions, 0 deletions
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7df6fb2 --- /dev/null +++ b/Makefile @@ -0,0 +1,19 @@ +ifneq ($(KERNELRELEASE),) +kbench9000-y := main.o original.o crc32loop.o tinylfsr.o widelfsr.o blake2s.o dblblake2s.o k12f800.o k12f1600.o +obj-m := kbench9000.o +ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt' +else +KERNELDIR ?= /lib/modules/$(shell uname -r)/build +PWD := $(shell pwd) + +default: build + +run: build + sudo ./run.sh +build: + $(MAKE) -C $(KERNELDIR) M=$(PWD) +clean: + $(MAKE) -C $(KERNELDIR) M=$(PWD) clean +.PHONY: default run build clean +endif + diff --git a/README.md b/README.md new file mode 100644 index 0000000..3fd3180 --- /dev/null +++ b/README.md @@ -0,0 +1,26 @@ +# kBench9000 – simple kernel land cycle counter +### by [Jason A. Donenfeld](mailto:jason@zx2c4.com) + +This is a very simple kernel land cycle counter. To use, simply edit `function.h`, +add any other `.c` files and mention them in the `kbench9000-y +=` line of the +`Makefile`, and then type: + +``` +$ make run +``` + + + +### Kernel Toolchain + +You'll need to have a working kernel toolchain, usually achievable by: + +``` +$ sudo apt install linux-headers-$(uname -r) build-essential +``` + +or + +``` +$ sudo dnf install kernel-devel @development-tools +``` diff --git a/blake2s.c b/blake2s.c new file mode 100644 index 0000000..6d5500a --- /dev/null +++ b/blake2s.c @@ -0,0 +1,9 @@ +#include <linux/kernel.h> +#include <crypto/blake2s.h> + +static struct blake2s_state hash = { .outlen = 32 /* TODO: other constants */ }; + +void mix_blake2s(const void *in, int nbytes) +{ + blake2s_update(&hash, in, nbytes); +} diff --git a/crc32loop.c b/crc32loop.c new file mode 100644 index 0000000..371bc96 --- /dev/null +++ b/crc32loop.c @@ -0,0 +1,62 @@ +#include <linux/kernel.h> + +enum poolinfo { + POOL_WORDS = 128, + POOL_WORDMASK = POOL_WORDS - 1, + POOL_BITS = POOL_WORDS * sizeof(u32) * 8, + + /* x^128 + x^104 + x^76 + x^51 +x^25 + x + 1 */ + POOL_TAP1 = 104, + POOL_TAP2 = 76, + POOL_TAP3 = 51, + POOL_TAP4 = 25, + POOL_TAP5 = 1, +}; + +static u32 input_pool_data[POOL_WORDS]; + +static struct { + u16 add_ptr; + u16 input_rotate; +} input_pool = { 0 }; + +void mix_crc32loop(const void *in, int nbytes) +{ + unsigned long i, j; + int input_rotate; + const u8 *bytes = in; + u32 w; + + input_rotate = input_pool.input_rotate; + i = input_pool.add_ptr; + + /* mix one byte at a time to simplify size handling and churn faster */ + while (nbytes--) { + w = rol32(*bytes++, input_rotate); + i = (i - 1) & POOL_WORDMASK; + + /* XOR in the various taps */ + w ^= input_pool_data[i]; + w ^= input_pool_data[(i + POOL_TAP1) & POOL_WORDMASK]; + w ^= input_pool_data[(i + POOL_TAP2) & POOL_WORDMASK]; + w ^= input_pool_data[(i + POOL_TAP3) & POOL_WORDMASK]; + w ^= input_pool_data[(i + POOL_TAP4) & POOL_WORDMASK]; + w ^= input_pool_data[(i + POOL_TAP5) & POOL_WORDMASK]; + + /* Mix the result back in with a twist */ + for (j = 0; j < 3; ++j) + w = (w >> 1) ^ (0xedb88320 & -(w & 1)); + input_pool_data[i] = w; + + /* + * Normally, we add 7 bits of rotation to the pool. + * At the beginning of the pool, add an extra 7 bits + * rotation, so that successive passes spread the + * input bits across the pool evenly. + */ + input_rotate = (input_rotate + (i ? 7 : 14)) & 31; + } + + input_pool.input_rotate = input_rotate; + input_pool.add_ptr = i; +} diff --git a/dblblake2s.c b/dblblake2s.c new file mode 100644 index 0000000..54f5515 --- /dev/null +++ b/dblblake2s.c @@ -0,0 +1,10 @@ +#include <linux/kernel.h> +#include <crypto/blake2s.h> + +static struct blake2s_state hash[2] = { {.outlen = 32 /* TODO: other constants */ }, {.outlen = 32 /* TODO: other constants */ } }; + +void mix_dblblake2s(const void *in, int nbytes) +{ + blake2s_update(&hash[0], in, nbytes); + blake2s_update(&hash[1], in, nbytes); +} diff --git a/k12f1600.c b/k12f1600.c new file mode 100644 index 0000000..26d974b --- /dev/null +++ b/k12f1600.c @@ -0,0 +1,54 @@ +#include <linux/kernel.h> +#include <crypto/algapi.h> + +enum poolinfo { + POOL_BITS = 512, + POOL_BYTES = POOL_BITS / 8 +}; + +static const u64 keccak_round_constants[12] = { + 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, 0x8000000080008000ULL, + 0x000000000000808bULL, 0x0000000080000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL, + 0x000000000000008aULL, 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL +}; + +static const int keccak_rotations[5][5] = { + { 0, 1, 62, 28, 27 }, { 36, 44, 6, 55, 20 }, { 3, 10, 43, 25, 39 }, + { 41, 45, 15, 21, 8 }, { 18, 2, 61, 56, 14 } +}; + +static void keccak(u64 A[5][5]) +{ + u64 B[5][5], C[5]; + int r, x, y; + +#define N5(N, expr) N = 0, expr; N = 1, expr; N = 2, expr; N = 3, expr; N = 4, expr; + + for (r = 0; r < 12; ++r) { + N5(x, C[x] = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x]); + N5(x, N5(y, A[y][x] ^= C[(x + 4) % 5] ^ rol64(C[(x + 1) % 5], 1))); + N5(x, N5(y, B[(2 * x + 3 * y) % 5][y] = rol64(A[y][x], keccak_rotations[y][x]))); + N5(x, N5(y, A[y][x] = B[y][x] ^ (~B[y][(x + 1) % 5] & B[y][(x + 2) % 5]))); + A[0][0] ^= keccak_round_constants[r]; + } +} + +static u64 state[5][5]; +static int idx; + +void mix_k12f1600(const void *in, int nbytes) +{ + const u8 *src = in; + u8 *dst = (u8 *)state; + + while (nbytes) { + size_t l = min_t(size_t, nbytes, sizeof(state) - POOL_BYTES - idx); + if (!l) { + keccak(state); + idx = 0; + continue; + } + crypto_xor(&dst[idx], src, l); + nbytes -= l, src += l, idx += l; + } +} diff --git a/k12f800.c b/k12f800.c new file mode 100644 index 0000000..67cc2b3 --- /dev/null +++ b/k12f800.c @@ -0,0 +1,54 @@ +#include <linux/kernel.h> +#include <crypto/algapi.h> + +enum poolinfo { + POOL_BITS = 512, + POOL_BYTES = POOL_BITS / 8 +}; + +static const u32 keccak_round_constants[12] = { + 0x00000001, 0x00008082, 0x0000808a, 0x80008000, + 0x0000808b, 0x80000001, 0x80008081, 0x00008009, + 0x0000008a, 0x00000088, 0x80008009, 0x8000000a +}; + +static const int keccak_rotations[5][5] = { + { 0, 1, 62, 28, 27 }, { 36, 44, 6, 55, 20 }, { 3, 10, 43, 25, 39 }, + { 41, 45, 15, 21, 8 }, { 18, 2, 61, 56, 14 } +}; + +static void keccak(u32 A[5][5]) +{ + u32 B[5][5], C[5]; + int r, x, y; + +#define N5(N, expr) N = 0, expr; N = 1, expr; N = 2, expr; N = 3, expr; N = 4, expr; + + for (r = 0; r < 12; ++r) { + N5(x, C[x] = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x]); + N5(x, N5(y, A[y][x] ^= C[(x + 4) % 5] ^ rol64(C[(x + 1) % 5], 1))); + N5(x, N5(y, B[(2 * x + 3 * y) % 5][y] = rol64(A[y][x], keccak_rotations[y][x]))); + N5(x, N5(y, A[y][x] = B[y][x] ^ (~B[y][(x + 1) % 5] & B[y][(x + 2) % 5]))); + A[0][0] ^= keccak_round_constants[r]; + } +} + +static u32 state[5][5]; +static int idx; + +void mix_k12f800(const void *in, int nbytes) +{ + const u8 *src = in; + u8 *dst = (u8 *)state; + + while (nbytes) { + size_t l = min_t(size_t, nbytes, sizeof(state) - POOL_BYTES - idx); + if (!l) { + keccak(state); + idx = 0; + continue; + } + crypto_xor(&dst[idx], src, l); + nbytes -= l, src += l, idx += l; + } +} @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/sort.h> +#include <linux/random.h> +#include <asm/cpufeature.h> +#include <asm/processor.h> +#include <asm/fpu/api.h> +#include <asm/simd.h> + +static unsigned long stamp = 0; +module_param(stamp, ulong, 0); + +#define declare_it(name) bool mix_ ##name(void *input, size_t len); + +#define do_it(name, input, len) do { \ + u32 eax = 0, ebx = 0, ecx = 0, edx = 0; \ + for (i = 0; i < WARMUP; ++i) \ + ret |= mix_ ##name(input, len); \ + asm volatile("cpuid" : "+a" (eax), "=b" (ebx), "=d" (edx), "+c" (ecx)); \ + for (i = 0; i <= TRIALS; ++i) { \ + trial_times[i] = get_cycles(); \ + ret |= mix_ ##name(input, len); \ + } \ + for (i = 0; i < TRIALS; ++i) \ + trial_times[i] = trial_times[i + 1] - trial_times[i]; \ + sort(trial_times, TRIALS + 1, sizeof(cycles_t), compare_cycles, NULL); \ + median_ ## name = trial_times[TRIALS / 2]; \ +} while (0) + +#define report_it(name) do { \ + pr_err("%lu: %12s: %6llu cycles per call\n", stamp, #name, median_ ## name); \ +} while (0) + + +declare_it(original) +declare_it(crc32loop) +declare_it(tinylfsr) +declare_it(widelfsr) +declare_it(blake2s) +declare_it(dblblake2s) +declare_it(k12f800) +declare_it(k12f1600) + +static int compare_cycles(const void *a, const void *b) +{ + return *((cycles_t *)a) - *((cycles_t *)b); +} + +static int __init mod_init(void) +{ + enum { WARMUP = 6000, TRIALS = 10000, IDLE = 1 * 1000, LEN = 4096 }; + static u8 buffer[LEN]; + int ret = 0, i; + cycles_t *trial_times; + cycles_t median_original = 0; + cycles_t median_crc32loop = 0; + cycles_t median_tinylfsr = 0; + cycles_t median_widelfsr = 0; + cycles_t median_blake2s = 0; + cycles_t median_dblblake2s = 0; + cycles_t median_k12f800 = 0; + cycles_t median_k12f1600 = 0; + unsigned long flags; + DEFINE_SPINLOCK(lock); + + get_random_bytes(buffer, sizeof(buffer)); + + trial_times = kcalloc(TRIALS + 1, sizeof(cycles_t), GFP_KERNEL); + if (!trial_times) + return -ENOMEM; + + msleep(IDLE); + + spin_lock_irqsave(&lock, flags); + + do_it(original, buffer, sizeof(buffer)); + do_it(crc32loop, buffer, sizeof(buffer)); + do_it(tinylfsr, buffer, sizeof(buffer)); + do_it(widelfsr, buffer, sizeof(buffer)); + do_it(blake2s, buffer, sizeof(buffer)); + do_it(dblblake2s, buffer, sizeof(buffer)); + do_it(k12f800, buffer, sizeof(buffer)); + do_it(k12f1600, buffer, sizeof(buffer)); + + spin_unlock_irqrestore(&lock, flags); + + report_it(original); + report_it(crc32loop); + report_it(tinylfsr); + report_it(widelfsr); + report_it(blake2s); + report_it(dblblake2s); + report_it(k12f800); + report_it(k12f1600); + + kfree(trial_times); + + /* We should never actually agree to insert the module. Choosing + * -0x1000 here is an amazing hack. It causes the kernel to not + * actually load the module, while the standard userspace tools + * don't return an error, because it's too big. */ + return -0x1000; +} + +module_init(mod_init); +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("kBench9000 Cycle Counter"); +MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); diff --git a/original.c b/original.c new file mode 100644 index 0000000..32be348 --- /dev/null +++ b/original.c @@ -0,0 +1,65 @@ +#include <linux/kernel.h> + +enum poolinfo { + POOL_WORDS = 128, + POOL_WORDMASK = POOL_WORDS - 1, + POOL_BITS = POOL_WORDS * sizeof(u32) * 8, + + /* x^128 + x^104 + x^76 + x^51 +x^25 + x + 1 */ + POOL_TAP1 = 104, + POOL_TAP2 = 76, + POOL_TAP3 = 51, + POOL_TAP4 = 25, + POOL_TAP5 = 1, +}; + +static u32 input_pool_data[POOL_WORDS]; + +static struct { + u16 add_ptr; + u16 input_rotate; +} input_pool = { 0 }; + +static const u32 twist_table[8] = { + 0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158, + 0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 +}; + +void mix_original(const void *in, int nbytes) +{ + unsigned long i; + int input_rotate; + const u8 *bytes = in; + u32 w; + + input_rotate = input_pool.input_rotate; + i = input_pool.add_ptr; + + /* mix one byte at a time to simplify size handling and churn faster */ + while (nbytes--) { + w = rol32(*bytes++, input_rotate); + i = (i - 1) & POOL_WORDMASK; + + /* XOR in the various taps */ + w ^= input_pool_data[i]; + w ^= input_pool_data[(i + POOL_TAP1) & POOL_WORDMASK]; + w ^= input_pool_data[(i + POOL_TAP2) & POOL_WORDMASK]; + w ^= input_pool_data[(i + POOL_TAP3) & POOL_WORDMASK]; + w ^= input_pool_data[(i + POOL_TAP4) & POOL_WORDMASK]; + w ^= input_pool_data[(i + POOL_TAP5) & POOL_WORDMASK]; + + /* Mix the result back in with a twist */ + input_pool_data[i] = (w >> 3) ^ twist_table[w & 7]; + + /* + * Normally, we add 7 bits of rotation to the pool. + * At the beginning of the pool, add an extra 7 bits + * rotation, so that successive passes spread the + * input bits across the pool evenly. + */ + input_rotate = (input_rotate + (i ? 7 : 14)) & 31; + } + + input_pool.input_rotate = input_rotate; + input_pool.add_ptr = i; +} @@ -0,0 +1,35 @@ +#!/bin/bash +set -e + +nob_cpus() { + echo "[+] Setting non-boot CPUs to status $1" + for i in /sys/devices/system/cpu/*/online; do + echo "$1" > "$i" + done +} + +noturbo() { + echo "[+] Setting no-turbo to status $1" + if [[ -e /sys/devices/system/cpu/intel_pstate/no_turbo ]]; then + echo "$1" > /sys/devices/system/cpu/intel_pstate/no_turbo + else + local val + [[ $1 == 0 ]] && val=0x850089 + [[ $1 == 1 ]] && val=0x4000850089 + [[ -n $val ]] || return 0 + wrmsr -a 0x1a0 $val + fi +} + +[[ -e kbench9000.ko ]] + +trap "nob_cpus 1; noturbo 0;" INT TERM EXIT +noturbo 1 +nob_cpus 0 + +echo "[+] Inserting module to run tests" +stamp="$(date +%s)" +insmod kbench9000.ko stamp="$stamp" + +echo "[+] Gathering results" +dmesg | sed -n "s/.*kbench9000: $stamp: \\(.*\\)/\\x1b[37m\\x1b[44m\\x1b[1m\\1\\x1b[0m/p" diff --git a/tinylfsr.c b/tinylfsr.c new file mode 100644 index 0000000..fcea1a0 --- /dev/null +++ b/tinylfsr.c @@ -0,0 +1,38 @@ +#include <linux/kernel.h> + +enum poolinfo { + POOL_WORDS = 128, + POOL_WORDMASK = POOL_WORDS - 1, + POOL_BITS = POOL_WORDS * sizeof(u32) * 8, +}; + +static u32 input_pool_data[POOL_WORDS]; + +static struct { + u16 add_ptr; +} input_pool = { 0 }; + +void mix_tinylfsr(const void *in, int nbytes) +{ + unsigned long i = input_pool.add_ptr, next; + const u8 *src = in; + + /* Mix one byte at a time to simplify size handling and churn faster. */ + while (nbytes--) { + /* The simplest possible max period LFSR. + * This is morally equivalent to evaluating + * the input as a polynomial at x^32. + * + * It can be evaluated in parallel, since i+0 + * only depends on i+0 and i+1. Thus i+0 depends + * on i+0,i+1, i+1 depends on i+1 and i+2, + * so we can absorb up to 126 bytes in parallel + * if necessary. + */ + next = (i + 1) & POOL_WORDMASK; + input_pool_data[i] = *src++ ^ rol32(input_pool_data[i], 9) ^ + (input_pool_data[next] << 15) ^ (input_pool_data[next] >> 24); + i = next; + } + input_pool.add_ptr = i; +} diff --git a/widelfsr.c b/widelfsr.c new file mode 100644 index 0000000..088ec7e --- /dev/null +++ b/widelfsr.c @@ -0,0 +1,30 @@ +#include <linux/kernel.h> + +enum poolinfo { + POOL_WORDS = 128, + POOL_WORDMASK = POOL_WORDS - 1, + POOL_BITS = POOL_WORDS * sizeof(u32) * 8, +}; + +static u32 input_pool_data[POOL_WORDS]; + +static struct { + u16 add_ptr; +} input_pool = { 0 }; + +void mix_widelfsr(const void *in, int nbytes) +{ + unsigned long i = input_pool.add_ptr; + const u8 *src = in; + + /* Mix one byte at a time to simplify size handling and churn faster. */ + while (nbytes--) { + input_pool_data[(i+0)&127] = rol32(input_pool_data[(i+0)&127], 4) ^ + (input_pool_data[(i+126)&127] << 7) ^ + rol32(input_pool_data[(i+127)&127],12) ^ + rol32(input_pool_data[(i+127)&127],21) ^ + *src++; + i = (i+1)&127; + } + input_pool.add_ptr = i; +} |