author:    2019-11-26 15:27:08 +0000
committer: 2019-11-26 15:27:08 +0000
commit:    4b4793304325522e159494e65cc0b3cd5a4dfa75 (patch)
tree:      df2c0c77813c229cb8d5362293895047d491a9b1 /sys/kern/kern_timeout.c
parent:    Revert previous "honour DEBUG", otherwise clang uses -g resulting in huge (diff)
timeout(9): switch to tickless backend
Rebase the timeout wheel on the system uptime clock. Timeouts are now
set to run at or after an absolute time as returned by nanouptime(9).
Timeouts are thus "tickless": they expire at a real time on that clock
instead of at a particular value of the global "ticks" variable.
To facilitate this change, the timeout struct's .to_time member becomes a
timespec. Hashing timeouts into a bucket on the wheel changes slightly:
we build a 32-bit hash with 25 bits of seconds (.tv_sec) and 7 bits of
subseconds (.tv_nsec). 7 bits of subseconds means the width of the
lowest wheel level is now 2 seconds on all platforms and each bucket in
that lowest level corresponds to 1/128 seconds on the uptime clock.
These values were chosen to closely align with the current 100hz
hardclock(9) typical on almost all of our platforms. At 100hz a bucket
is currently ~1/100 seconds wide on the lowest level and the lowest
level itself is ~2.56 seconds wide. Not a huge change, but a change
nonetheless.
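
The hash itself is tiny. Condensed from timeout_maskwheel() in the diff
below; 1/128 of a second is 7812500 nanoseconds, so dividing .tv_nsec by
7812500 yields the 7 subsecond bits ("index" here is a stand-in for the
function's return value):

    uint32_t hi, lo;

    hi = abstime->tv_sec << 7;          /* low 25 bits of seconds */
    lo = abstime->tv_nsec / 7812500;    /* 7 bits of subseconds */
    index = ((hi | lo) >> (level * WHEELBITS)) & WHEELMASK;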
Because a bucket no longer corresponds to a single tick, more than one
bucket may be dumped during an average timeout_hardclock_update() call.
On 100hz platforms you now dump ~2 buckets. On 64hz machines (sh) you
dump ~4 buckets. On 1024hz machines (alpha) you dump only 1 bucket,
but you are doing extra work in softclock() to reschedule timeouts
that aren't due yet.
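
The dump itself is an inclusive scan per level, from the bucket of the
previous update to the bucket of the current one. Condensed from
timeout_hardclock_update() in the diff below; the full version also
handles the case where an entire level's width has elapsed:

    first = timeout_maskwheel(level, &timeout_lastscan);
    last = timeout_maskwheel(level, &now);
    offset = level * WHEELSIZE;
    for (b = first;; b = (b + 1) % WHEELSIZE) {
        /* Move every expired bucket onto the todo list. */
        CIRCQ_APPEND(&timeout_todo, &timeout_wheel[offset + b]);
        if (b == last)
            break;
    }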
To avoid changing current behavior, all timeout_add*(9) interfaces
convert their timeout interval into ticks, compute an equivalent
timespec interval, and then add that interval to the timestamp of
the most recent timeout_hardclock_update() call to determine an
absolute deadline. So all current timeouts still "use" ticks,
but the ticks are faked in the timeout layer.
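
Condensed from the new timeout_add() in the diff below, the faking looks
like this; NSEC_TO_TIMESPEC and timespecadd are existing kernel macros:

    struct timespec ts, when;

    KASSERT(to_ticks >= 0);
    NSEC_TO_TIMESPEC((uint64_t)to_ticks * tick_nsec, &ts);

    mtx_enter(&timeout_mutex);
    /* Deadline = interval + uptime at the most recent wheel scan. */
    timespecadd(&timeout_lastscan, &ts, &when);
    ret = timeout_at_ts_locked(new, CLOCK_BOOTTIME, &when);
    mtx_leave(&timeout_mutex);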
A new interface, timeout_at_ts(9), is introduced here to bypass this
backward-compatible behavior. It will be used in subsequent diffs
to add absolute timeout support for userland and to clean up some of
the messier parts of kernel timekeeping, especially at the syscall
layer.
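
A hypothetical caller might look like the sketch below. example_fn and
the five second offset are made up for illustration; per
timeout_clock_is_valid() in the diff below, only CLOCK_BOOTTIME and
CLOCK_MONOTONIC are accepted for now:

    struct timeout to;
    struct timespec now, when;

    timeout_set(&to, example_fn, NULL);    /* example_fn is hypothetical */
    nanouptime(&now);
    when.tv_sec = now.tv_sec + 5;          /* fire ~5 seconds from now */
    when.tv_nsec = now.tv_nsec;
    timeout_at_ts(&to, CLOCK_BOOTTIME, &when);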
Because timeouts are set against the uptime clock, they are subject to
NTP adjustment via adjtime(2) and adjfreq(2). Unless you have a crazy
adjfreq(2) adjustment set, this will not change the expiration behavior
of your timeouts.
Tons of design feedback from mpi@, visa@, guenther@, and kettenis@.
Additional amd64 testing from anton@ and visa@. Octeon testing from visa@.
macppc testing from me.
Positive feedback from deraadt@, ok visa@
Diffstat (limited to 'sys/kern/kern_timeout.c')
-rw-r--r--    sys/kern/kern_timeout.c | 294
1 file changed, 182 insertions(+), 112 deletions(-)
diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c
index 5ba025e1ce7..01d3fc5e2ef 100644
--- a/sys/kern/kern_timeout.c
+++ b/sys/kern/kern_timeout.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: kern_timeout.c,v 1.62 2019/11/07 14:05:12 mpi Exp $ */
+/* $OpenBSD: kern_timeout.c,v 1.63 2019/11/26 15:27:08 cheloha Exp $ */
 /*
  * Copyright (c) 2001 Thomas Nordin <nordin@openbsd.org>
  * Copyright (c) 2000-2001 Artur Grabowski <art@openbsd.org>
@@ -55,33 +55,23 @@ struct timeoutstat tostat;    /* [t] statistics and totals */
 
 /*
  * Timeouts are kept in a hierarchical timing wheel. The to_time is the value
- * of the global variable "ticks" when the timeout should be called. There are
+ * of the system uptime clock when the timeout should be called. There are
  * four levels with 256 buckets each.
  */
-#define BUCKETS 1024
+#define WHEELCOUNT 4
 #define WHEELSIZE 256
 #define WHEELMASK 255
 #define WHEELBITS 8
+#define BUCKETS (WHEELCOUNT * WHEELSIZE)
 
 struct circq timeout_wheel[BUCKETS];    /* [t] Queues of timeouts */
 struct circq timeout_todo;              /* [t] Due or needs scheduling */
 struct circq timeout_proc;              /* [t] Due + needs process context */
 
-#define MASKWHEEL(wheel, time) (((time) >> ((wheel)*WHEELBITS)) & WHEELMASK)
-
-#define BUCKET(rel, abs) \
-    (timeout_wheel[ \
-        ((rel) <= (1 << (2*WHEELBITS))) \
-            ? ((rel) <= (1 << WHEELBITS)) \
-                ? MASKWHEEL(0, (abs)) \
-                : MASKWHEEL(1, (abs)) + WHEELSIZE \
-            : ((rel) <= (1 << (3*WHEELBITS))) \
-                ? MASKWHEEL(2, (abs)) + 2*WHEELSIZE \
-                : MASKWHEEL(3, (abs)) + 3*WHEELSIZE])
-
-#define MOVEBUCKET(wheel, time) \
-    CIRCQ_APPEND(&timeout_todo, \
-        &timeout_wheel[MASKWHEEL((wheel), (time)) + (wheel)*WHEELSIZE])
+time_t timeout_level_width[WHEELCOUNT]; /* [I] Wheel level width (seconds) */
+struct timespec tick_ts;                /* [I] Length of a tick (1/hz secs) */
+struct timespec timeout_lastscan;       /* [t] Uptime at last wheel scan */
+struct timespec timeout_late;           /* [t] Late if due prior to this */
 
 /*
  * Circular queue definitions.
@@ -146,6 +136,10 @@ struct lock_type timeout_spinlock_type = {
 void softclock(void *);
 void softclock_create_thread(void *);
 void softclock_thread(void *);
+int timeout_at_ts_locked(struct timeout *, clockid_t, const struct timespec *);
+unsigned int timeout_bucket(const struct timespec *, const struct timespec *);
+int timeout_clock_is_valid(clockid_t);
+unsigned int timeout_maskwheel(unsigned int, const struct timespec *);
 void timeout_proc_barrier(void *);
 
 /*
@@ -178,29 +172,23 @@ timeout_sync_leave(int needsproc)
     WITNESS_UNLOCK(TIMEOUT_LOCK_OBJ(needsproc), 0);
 }
 
-/*
- * Some of the "math" in here is a bit tricky.
- *
- * We have to beware of wrapping ints.
- * We use the fact that any element added to the queue must be added with a
- * positive time. That means that any element `to' on the queue cannot be
- * scheduled to timeout further in time than INT_MAX, but to->to_time can
- * be positive or negative so comparing it with anything is dangerous.
- * The only way we can use the to->to_time value in any predictable way
- * is when we calculate how far in the future `to' will timeout -
- * "to->to_time - ticks". The result will always be positive for future
- * timeouts and 0 or negative for due timeouts.
- */
-
 void
 timeout_startup(void)
 {
-    int b;
+    unsigned int b, level;
 
     CIRCQ_INIT(&timeout_todo);
     CIRCQ_INIT(&timeout_proc);
     for (b = 0; b < nitems(timeout_wheel); b++)
         CIRCQ_INIT(&timeout_wheel[b]);
+
+    for (level = 0; level < nitems(timeout_level_width); level++)
+        timeout_level_width[level] = 2 << (level * WHEELBITS);
+
+    tick_ts.tv_sec = 0;
+    tick_ts.tv_nsec = tick_nsec;
+    timespecclear(&timeout_lastscan);
+    timespecclear(&timeout_late);
 }
 
 void
@@ -222,6 +210,7 @@ timeout_set(struct timeout *new, void (*fn)(void *), void *arg)
     new->to_func = fn;
     new->to_arg = arg;
     new->to_flags = TIMEOUT_INITIALIZED;
+    timespecclear(&new->to_time);
 }
 
 void
@@ -234,36 +223,15 @@ timeout_set_proc(struct timeout *new, void (*fn)(void *), void *arg)
 int
 timeout_add(struct timeout *new, int to_ticks)
 {
-    int old_time;
-    int ret = 1;
+    struct timespec ts, when;
+    int ret;
 
-    KASSERT(ISSET(new->to_flags, TIMEOUT_INITIALIZED));
     KASSERT(to_ticks >= 0);
+    NSEC_TO_TIMESPEC((uint64_t)to_ticks * tick_nsec, &ts);
 
     mtx_enter(&timeout_mutex);
-
-    /* Initialize the time here, it won't change. */
-    old_time = new->to_time;
-    new->to_time = to_ticks + ticks;
-    CLR(new->to_flags, TIMEOUT_TRIGGERED);
-
-    /*
-     * If this timeout already is scheduled and now is moved
-     * earlier, reschedule it now. Otherwise leave it in place
-     * and let it be rescheduled later.
-     */
-    if (ISSET(new->to_flags, TIMEOUT_ONQUEUE)) {
-        if (new->to_time - ticks < old_time - ticks) {
-            CIRCQ_REMOVE(&new->to_list);
-            CIRCQ_INSERT(&new->to_list, &timeout_todo);
-        }
-        tostat.tos_readded++;
-        ret = 0;
-    } else {
-        SET(new->to_flags, TIMEOUT_ONQUEUE);
-        CIRCQ_INSERT(&new->to_list, &timeout_todo);
-    }
-    tostat.tos_added++;
+    timespecadd(&timeout_lastscan, &ts, &when);
+    ret = timeout_at_ts_locked(new, CLOCK_BOOTTIME, &when);
     mtx_leave(&timeout_mutex);
 
     return ret;
@@ -361,6 +329,49 @@ timeout_add_nsec(struct timeout *to, int nsecs)
 }
 
 int
+timeout_at_ts(struct timeout *to, clockid_t clock, const struct timespec *ts)
+{
+    int ret;
+
+    mtx_enter(&timeout_mutex);
+    ret = timeout_at_ts_locked(to, clock, ts);
+    mtx_leave(&timeout_mutex);
+
+    return ret;
+}
+
+int
+timeout_at_ts_locked(struct timeout *to, clockid_t clock,
+    const struct timespec *when)
+{
+    struct timespec old_time;
+    int ret = 1;
+
+    MUTEX_ASSERT_LOCKED(&timeout_mutex);
+    KASSERT(ISSET(to->to_flags, TIMEOUT_INITIALIZED));
+    KASSERT(timeout_clock_is_valid(clock));
+
+    old_time = to->to_time;
+    to->to_time = *when;
+    CLR(to->to_flags, TIMEOUT_TRIGGERED);
+
+    if (ISSET(to->to_flags, TIMEOUT_ONQUEUE)) {
+        if (timespeccmp(&to->to_time, &old_time, <)) {
+            CIRCQ_REMOVE(&to->to_list);
+            CIRCQ_INSERT(&to->to_list, &timeout_todo);
+        }
+        tostat.tos_readded++;
+        ret = 0;
+    } else {
+        SET(to->to_flags, TIMEOUT_ONQUEUE);
+        CIRCQ_INSERT(&to->to_list, &timeout_todo);
+    }
+    tostat.tos_added++;
+
+    return ret;
+}
+
+int
 timeout_del(struct timeout *to)
 {
     int ret = 0;
@@ -429,6 +440,46 @@ timeout_proc_barrier(void *arg)
     cond_signal(c);
 }
 
+unsigned int
+timeout_maskwheel(unsigned int level, const struct timespec *abstime)
+{
+    uint32_t hi, lo;
+
+    hi = abstime->tv_sec << 7;
+    lo = abstime->tv_nsec / 7812500;
+
+    return ((hi | lo) >> (level * WHEELBITS)) & WHEELMASK;
+}
+
+unsigned int
+timeout_bucket(const struct timespec *now, const struct timespec *later)
+{
+    struct timespec diff;
+    unsigned int level;
+
+    KASSERT(timespeccmp(now, later, <));
+
+    timespecsub(later, now, &diff);
+    for (level = 0; level < nitems(timeout_level_width) - 1; level++) {
+        if (diff.tv_sec < timeout_level_width[level])
+            break;
+    }
+    return level * WHEELSIZE + timeout_maskwheel(level, later);
+}
+
+int
+timeout_clock_is_valid(clockid_t clock)
+{
+    switch (clock) {
+    case CLOCK_BOOTTIME:
+    case CLOCK_MONOTONIC:
+        return 1;
+    default:
+        break;
+    }
+    return 0;
+}
+
 /*
  * This is called from hardclock() on the primary CPU at the start of
  * every tick.
@@ -436,19 +487,47 @@
  */
 void
 timeout_hardclock_update(void)
 {
-    int need_softclock;
+    struct timespec elapsed, now;
+    unsigned int b, done, first, last, level, need_softclock, offset;
+
+    nanouptime(&now);
 
     mtx_enter(&timeout_mutex);
-    MOVEBUCKET(0, ticks);
-    if (MASKWHEEL(0, ticks) == 0) {
-        MOVEBUCKET(1, ticks);
-        if (MASKWHEEL(1, ticks) == 0) {
-            MOVEBUCKET(2, ticks);
-            if (MASKWHEEL(2, ticks) == 0)
-                MOVEBUCKET(3, ticks);
+    /*
+     * Dump the buckets that expired while we were away.
+     *
+     * If the elapsed time has exceeded a level's width then we need
+     * to dump every bucket in the level. We have necessarily completed
+     * a lap of that level so we need to process buckets in the next.
+     *
+     * Otherwise we just need to compare indices. If the index of the
+     * first expired bucket is greater than or equal to that of the last
+     * then we have completed a lap of the level and need to process
+     * buckets in the next.
+     */
+    timespecsub(&now, &timeout_lastscan, &elapsed);
+    for (level = 0; level < nitems(timeout_level_width); level++) {
+        first = timeout_maskwheel(level, &timeout_lastscan);
+        if (elapsed.tv_sec >= timeout_level_width[level]) {
+            last = (first == 0) ? WHEELSIZE - 1 : first - 1;
+            done = 0;
+        } else {
+            last = timeout_maskwheel(level, &now);
+            done = first <= last;
+        }
+        offset = level * WHEELSIZE;
+        for (b = first;; b = (b + 1) % WHEELSIZE) {
+            CIRCQ_APPEND(&timeout_todo, &timeout_wheel[offset + b]);
+            if (b == last)
+                break;
        }
+        if (done)
+            break;
    }
+
+    timespecsub(&now, &tick_ts, &timeout_late);
+    timeout_lastscan = now;
 
     need_softclock = !CIRCQ_EMPTY(&timeout_todo);
     mtx_leave(&timeout_mutex);
@@ -489,10 +568,8 @@ timeout_run(struct timeout *to)
 void
 softclock(void *arg)
 {
-    int delta;
-    struct circq *bucket;
     struct timeout *to;
-    int needsproc = 0;
+    unsigned int b, needsproc = 0;
 
     mtx_enter(&timeout_mutex);
     while (!CIRCQ_EMPTY(&timeout_todo)) {
@@ -503,14 +580,13 @@
         * If due run it or defer execution to the thread,
         * otherwise insert it into the right bucket.
         */
-        delta = to->to_time - ticks;
-        if (delta > 0) {
-            bucket = &BUCKET(delta, to->to_time);
-            CIRCQ_INSERT(&to->to_list, bucket);
+        if (timespeccmp(&timeout_lastscan, &to->to_time, <)) {
+            b = timeout_bucket(&timeout_lastscan, &to->to_time);
+            CIRCQ_INSERT(&to->to_list, &timeout_wheel[b]);
             tostat.tos_rescheduled++;
             continue;
         }
-        if (delta < 0)
+        if (timespeccmp(&to->to_time, &timeout_late, <))
             tostat.tos_late++;
         if (ISSET(to->to_flags, TIMEOUT_NEEDPROCCTX)) {
             CIRCQ_INSERT(&to->to_list, &timeout_proc);
@@ -568,38 +644,6 @@ softclock_thread(void *arg)
     }
 }
 
-#ifndef SMALL_KERNEL
-void
-timeout_adjust_ticks(int adj)
-{
-    struct timeout *to;
-    struct circq *p;
-    int new_ticks, b;
-
-    /* adjusting the monotonic clock backwards would be a Bad Thing */
-    if (adj <= 0)
-        return;
-
-    mtx_enter(&timeout_mutex);
-    new_ticks = ticks + adj;
-    for (b = 0; b < nitems(timeout_wheel); b++) {
-        p = CIRCQ_FIRST(&timeout_wheel[b]);
-        while (p != &timeout_wheel[b]) {
-            to = timeout_from_circq(p);
-            p = CIRCQ_FIRST(p);
-
-            /* when moving a timeout forward need to reinsert it */
-            if (to->to_time - ticks < adj)
-                to->to_time = new_ticks;
-            CIRCQ_REMOVE(&to->to_list);
-            CIRCQ_INSERT(&to->to_list, &timeout_todo);
-        }
-    }
-    ticks = new_ticks;
-    mtx_leave(&timeout_mutex);
-}
-#endif
-
 int
 timeout_sysctl(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
 {
@@ -614,10 +658,34 @@ timeout_sysctl(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
 
 #ifdef DDB
 void db_show_callout_bucket(struct circq *);
+char *db_ts_str(const struct timespec *);
+
+char *
+db_ts_str(const struct timespec *ts)
+{
+    static char buf[64];
+    struct timespec tmp = *ts;
+    int neg = 0;
+
+    if (tmp.tv_sec < 0) {
+        tmp.tv_sec = -tmp.tv_sec;
+        if (tmp.tv_nsec > 0) {
+            tmp.tv_sec--;
+            tmp.tv_nsec = 1000000000 - tmp.tv_nsec;
+        }
+        neg = 1;
+    }
+
+    snprintf(buf, sizeof(buf), "%s%lld.%09ld",
+        neg ? "-" : "", tmp.tv_sec, tmp.tv_nsec);
+
+    return buf;
+}
 
 void
 db_show_callout_bucket(struct circq *bucket)
 {
+    struct timespec left;
     char buf[8];
     struct timeout *to;
     struct circq *p;
@@ -639,8 +707,9 @@ db_show_callout_bucket(struct circq *bucket)
                (bucket - timeout_wheel) / WHEELSIZE);
            where = buf;
        }
-        db_printf("%9d %7s 0x%0*lx %s\n",
-            to->to_time - ticks, where, width, (ulong)to->to_arg, name);
+        timespecsub(&to->to_time, &timeout_lastscan, &left);
+        db_printf("%18s %7s 0x%0*lx %s\n",
+            db_ts_str(&left), where, width, (ulong)to->to_arg, name);
     }
 }
 
@@ -648,10 +717,11 @@ void
 db_show_callout(db_expr_t addr, int haddr, db_expr_t count, char *modif)
 {
     int width = sizeof(long) * 2 + 2;
-    int b;
+    unsigned int b;
 
-    db_printf("ticks now: %d\n", ticks);
-    db_printf("%9s %7s %*s func\n", "ticks", "wheel", width, "arg");
+    db_printf("%18s seconds up at last scan\n",
+        db_ts_str(&timeout_lastscan));
+    db_printf("%18s %7s %*s func\n", "remaining", "wheel", width, "arg");
 
     db_show_callout_bucket(&timeout_todo);
     db_show_callout_bucket(&timeout_proc);