aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/time/timer.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/time/timer.c')
-rw-r--r--kernel/time/timer.c479
1 files changed, 262 insertions, 217 deletions
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 4820823515e9..717fcb9fb14a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -43,6 +43,8 @@
#include <linux/sched/debug.h>
#include <linux/slab.h>
#include <linux/compat.h>
+#include <linux/random.h>
+#include <linux/sysctl.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -156,7 +158,8 @@ EXPORT_SYMBOL(jiffies_64);
/*
* The time start value for each level to select the bucket at enqueue
- * time.
+ * time. We start from the last possible delta of the previous level
+ * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
*/
#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
@@ -203,8 +206,9 @@ struct timer_base {
unsigned long clk;
unsigned long next_expiry;
unsigned int cpu;
+ bool next_expiry_recalc;
bool is_idle;
- bool must_forward_clk;
+ bool timers_pending;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
} ____cacheline_aligned;
@@ -220,7 +224,7 @@ static void timer_update_keys(struct work_struct *work);
static DECLARE_WORK(timer_update_work, timer_update_keys);
#ifdef CONFIG_SMP
-unsigned int sysctl_timer_migration = 1;
+static unsigned int sysctl_timer_migration = 1;
DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
@@ -231,7 +235,42 @@ static void timers_update_migration(void)
else
static_branch_disable(&timers_migration_enabled);
}
-#else
+
+#ifdef CONFIG_SYSCTL
+static int timer_migration_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ mutex_lock(&timer_keys_mutex);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!ret && write)
+ timers_update_migration();
+ mutex_unlock(&timer_keys_mutex);
+ return ret;
+}
+
+static struct ctl_table timer_sysctl[] = {
+ {
+ .procname = "timer_migration",
+ .data = &sysctl_timer_migration,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = timer_migration_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static int __init timer_sysctl_init(void)
+{
+ register_sysctl("kernel", timer_sysctl);
+ return 0;
+}
+device_initcall(timer_sysctl_init);
+#endif /* CONFIG_SYSCTL */
+#else /* CONFIG_SMP */
static inline void timers_update_migration(void) { }
#endif /* !CONFIG_SMP */
@@ -248,20 +287,6 @@ void timers_update_nohz(void)
schedule_work(&timer_update_work);
}
-int timer_migration_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
-{
- int ret;
-
- mutex_lock(&timer_keys_mutex);
- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!ret && write)
- timers_update_migration();
- mutex_unlock(&timer_keys_mutex);
- return ret;
-}
-
static inline bool is_timers_nohz_active(void)
{
return static_branch_unlikely(&timers_nohz_active);
@@ -488,71 +513,61 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
* Helper function to calculate the array index for a given expiry
* time.
*/
-static inline unsigned calc_index(unsigned expires, unsigned lvl)
+static inline unsigned calc_index(unsigned long expires, unsigned lvl,
+ unsigned long *bucket_expiry)
{
- expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
+
+ /*
+ * The timer wheel has to guarantee that a timer does not fire
+ * early. Early expiry can happen due to:
+ * - Timer is armed at the edge of a tick
+ * - Truncation of the expiry time in the outer wheel levels
+ *
+ * Round up with level granularity to prevent this.
+ */
+ expires = (expires >> LVL_SHIFT(lvl)) + 1;
+ *bucket_expiry = expires << LVL_SHIFT(lvl);
return LVL_OFFS(lvl) + (expires & LVL_MASK);
}
-static int calc_wheel_index(unsigned long expires, unsigned long clk)
+static int calc_wheel_index(unsigned long expires, unsigned long clk,
+ unsigned long *bucket_expiry)
{
unsigned long delta = expires - clk;
unsigned int idx;
if (delta < LVL_START(1)) {
- idx = calc_index(expires, 0);
+ idx = calc_index(expires, 0, bucket_expiry);
} else if (delta < LVL_START(2)) {
- idx = calc_index(expires, 1);
+ idx = calc_index(expires, 1, bucket_expiry);
} else if (delta < LVL_START(3)) {
- idx = calc_index(expires, 2);
+ idx = calc_index(expires, 2, bucket_expiry);
} else if (delta < LVL_START(4)) {
- idx = calc_index(expires, 3);
+ idx = calc_index(expires, 3, bucket_expiry);
} else if (delta < LVL_START(5)) {
- idx = calc_index(expires, 4);
+ idx = calc_index(expires, 4, bucket_expiry);
} else if (delta < LVL_START(6)) {
- idx = calc_index(expires, 5);
+ idx = calc_index(expires, 5, bucket_expiry);
} else if (delta < LVL_START(7)) {
- idx = calc_index(expires, 6);
+ idx = calc_index(expires, 6, bucket_expiry);
} else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
- idx = calc_index(expires, 7);
+ idx = calc_index(expires, 7, bucket_expiry);
} else if ((long) delta < 0) {
idx = clk & LVL_MASK;
+ *bucket_expiry = clk;
} else {
/*
* Force expire obscene large timeouts to expire at the
* capacity limit of the wheel.
*/
- if (expires >= WHEEL_TIMEOUT_CUTOFF)
- expires = WHEEL_TIMEOUT_MAX;
+ if (delta >= WHEEL_TIMEOUT_CUTOFF)
+ expires = clk + WHEEL_TIMEOUT_MAX;
- idx = calc_index(expires, LVL_DEPTH - 1);
+ idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
}
return idx;
}
-/*
- * Enqueue the timer into the hash bucket, mark it pending in
- * the bitmap and store the index in the timer flags.
- */
-static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
- unsigned int idx)
-{
- hlist_add_head(&timer->entry, base->vectors + idx);
- __set_bit(idx, base->pending_map);
- timer_set_idx(timer, idx);
-
- trace_timer_start(timer, timer->expires, timer->flags);
-}
-
-static void
-__internal_add_timer(struct timer_base *base, struct timer_list *timer)
-{
- unsigned int idx;
-
- idx = calc_wheel_index(timer->expires, base->clk);
- enqueue_timer(base, timer, idx);
-}
-
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
@@ -574,35 +589,88 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
* timer is not deferrable. If the other CPU is on the way to idle
* then it can't set base->is_idle as we hold the base lock:
*/
- if (!base->is_idle)
- return;
+ if (base->is_idle)
+ wake_up_nohz_cpu(base->cpu);
+}
- /* Check whether this is the new first expiring timer: */
- if (time_after_eq(timer->expires, base->next_expiry))
- return;
+/*
+ * Enqueue the timer into the hash bucket, mark it pending in
+ * the bitmap, store the index in the timer flags then wake up
+ * the target CPU if needed.
+ */
+static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
+ unsigned int idx, unsigned long bucket_expiry)
+{
+
+ hlist_add_head(&timer->entry, base->vectors + idx);
+ __set_bit(idx, base->pending_map);
+ timer_set_idx(timer, idx);
+
+ trace_timer_start(timer, timer->expires, timer->flags);
/*
- * Set the next expiry time and kick the CPU so it can reevaluate the
- * wheel:
+ * Check whether this is the new first expiring timer. The
+ * effective expiry time of the timer is required here
+ * (bucket_expiry) instead of timer->expires.
*/
- base->next_expiry = timer->expires;
- wake_up_nohz_cpu(base->cpu);
+ if (time_before(bucket_expiry, base->next_expiry)) {
+ /*
+ * Set the next expiry time and kick the CPU so it
+ * can reevaluate the wheel:
+ */
+ base->next_expiry = bucket_expiry;
+ base->timers_pending = true;
+ base->next_expiry_recalc = false;
+ trigger_dyntick_cpu(base, timer);
+ }
}
-static void
-internal_add_timer(struct timer_base *base, struct timer_list *timer)
+static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
{
- __internal_add_timer(base, timer);
- trigger_dyntick_cpu(base, timer);
+ unsigned long bucket_expiry;
+ unsigned int idx;
+
+ idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry);
+ enqueue_timer(base, timer, idx, bucket_expiry);
}
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
-static struct debug_obj_descr timer_debug_descr;
+static const struct debug_obj_descr timer_debug_descr;
+
+struct timer_hint {
+ void (*function)(struct timer_list *t);
+ long offset;
+};
+
+#define TIMER_HINT(fn, container, timr, hintfn) \
+ { \
+ .function = fn, \
+ .offset = offsetof(container, hintfn) - \
+ offsetof(container, timr) \
+ }
+
+static const struct timer_hint timer_hints[] = {
+ TIMER_HINT(delayed_work_timer_fn,
+ struct delayed_work, timer, work.func),
+ TIMER_HINT(kthread_delayed_work_timer_fn,
+ struct kthread_delayed_work, timer, work.func),
+};
static void *timer_debug_hint(void *addr)
{
- return ((struct timer_list *) addr)->function;
+ struct timer_list *timer = addr;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(timer_hints); i++) {
+ if (timer_hints[i].function == timer->function) {
+ void (**fn)(void) = addr + timer_hints[i].offset;
+
+ return *fn;
+ }
+ }
+
+ return timer->function;
}
static bool timer_is_static_object(void *addr)
@@ -653,7 +721,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
- /* fall through */
+ fallthrough;
default:
return false;
}
@@ -694,7 +762,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
}
}
-static struct debug_obj_descr timer_debug_descr = {
+static const struct debug_obj_descr timer_debug_descr = {
.name = "timer_list",
.debug_hint = timer_debug_hint,
.is_static_object = timer_is_static_object,
@@ -719,11 +787,6 @@ static inline void debug_timer_deactivate(struct timer_list *timer)
debug_object_deactivate(timer, &timer_debug_descr);
}
-static inline void debug_timer_free(struct timer_list *timer)
-{
- debug_object_free(timer, &timer_debug_descr);
-}
-
static inline void debug_timer_assert_init(struct timer_list *timer)
{
debug_object_assert_init(timer, &timer_debug_descr);
@@ -781,6 +844,8 @@ static void do_init_timer(struct timer_list *timer,
{
timer->entry.pprev = NULL;
timer->function = func;
+ if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
+ flags &= TIMER_INIT_FLAGS;
timer->flags = flags | raw_smp_processor_id();
lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
@@ -826,8 +891,10 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
if (!timer_pending(timer))
return 0;
- if (hlist_is_singular_node(&timer->entry, base->vectors + idx))
+ if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) {
__clear_bit(idx, base->pending_map);
+ base->next_expiry_recalc = true;
+ }
detach_timer(timer, clear_pending);
return 1;
@@ -877,31 +944,27 @@ get_target_base(struct timer_base *base, unsigned tflags)
static inline void forward_timer_base(struct timer_base *base)
{
-#ifdef CONFIG_NO_HZ_COMMON
- unsigned long jnow;
+ unsigned long jnow = READ_ONCE(jiffies);
/*
- * We only forward the base when we are idle or have just come out of
- * idle (must_forward_clk logic), and have a delta between base clock
- * and jiffies. In the common case, run_timers will take care of it.
+ * No need to forward if we are close enough below jiffies.
+ * Also while executing timers, base->clk is 1 offset ahead
+ * of jiffies to avoid endless requeuing to current jiffies.
*/
- if (likely(!base->must_forward_clk))
- return;
-
- jnow = READ_ONCE(jiffies);
- base->must_forward_clk = base->is_idle;
- if ((long)(jnow - base->clk) < 2)
+ if ((long)(jnow - base->clk) < 1)
return;
/*
* If the next expiry value is > jiffies, then we fast forward to
* jiffies otherwise we forward to the next expiry value.
*/
- if (time_after(base->next_expiry, jnow))
+ if (time_after(base->next_expiry, jnow)) {
base->clk = jnow;
- else
+ } else {
+ if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
+ return;
base->clk = base->next_expiry;
-#endif
+ }
}
@@ -944,13 +1007,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
#define MOD_TIMER_PENDING_ONLY 0x01
#define MOD_TIMER_REDUCE 0x02
+#define MOD_TIMER_NOTPENDING 0x04
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
{
+ unsigned long clk = 0, flags, bucket_expiry;
struct timer_base *base, *new_base;
unsigned int idx = UINT_MAX;
- unsigned long clk = 0, flags;
int ret = 0;
BUG_ON(!timer->function);
@@ -960,7 +1024,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
* the timer is re-modified to have the same timeout or ends up in the
* same array bucket then just return:
*/
- if (timer_pending(timer)) {
+ if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
/*
* The downside of this optimization is that it can result in
* larger granularity than you would get from adding a new
@@ -989,7 +1053,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
}
clk = base->clk;
- idx = calc_wheel_index(expires, clk);
+ idx = calc_wheel_index(expires, clk, &bucket_expiry);
/*
* Retrieve and compare the array index of the pending
@@ -1042,16 +1106,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
/*
* If 'idx' was calculated above and the base time did not advance
* between calculating 'idx' and possibly switching the base, only
- * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise
- * we need to (re)calculate the wheel index via
- * internal_add_timer().
+ * enqueue_timer() is required. Otherwise we need to (re)calculate
+ * the wheel index via internal_add_timer().
*/
- if (idx != UINT_MAX && clk == base->clk) {
- enqueue_timer(base, timer, idx);
- trigger_dyntick_cpu(base, timer);
- } else {
+ if (idx != UINT_MAX && clk == base->clk)
+ enqueue_timer(base, timer, idx, bucket_expiry);
+ else
internal_add_timer(base, timer);
- }
out_unlock:
raw_spin_unlock_irqrestore(&base->lock, flags);
@@ -1133,7 +1194,7 @@ EXPORT_SYMBOL(timer_reduce);
void add_timer(struct timer_list *timer)
{
BUG_ON(timer_pending(timer));
- mod_timer(timer, timer->expires);
+ __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer);
@@ -1251,14 +1312,16 @@ static inline void timer_base_unlock_expiry(struct timer_base *base)
* The counterpart to del_timer_wait_running().
*
* If there is a waiter for base->expiry_lock, then it was waiting for the
- * timer callback to finish. Drop expiry_lock and reaquire it. That allows
+ * timer callback to finish. Drop expiry_lock and reacquire it. That allows
* the waiter to acquire the lock and make progress.
*/
static void timer_sync_wait_running(struct timer_base *base)
{
if (atomic_read(&base->timer_waiters)) {
+ raw_spin_unlock_irq(&base->lock);
spin_unlock(&base->expiry_lock);
spin_lock(&base->expiry_lock);
+ raw_spin_lock_irq(&base->lock);
}
}
@@ -1277,7 +1340,7 @@ static void del_timer_wait_running(struct timer_list *timer)
u32 tf;
tf = READ_ONCE(timer->flags);
- if (!(tf & TIMER_MIGRATING)) {
+ if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) {
struct timer_base *base = get_timer_base(tf);
/*
@@ -1361,6 +1424,13 @@ int del_timer_sync(struct timer_list *timer)
*/
WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
+ /*
+ * Must be able to sleep on PREEMPT_RT because of the slowpath in
+ * del_timer_wait_running().
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
+ lockdep_assert_preemption_enabled();
+
do {
ret = try_to_del_timer_sync(timer);
@@ -1442,22 +1512,22 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
if (timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
call_timer_fn(timer, fn, baseclk);
- base->running_timer = NULL;
raw_spin_lock(&base->lock);
+ base->running_timer = NULL;
} else {
raw_spin_unlock_irq(&base->lock);
call_timer_fn(timer, fn, baseclk);
+ raw_spin_lock_irq(&base->lock);
base->running_timer = NULL;
timer_sync_wait_running(base);
- raw_spin_lock_irq(&base->lock);
}
}
}
-static int __collect_expired_timers(struct timer_base *base,
- struct hlist_head *heads)
+static int collect_expired_timers(struct timer_base *base,
+ struct hlist_head *heads)
{
- unsigned long clk = base->clk;
+ unsigned long clk = base->clk = base->next_expiry;
struct hlist_head *vec;
int i, levels = 0;
unsigned int idx;
@@ -1479,7 +1549,6 @@ static int __collect_expired_timers(struct timer_base *base,
return levels;
}
-#ifdef CONFIG_NO_HZ_COMMON
/*
* Find the next pending bucket of a level. Search from level start (@offset)
* + @clk upwards and if nothing there, search from start of the level
@@ -1512,6 +1581,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
clk = base->clk;
for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
+ unsigned long lvl_clk = clk & LVL_CLK_MASK;
if (pos >= 0) {
unsigned long tmp = clk + (unsigned long) pos;
@@ -1519,6 +1589,13 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
tmp <<= LVL_SHIFT(lvl);
if (time_before(tmp, next))
next = tmp;
+
+ /*
+ * If the next expiration happens before we reach
+ * the next level, no need to check further.
+ */
+ if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
+ break;
}
/*
* Clock for the next level. If the current level clock lower
@@ -1556,13 +1633,18 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
* So the simple check whether the lower bits of the current
* level are 0 or not is sufficient for all cases.
*/
- adj = clk & LVL_CLK_MASK ? 1 : 0;
+ adj = lvl_clk ? 1 : 0;
clk >>= LVL_CLK_SHIFT;
clk += adj;
}
+
+ base->next_expiry_recalc = false;
+ base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
+
return next;
}
+#ifdef CONFIG_NO_HZ_COMMON
/*
* Check, if the next hrtimer event is before the next timer wheel
* event:
@@ -1609,7 +1691,6 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
u64 expires = KTIME_MAX;
unsigned long nextevt;
- bool is_max_delta;
/*
* Pretend that there is no timer pending if the cpu is offline.
@@ -1619,9 +1700,10 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
return expires;
raw_spin_lock(&base->lock);
- nextevt = __next_timer_interrupt(base);
- is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
- base->next_expiry = nextevt;
+ if (base->next_expiry_recalc)
+ base->next_expiry = __next_timer_interrupt(base);
+ nextevt = base->next_expiry;
+
/*
* We have a fresh next event. Check whether we can forward the
* base. We can only do that when @basej is past base->clk
@@ -1638,7 +1720,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
expires = basem;
base->is_idle = false;
} else {
- if (!is_max_delta)
+ if (base->timers_pending)
expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
/*
* If we expect to sleep more than a tick, mark the base idle.
@@ -1647,10 +1729,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
* logic is only maintained for the BASE_STD base, deferrable
* timers may still see large granularity skew (by design).
*/
- if ((expires - basem) > TICK_NSEC) {
- base->must_forward_clk = true;
+ if ((expires - basem) > TICK_NSEC)
base->is_idle = true;
- }
}
raw_spin_unlock(&base->lock);
@@ -1674,64 +1754,7 @@ void timer_clear_idle(void)
*/
base->is_idle = false;
}
-
-static int collect_expired_timers(struct timer_base *base,
- struct hlist_head *heads)
-{
- unsigned long now = READ_ONCE(jiffies);
-
- /*
- * NOHZ optimization. After a long idle sleep we need to forward the
- * base to current jiffies. Avoid a loop by searching the bitfield for
- * the next expiring timer.
- */
- if ((long)(now - base->clk) > 2) {
- unsigned long next = __next_timer_interrupt(base);
-
- /*
- * If the next timer is ahead of time forward to current
- * jiffies, otherwise forward to the next expiry time:
- */
- if (time_after(next, now)) {
- /*
- * The call site will increment base->clk and then
- * terminate the expiry loop immediately.
- */
- base->clk = now;
- return 0;
- }
- base->clk = next;
- }
- return __collect_expired_timers(base, heads);
-}
-#else
-static inline int collect_expired_timers(struct timer_base *base,
- struct hlist_head *heads)
-{
- return __collect_expired_timers(base, heads);
-}
-#endif
-
-/*
- * Called from the timer interrupt handler to charge one tick to the current
- * process. user_tick is 1 if the tick is user time, 0 for system.
- */
-void update_process_times(int user_tick)
-{
- struct task_struct *p = current;
-
- /* Note: this timer irq context must be accounted for as well. */
- account_process_tick(p, user_tick);
- run_local_timers();
- rcu_sched_clock_irq(user_tick);
-#ifdef CONFIG_IRQ_WORK
- if (in_irq())
- irq_work_tick();
#endif
- scheduler_tick();
- if (IS_ENABLED(CONFIG_POSIX_TIMERS))
- run_posix_cpu_timers();
-}
/**
* __run_timers - run all expired timers (if any) on this CPU.
@@ -1742,32 +1765,26 @@ static inline void __run_timers(struct timer_base *base)
struct hlist_head heads[LVL_DEPTH];
int levels;
- if (!time_after_eq(jiffies, base->clk))
+ if (time_before(jiffies, base->next_expiry))
return;
timer_base_lock_expiry(base);
raw_spin_lock_irq(&base->lock);
- /*
- * timer_base::must_forward_clk must be cleared before running
- * timers so that any timer functions that call mod_timer() will
- * not try to forward the base. Idle tracking / clock forwarding
- * logic is only used with BASE_STD timers.
- *
- * The must_forward_clk flag is cleared unconditionally also for
- * the deferrable base. The deferrable base is not affected by idle
- * tracking and never forwarded, so clearing the flag is a NOOP.
- *
- * The fact that the deferrable base is never forwarded can cause
- * large variations in granularity for deferrable timers, but they
- * can be deferred for long periods due to idle anyway.
- */
- base->must_forward_clk = false;
-
- while (time_after_eq(jiffies, base->clk)) {
-
+ while (time_after_eq(jiffies, base->clk) &&
+ time_after_eq(jiffies, base->next_expiry)) {
levels = collect_expired_timers(base, heads);
+ /*
+ * The two possible reasons for not finding any expired
+ * timer at this clk are that all matching timers have been
+ * dequeued or no timer has been queued since
+ * base::next_expiry was set to base::clk +
+ * NEXT_TIMER_MAX_DELTA.
+ */
+ WARN_ON_ONCE(!levels && !base->next_expiry_recalc
+ && base->timers_pending);
base->clk++;
+ base->next_expiry = __next_timer_interrupt(base);
while (levels--)
expire_timers(base, heads + levels);
@@ -1791,24 +1808,45 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
/*
* Called by the local, per-CPU timer interrupt on SMP.
*/
-void run_local_timers(void)
+static void run_local_timers(void)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
hrtimer_run_queues();
/* Raise the softirq only if required. */
- if (time_before(jiffies, base->clk)) {
+ if (time_before(jiffies, base->next_expiry)) {
if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
return;
/* CPU is awake, so check the deferrable base. */
base++;
- if (time_before(jiffies, base->clk))
+ if (time_before(jiffies, base->next_expiry))
return;
}
raise_softirq(TIMER_SOFTIRQ);
}
/*
+ * Called from the timer interrupt handler to charge one tick to the current
+ * process. user_tick is 1 if the tick is user time, 0 for system.
+ */
+void update_process_times(int user_tick)
+{
+ struct task_struct *p = current;
+
+ /* Note: this timer irq context must be accounted for as well. */
+ account_process_tick(p, user_tick);
+ run_local_timers();
+ rcu_sched_clock_irq(user_tick);
+#ifdef CONFIG_IRQ_WORK
+ if (in_irq())
+ irq_work_tick();
+#endif
+ scheduler_tick();
+ if (IS_ENABLED(CONFIG_POSIX_TIMERS))
+ run_posix_cpu_timers();
+}
+
+/*
* Since schedule_timeout()'s timer is defined on the stack, it must store
* the target task on the stack as well.
*/
@@ -1828,21 +1866,23 @@ static void process_timeout(struct timer_list *t)
* schedule_timeout - sleep until timeout
* @timeout: timeout value in jiffies
*
- * Make the current task sleep until @timeout jiffies have
- * elapsed. The routine will return immediately unless
- * the current task state has been set (see set_current_state()).
+ * Make the current task sleep until @timeout jiffies have elapsed.
+ * The function behavior depends on the current task state
+ * (see also set_current_state() description):
*
- * You can set the task state as follows -
+ * %TASK_RUNNING - the scheduler is called, but the task does not sleep
+ * at all. That happens because sched_submit_work() does nothing for
+ * tasks in %TASK_RUNNING state.
*
* %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
* pass before the routine returns unless the current task is explicitly
- * woken up, (e.g. by wake_up_process())".
+ * woken up, (e.g. by wake_up_process()).
*
* %TASK_INTERRUPTIBLE - the routine may return early if a signal is
* delivered to the current task or the current task is explicitly woken
* up.
*
- * The current task state is guaranteed to be TASK_RUNNING when this
+ * The current task state is guaranteed to be %TASK_RUNNING when this
* routine returns.
*
* Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
@@ -1850,7 +1890,7 @@ static void process_timeout(struct timer_list *t)
* value will be %MAX_SCHEDULE_TIMEOUT.
*
* Returns 0 when the timer has expired otherwise the remaining time in
- * jiffies will be returned. In all cases the return value is guaranteed
+ * jiffies will be returned. In all cases the return value is guaranteed
* to be non-negative.
*/
signed long __sched schedule_timeout(signed long timeout)
@@ -1882,7 +1922,7 @@ signed long __sched schedule_timeout(signed long timeout)
printk(KERN_ERR "schedule_timeout: wrong timeout "
"value %lx\n", timeout);
dump_stack();
- current->state = TASK_RUNNING;
+ __set_current_state(TASK_RUNNING);
goto out;
}
}
@@ -1891,7 +1931,7 @@ signed long __sched schedule_timeout(signed long timeout)
timer.task = current;
timer_setup_on_stack(&timer.timer, process_timeout, 0);
- __mod_timer(&timer.timer, expire, 0);
+ __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
schedule();
del_singleshot_timer_sync(&timer.timer);
@@ -1964,8 +2004,9 @@ int timers_prepare_cpu(unsigned int cpu)
base = per_cpu_ptr(&timer_bases[b], cpu);
base->clk = jiffies;
base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->next_expiry_recalc = false;
+ base->timers_pending = false;
base->is_idle = false;
- base->must_forward_clk = true;
}
return 0;
}
@@ -2018,6 +2059,7 @@ static void __init init_timer_cpu(int cpu)
base->cpu = cpu;
raw_spin_lock_init(&base->lock);
base->clk = jiffies;
+ base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
timer_base_init_expiry_lock(base);
}
}
@@ -2033,6 +2075,7 @@ static void __init init_timer_cpus(void)
void __init init_timers(void)
{
init_timer_cpus();
+ posix_cputimers_init_work();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
@@ -2066,26 +2109,28 @@ unsigned long msleep_interruptible(unsigned int msecs)
EXPORT_SYMBOL(msleep_interruptible);
/**
- * usleep_range - Sleep for an approximate time
- * @min: Minimum time in usecs to sleep
- * @max: Maximum time in usecs to sleep
+ * usleep_range_state - Sleep for an approximate time in a given state
+ * @min: Minimum time in usecs to sleep
+ * @max: Maximum time in usecs to sleep
+ * @state: State of the current task that will be while sleeping
*
* In non-atomic context where the exact wakeup time is flexible, use
- * usleep_range() instead of udelay(). The sleep improves responsiveness
+ * usleep_range_state() instead of udelay(). The sleep improves responsiveness
* by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
* power usage by allowing hrtimers to take advantage of an already-
* scheduled interrupt instead of scheduling a new one just for this sleep.
*/
-void __sched usleep_range(unsigned long min, unsigned long max)
+void __sched usleep_range_state(unsigned long min, unsigned long max,
+ unsigned int state)
{
ktime_t exp = ktime_add_us(ktime_get(), min);
u64 delta = (u64)(max - min) * NSEC_PER_USEC;
for (;;) {
- __set_current_state(TASK_UNINTERRUPTIBLE);
+ __set_current_state(state);
/* Do not return before the requested sleep time has elapsed */
if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
break;
}
}
-EXPORT_SYMBOL(usleep_range);
+EXPORT_SYMBOL(usleep_range_state);