From a092ff0f90cae22b2ac8028ecd2c6f6c1a9e4601 Mon Sep 17 00:00:00 2001 From: john stultz Date: Fri, 2 Oct 2009 16:17:53 -0700 Subject: time: Implement logarithmic time accumulation Accumulating one tick at a time works well unless we're using NOHZ. Then it can be an issue, since we may have to run through the loop a few thousand times, which can increase timer interrupt caused latency. The current solution was to accumulate in half-second intervals with NOHZ. This kept the number of loops down, however it did slightly change how we make NTP adjustments. While not an issue with NTPd users, as NTPd makes adjustments over a longer period of time, other adjtimex() users have noticed the half-second granularity with which we can apply frequency changes to the clock. For instance, if a application tries to apply a 100ppm frequency correction for 20ms to correct a 2us offset, with NOHZ they either get no correction, or a 50us correction. Now, there will always be some granularity error for applying frequency corrections. However with users sensitive to this error have seen a 50-500x increase with NOHZ compared to running without NOHZ. So I figured I'd try another approach then just simply increasing the interval. My approach is to consume the time interval logarithmically. This reduces the number of times through the loop needed keeping latency down, while still preserving the original granularity error for adjtimex() changes. Further, this change allows us to remove the xtime_cache code (patch to follow), as xtime is always within one tick of the current time, instead of the half-second updates it saw before. An earlier version of this patch has been shipping to x86 users in the RedHat MRG releases for awhile without issue, but I've reworked this version to be even more careful about avoiding possible overflows if the shift value gets too large. Signed-off-by: John Stultz Acked-by: Thomas Gleixner Reviewed-by: John Kacur Cc: Clark Williams Cc: Martin Schwidefsky Cc: Andrew Morton LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- include/linux/timex.h | 4 --- kernel/time/timekeeping.c | 85 +++++++++++++++++++++++++++++++++-------------- 2 files changed, 60 insertions(+), 29 deletions(-) diff --git a/include/linux/timex.h b/include/linux/timex.h index e6967d10d9e5..0c0ef7d4db7c 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -261,11 +261,7 @@ static inline int ntp_synced(void) #define NTP_SCALE_SHIFT 32 -#ifdef CONFIG_NO_HZ -#define NTP_INTERVAL_FREQ (2) -#else #define NTP_INTERVAL_FREQ (HZ) -#endif #define NTP_INTERVAL_LENGTH (NSEC_PER_SEC/NTP_INTERVAL_FREQ) /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fb0f46fa1ecd..5fdd78e0858a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -721,6 +721,51 @@ static void timekeeping_adjust(s64 offset) timekeeper.ntp_error_shift; } + +/** + * logarithmic_accumulation - shifted accumulation of cycles + * + * This functions accumulates a shifted interval of cycles into + * into a shifted interval nanoseconds. Allows for O(log) accumulation + * loop. + * + * Returns the unconsumed cycles. + */ +static cycle_t logarithmic_accumulation(cycle_t offset, int shift) +{ + u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; + + /* If the offset is smaller then a shifted interval, do nothing */ + if (offset < timekeeper.cycle_interval<cycle_last += timekeeper.cycle_interval << shift; + + timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; + while (timekeeper.xtime_nsec >= nsecps) { + timekeeper.xtime_nsec -= nsecps; + xtime.tv_sec++; + second_overflow(); + } + + /* Accumulate into raw time */ + raw_time.tv_nsec += timekeeper.raw_interval << shift;; + while (raw_time.tv_nsec >= NSEC_PER_SEC) { + raw_time.tv_nsec -= NSEC_PER_SEC; + raw_time.tv_sec++; + } + + /* Accumulate error between NTP and clock interval */ + timekeeper.ntp_error += tick_length << shift; + timekeeper.ntp_error -= timekeeper.xtime_interval << + (timekeeper.ntp_error_shift + shift); + + return offset; +} + + /** * update_wall_time - Uses the current clocksource to increment the wall time * @@ -731,6 +776,7 @@ void update_wall_time(void) struct clocksource *clock; cycle_t offset; u64 nsecs; + int shift = 0, maxshift; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -744,33 +790,22 @@ void update_wall_time(void) #endif timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; - /* normally this loop will run just once, however in the - * case of lost or late ticks, it will accumulate correctly. + /* + * With NO_HZ we may have to accumulate many cycle_intervals + * (think "ticks") worth of time at once. To do this efficiently, + * we calculate the largest doubling multiple of cycle_intervals + * that is smaller then the offset. We then accumulate that + * chunk in one go, and then try to consume the next smaller + * doubled multiple. */ + shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); + shift = max(0, shift); + /* Bound shift to one less then what overflows tick_length */ + maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; + shift = min(shift, maxshift); while (offset >= timekeeper.cycle_interval) { - u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; - - /* accumulate one interval */ - offset -= timekeeper.cycle_interval; - clock->cycle_last += timekeeper.cycle_interval; - - timekeeper.xtime_nsec += timekeeper.xtime_interval; - if (timekeeper.xtime_nsec >= nsecps) { - timekeeper.xtime_nsec -= nsecps; - xtime.tv_sec++; - second_overflow(); - } - - raw_time.tv_nsec += timekeeper.raw_interval; - if (raw_time.tv_nsec >= NSEC_PER_SEC) { - raw_time.tv_nsec -= NSEC_PER_SEC; - raw_time.tv_sec++; - } - - /* accumulate error between NTP and clock interval */ - timekeeper.ntp_error += tick_length; - timekeeper.ntp_error -= timekeeper.xtime_interval << - timekeeper.ntp_error_shift; + offset = logarithmic_accumulation(offset, shift); + shift--; } /* correct the clock when NTP error is too big */ -- cgit v1.2.3-59-g8ed1b From 7bc7d637452383d56ba4368d4336b0dde1bb476d Mon Sep 17 00:00:00 2001 From: john stultz Date: Fri, 2 Oct 2009 16:24:15 -0700 Subject: time: Remove xtime_cache With the prior logarithmic time accumulation patch, xtime will now always be within one "tick" of the current time, instead of possibly half a second off. This removes the need for the xtime_cache value, which always stored the time at the last interrupt, so this patch cleans that up removing the xtime_cache related code. This is a bit simpler, but still could use some wider testing. Signed-off-by: John Stultz Acked-by: Thomas Gleixner Reviewed-by: John Kacur Cc: Clark Williams Cc: Martin Schwidefsky Cc: Andrew Morton LKML-Reference: <1254525855.7741.95.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- kernel/time.c | 1 - kernel/time/timekeeping.c | 27 ++++----------------------- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/kernel/time.c b/kernel/time.c index 2e2e469a7fec..2ef4fe2079b6 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -136,7 +136,6 @@ static inline void warp_clock(void) write_seqlock_irq(&xtime_lock); wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; xtime.tv_sec += sys_tz.tz_minuteswest * 60; - update_xtime_cache(0); write_sequnlock_irq(&xtime_lock); clock_was_set(); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5fdd78e0858a..96b3f0dfa5dc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -164,13 +164,6 @@ struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; -static struct timespec xtime_cache __attribute__ ((aligned (16))); -void update_xtime_cache(u64 nsec) -{ - xtime_cache = xtime; - timespec_add_ns(&xtime_cache, nsec); -} - /* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { @@ -331,8 +324,6 @@ int do_settimeofday(struct timespec *tv) xtime = *tv; - update_xtime_cache(0); - timekeeper.ntp_error = 0; ntp_clear(); @@ -547,7 +538,6 @@ void __init timekeeping_init(void) } set_normalized_timespec(&wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); - update_xtime_cache(0); total_sleep_time.tv_sec = 0; total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -581,7 +571,6 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); total_sleep_time = timespec_add_safe(total_sleep_time, ts); } - update_xtime_cache(0); /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; @@ -721,7 +710,6 @@ static void timekeeping_adjust(s64 offset) timekeeper.ntp_error_shift; } - /** * logarithmic_accumulation - shifted accumulation of cycles * @@ -765,7 +753,6 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) return offset; } - /** * update_wall_time - Uses the current clocksource to increment the wall time * @@ -775,7 +762,6 @@ void update_wall_time(void) { struct clocksource *clock; cycle_t offset; - u64 nsecs; int shift = 0, maxshift; /* Make sure we're fully resumed: */ @@ -841,9 +827,6 @@ void update_wall_time(void) timekeeper.ntp_error += timekeeper.xtime_nsec << timekeeper.ntp_error_shift; - nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); - update_xtime_cache(nsecs); - /* check to see if there is a new clocksource to use */ update_vsyscall(&xtime, timekeeper.clock); } @@ -880,13 +863,13 @@ void monotonic_to_bootbased(struct timespec *ts) unsigned long get_seconds(void) { - return xtime_cache.tv_sec; + return xtime.tv_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return xtime_cache; + return xtime; } struct timespec current_kernel_time(void) @@ -896,8 +879,7 @@ struct timespec current_kernel_time(void) do { seq = read_seqbegin(&xtime_lock); - - now = xtime_cache; + now = xtime; } while (read_seqretry(&xtime_lock, seq)); return now; @@ -911,8 +893,7 @@ struct timespec get_monotonic_coarse(void) do { seq = read_seqbegin(&xtime_lock); - - now = xtime_cache; + now = xtime; mono = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); -- cgit v1.2.3-59-g8ed1b From eed3b9cf3fe3fcc7a50238dfcab63a63914e8f42 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 29 Sep 2009 14:25:15 +0200 Subject: nohz: Reuse ktime in sub-functions of tick_check_idle. On a system with NOHZ=y tick_check_idle calls tick_nohz_stop_idle and tick_nohz_update_jiffies. Given the right conditions (ts->idle_active and/or ts->tick_stopped) both function get a time stamp with ktime_get. The same time stamp can be reused if both function require one. On s390 this change has the additional benefit that gcc inlines the tick_nohz_stop_idle function into tick_check_idle. The number of instructions to execute tick_check_idle drops from 225 to 144 (without the ktime_get optimization it is 367 vs 215 instructions). before: 0) | tick_check_idle() { 0) | tick_nohz_stop_idle() { 0) | ktime_get() { 0) | read_tod_clock() { 0) 0.601 us | } 0) 1.765 us | } 0) 3.047 us | } 0) | ktime_get() { 0) | read_tod_clock() { 0) 0.570 us | } 0) 1.727 us | } 0) | tick_do_update_jiffies64() { 0) 0.609 us | } 0) 8.055 us | } after: 0) | tick_check_idle() { 0) | ktime_get() { 0) | read_tod_clock() { 0) 0.617 us | } 0) 1.773 us | } 0) | tick_do_update_jiffies64() { 0) 0.593 us | } 0) 4.477 us | } Signed-off-by: Martin Schwidefsky Cc: john stultz LKML-Reference: <20090929122533.206589318@de.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 62 ++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e0f59a21c061..7378e2c71ca6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz); * value. We do this unconditionally on any cpu, as we don't know whether the * cpu, which has the update task assigned is in a long sleep. */ -static void tick_nohz_update_jiffies(void) +static void tick_nohz_update_jiffies(ktime_t now) { int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); unsigned long flags; - ktime_t now; - - if (!ts->tick_stopped) - return; cpumask_clear_cpu(cpu, nohz_cpu_mask); - now = ktime_get(); ts->idle_waketime = now; local_irq_save(flags); @@ -155,20 +150,17 @@ static void tick_nohz_update_jiffies(void) touch_softlockup_watchdog(); } -static void tick_nohz_stop_idle(int cpu) +static void tick_nohz_stop_idle(int cpu, ktime_t now) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t delta; - if (ts->idle_active) { - ktime_t now, delta; - now = ktime_get(); - delta = ktime_sub(now, ts->idle_entrytime); - ts->idle_lastupdate = now; - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - ts->idle_active = 0; + delta = ktime_sub(now, ts->idle_entrytime); + ts->idle_lastupdate = now; + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + ts->idle_active = 0; - sched_clock_idle_wakeup_event(0); - } + sched_clock_idle_wakeup_event(0); } static ktime_t tick_nohz_start_idle(struct tick_sched *ts) @@ -431,7 +423,11 @@ void tick_nohz_restart_sched_tick(void) ktime_t now; local_irq_disable(); - tick_nohz_stop_idle(cpu); + if (ts->idle_active || (ts->inidle && ts->tick_stopped)) + now = ktime_get(); + + if (ts->idle_active) + tick_nohz_stop_idle(cpu, now); if (!ts->inidle || !ts->tick_stopped) { ts->inidle = 0; @@ -445,7 +441,6 @@ void tick_nohz_restart_sched_tick(void) /* Update jiffies first */ select_nohz_load_balancer(0); - now = ktime_get(); tick_do_update_jiffies64(now); cpumask_clear_cpu(cpu, nohz_cpu_mask); @@ -579,22 +574,18 @@ static void tick_nohz_switch_to_nohz(void) * timer and do not touch the other magic bits which need to be done * when idle is left. */ -static void tick_nohz_kick_tick(int cpu) +static void tick_nohz_kick_tick(int cpu, ktime_t now) { #if 0 /* Switch back to 2.6.27 behaviour */ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - ktime_t delta, now; - - if (!ts->tick_stopped) - return; + ktime_t delta; /* * Do not touch the tick device, when the next expiry is either * already reached or less/equal than the tick period. */ - now = ktime_get(); delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); if (delta.tv64 <= tick_period.tv64) return; @@ -603,9 +594,26 @@ static void tick_nohz_kick_tick(int cpu) #endif } +static inline void tick_check_nohz(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now; + + if (!ts->idle_active && !ts->tick_stopped) + return; + now = ktime_get(); + if (ts->idle_active) + tick_nohz_stop_idle(cpu, now); + if (ts->tick_stopped) { + tick_nohz_update_jiffies(now); + tick_nohz_kick_tick(cpu, now); + } +} + #else static inline void tick_nohz_switch_to_nohz(void) { } +static inline void tick_check_nohz(int cpu) { } #endif /* NO_HZ */ @@ -615,11 +623,7 @@ static inline void tick_nohz_switch_to_nohz(void) { } void tick_check_idle(int cpu) { tick_check_oneshot_broadcast(cpu); -#ifdef CONFIG_NO_HZ - tick_nohz_stop_idle(cpu); - tick_nohz_update_jiffies(); - tick_nohz_kick_tick(cpu); -#endif + tick_check_nohz(cpu); } /* -- cgit v1.2.3-59-g8ed1b From 3c5d92a0cfb5103c0d5ab74d4ae6373d3af38148 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 29 Sep 2009 14:25:16 +0200 Subject: nohz: Introduce arch_needs_cpu Allow the architecture to request a normal jiffy tick when the system goes idle and tick_nohz_stop_sched_tick is called . On s390 the hook is used to prevent the system going fully idle if there has been an interrupt other than a clock comparator interrupt since the last wakeup. On s390 the HiperSockets response time for 1 connection ping-pong goes down from 42 to 34 microseconds. The CPU cost decreases by 27%. Signed-off-by: Martin Schwidefsky LKML-Reference: <20090929122533.402715150@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/s390/include/asm/cputime.h | 8 ++++++++ arch/s390/kernel/s390_ext.c | 2 ++ arch/s390/kernel/vtime.c | 2 ++ drivers/s390/cio/cio.c | 1 + include/linux/tick.h | 3 +++ kernel/time/tick-sched.c | 13 ++++++++----- 6 files changed, 24 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index 24b1244aadb9..95f3561517c7 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -183,6 +183,7 @@ struct s390_idle_data { unsigned long long idle_count; unsigned long long idle_enter; unsigned long long idle_time; + int nohz_delay; }; DECLARE_PER_CPU(struct s390_idle_data, s390_idle); @@ -198,4 +199,11 @@ static inline void s390_idle_check(void) vtime_start_cpu(); } +static inline int s390_nohz_delay(int cpu) +{ + return per_cpu(s390_idle, cpu).nohz_delay != 0; +} + +#define arch_needs_cpu(cpu) s390_nohz_delay(cpu) + #endif /* _S390_CPUTIME_H */ diff --git a/arch/s390/kernel/s390_ext.c b/arch/s390/kernel/s390_ext.c index 0de305b598ce..59618bcd99b7 100644 --- a/arch/s390/kernel/s390_ext.c +++ b/arch/s390/kernel/s390_ext.c @@ -126,6 +126,8 @@ void __irq_entry do_extint(struct pt_regs *regs, unsigned short code) /* Serve timer interrupts first. */ clock_comparator_work(); kstat_cpu(smp_processor_id()).irqs[EXTERNAL_INTERRUPT]++; + if (code != 0x1004) + __get_cpu_var(s390_idle).nohz_delay = 1; index = ext_hash(code); for (p = ext_int_hash[index]; p; p = p->next) { if (likely(p->code == code)) diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index c41bb0d416e1..b59a812a010e 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -167,6 +167,8 @@ void vtime_stop_cpu(void) /* Wait for external, I/O or machine check interrupt. */ psw.mask = psw_kernel_bits | PSW_MASK_WAIT | PSW_MASK_IO | PSW_MASK_EXT; + idle->nohz_delay = 0; + /* Check if the CPU timer needs to be reprogrammed. */ if (vq->do_spt) { __u64 vmax = VTIMER_MAX_SLICE; diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c index 138124fcfcad..126f240715a4 100644 --- a/drivers/s390/cio/cio.c +++ b/drivers/s390/cio/cio.c @@ -618,6 +618,7 @@ void __irq_entry do_IRQ(struct pt_regs *regs) old_regs = set_irq_regs(regs); s390_idle_check(); irq_enter(); + __get_cpu_var(s390_idle).nohz_delay = 1; if (S390_lowcore.int_clock >= S390_lowcore.clock_comparator) /* Serve timer interrupts first. */ clock_comparator_work(); diff --git a/include/linux/tick.h b/include/linux/tick.h index 0482229c07db..8dc082194b22 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -98,6 +98,9 @@ extern int tick_check_oneshot_change(int allow_nohz); extern struct tick_sched *tick_get_tick_sched(int cpu); extern void tick_check_idle(int cpu); extern int tick_oneshot_mode_active(void); +# ifndef arch_needs_cpu +# define arch_needs_cpu(cpu) (0) +# endif # else static inline void tick_clock_notify(void) { } static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7378e2c71ca6..3840f6dff7eb 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -264,12 +264,15 @@ void tick_nohz_stop_sched_tick(int inidle) last_jiffies = jiffies; } while (read_seqretry(&xtime_lock, seq)); - /* Get the next timer wheel timer */ - next_jiffies = get_next_timer_interrupt(last_jiffies); - delta_jiffies = next_jiffies - last_jiffies; - - if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu)) + if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || + arch_needs_cpu(cpu)) { + next_jiffies = last_jiffies + 1; delta_jiffies = 1; + } else { + /* Get the next timer wheel timer */ + next_jiffies = get_next_timer_interrupt(last_jiffies); + delta_jiffies = next_jiffies - last_jiffies; + } /* * Do not stop the tick, if we are only one off * or if the cpu is required for rcu -- cgit v1.2.3-59-g8ed1b From 23af368e9a904f59256c27d371ce223d6cee0430 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Nov 2009 14:05:25 +0000 Subject: clockevents: Use u32 for mult and shift factors The mult and shift factors of clock events differ in their data type from those of clock sources for no reason. u32 is sufficient for both. shift is always <= 32 and mult is limited to 2^32-1 to avoid 64bit multiplication overflows in the conversion. Preparatory patch for a generic mult/shift factor calculation function. Signed-off-by: Thomas Gleixner Tested-by: Mikael Pettersson Acked-by: Ralf Baechle Acked-by: Linus Walleij Cc: John Stultz LKML-Reference: <20091111134229.725664788@linutronix.de> --- include/linux/clockchips.h | 4 ++-- kernel/time/timer_list.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 3a1dbba4d3ae..3b5841016276 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -79,8 +79,8 @@ struct clock_event_device { unsigned int features; unsigned long max_delta_ns; unsigned long min_delta_ns; - unsigned long mult; - int shift; + u32 mult; + u32 shift; int rating; int irq; const struct cpumask *cpumask; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 1b5b7aa2fdfd..fa00da108a14 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -206,8 +206,8 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) SEQ_printf(m, "%s\n", dev->name); SEQ_printf(m, " max_delta_ns: %lu\n", dev->max_delta_ns); SEQ_printf(m, " min_delta_ns: %lu\n", dev->min_delta_ns); - SEQ_printf(m, " mult: %lu\n", dev->mult); - SEQ_printf(m, " shift: %d\n", dev->shift); + SEQ_printf(m, " mult: %u\n", dev->mult); + SEQ_printf(m, " shift: %u\n", dev->shift); SEQ_printf(m, " mode: %d\n", dev->mode); SEQ_printf(m, " next_event: %Ld nsecs\n", (unsigned long long) ktime_to_ns(dev->next_event)); -- cgit v1.2.3-59-g8ed1b From 7d2f944a2b836c69a9d260a0a5f0d1720d57fdff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Nov 2009 14:05:29 +0000 Subject: clocksource: Provide a generic mult/shift factor calculation MIPS has two functions to calculcate the mult/shift factors for clock sources and clock events at run time. ARM needs such functions as well. Implement a function which calculates the mult/shift factors based on the frequencies to which and from which is converted. The function also has a parameter to specify the minimum conversion range in seconds. This range is guaranteed not to produce a 64bit overflow when a value is multiplied with the calculated mult factor. The larger the conversion range the less becomes the conversion accuracy. Provide two inline wrappers which handle clock events and clock sources. For clock events the "from" frequency is nano seconds per second which corresponds to 1GHz and "to" is the device frequency. For clock sources "from" is the device frequency and "to" is nano seconds per second. Signed-off-by: Thomas Gleixner Tested-by: Mikael Pettersson Acked-by: Ralf Baechle Acked-by: Linus Walleij Cc: John Stultz LKML-Reference: <20091111134229.766673305@linutronix.de> --- include/linux/clockchips.h | 7 ++++++ include/linux/clocksource.h | 10 +++++++++ kernel/time/clocksource.c | 53 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 3b5841016276..4d438b0bc10a 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -130,6 +130,13 @@ extern int clockevents_program_event(struct clock_event_device *dev, extern void clockevents_handle_noop(struct clock_event_device *dev); +static inline void +clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 minsec) +{ + return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, + freq, minsec); +} + #ifdef CONFIG_GENERIC_CLOCKEVENTS extern void clockevents_notify(unsigned long reason, void *arg); #else diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 83d2fbd81b93..f57f88250526 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -279,6 +279,16 @@ extern void clocksource_resume(void); extern struct clocksource * __init __weak clocksource_default_clock(void); extern void clocksource_mark_unstable(struct clocksource *cs); +extern void +clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec); + +static inline void +clocksource_calc_mult_shift(struct clocksource *cs, u32 freq, u32 minsec) +{ + return clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, + NSEC_PER_SEC, minsec); +} + #ifdef CONFIG_GENERIC_TIME_VSYSCALL extern void update_vsyscall(struct timespec *ts, struct clocksource *c); extern void update_vsyscall_tz(void); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 5e18c6ab2c6a..407c0894ef37 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -107,6 +107,59 @@ u64 timecounter_cyc2time(struct timecounter *tc, } EXPORT_SYMBOL(timecounter_cyc2time); +/** + * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks + * @mult: pointer to mult variable + * @shift: pointer to shift variable + * @from: frequency to convert from + * @to: frequency to convert to + * @minsec: guaranteed runtime conversion range in seconds + * + * The function evaluates the shift/mult pair for the scaled math + * operations of clocksources and clockevents. + * + * @to and @from are frequency values in HZ. For clock sources @to is + * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock + * event @to is the counter frequency and @from is NSEC_PER_SEC. + * + * The @minsec conversion range argument controls the time frame in + * seconds which must be covered by the runtime conversion with the + * calculated mult and shift factors. This guarantees that no 64bit + * overflow happens when the input value of the conversion is + * multiplied with the calculated mult factor. Larger ranges may + * reduce the conversion accuracy by chosing smaller mult and shift + * factors. + */ +void +clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) +{ + u64 tmp; + u32 sft, sftacc= 32; + + /* + * Calculate the shift factor which is limiting the conversion + * range: + */ + tmp = ((u64)minsec * from) >> 32; + while (tmp) { + tmp >>=1; + sftacc--; + } + + /* + * Find the conversion shift/mult pair which has the best + * accuracy and fits the maxsec conversion range: + */ + for (sft = 32; sft > 0; sft--) { + tmp = (u64) to << sft; + do_div(tmp, from); + if ((tmp >> sftacc) == 0) + break; + } + *mult = tmp; + *shift = sft; +} + /*[Clocksource internal variables]--------- * curr_clocksource: * currently selected clocksource. -- cgit v1.2.3-59-g8ed1b From e3a4fab0c0c30e21e104712f4e9cb39f175d0f21 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Nov 2009 14:05:34 +0000 Subject: mips: Use generic mult/shift factor calculation for clocks Replace the MIPS functions of mult/shift factor calculation for clock events and clock sources with inline functions which call the generic functions. The minimum guaranteed conversion range is set to 4 seconds which corresponds to the current MIPS implementation. Signed-off-by: Thomas Gleixner Cc: Mikael Pettersson Acked-by: Ralf Baechle Acked-by: Linus Walleij Cc: John Stultz LKML-Reference: <20091111134229.807255074@linutronix.de> --- arch/mips/include/asm/time.h | 14 +++++++++++--- arch/mips/kernel/time.c | 33 --------------------------------- 2 files changed, 11 insertions(+), 36 deletions(-) diff --git a/arch/mips/include/asm/time.h b/arch/mips/include/asm/time.h index df6a430de5eb..c7f1bfef1574 100644 --- a/arch/mips/include/asm/time.h +++ b/arch/mips/include/asm/time.h @@ -84,8 +84,16 @@ static inline int init_mips_clocksource(void) #endif } -extern void clocksource_set_clock(struct clocksource *cs, unsigned int clock); -extern void clockevent_set_clock(struct clock_event_device *cd, - unsigned int clock); +static inline void clocksource_set_clock(struct clocksource *cs, + unsigned int clock) +{ + clocksource_calc_mult_shift(cs, clock, 4); +} + +static inline void clockevent_set_clock(struct clock_event_device *cd, + unsigned int clock) +{ + clockevents_calc_mult_shift(cd, clock, 4); +} #endif /* _ASM_TIME_H */ diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c index 1f467d534642..fb7497405510 100644 --- a/arch/mips/kernel/time.c +++ b/arch/mips/kernel/time.c @@ -71,39 +71,6 @@ EXPORT_SYMBOL(perf_irq); unsigned int mips_hpt_frequency; -void __init clocksource_set_clock(struct clocksource *cs, unsigned int clock) -{ - u64 temp; - u32 shift; - - /* Find a shift value */ - for (shift = 32; shift > 0; shift--) { - temp = (u64) NSEC_PER_SEC << shift; - do_div(temp, clock); - if ((temp >> 32) == 0) - break; - } - cs->shift = shift; - cs->mult = (u32) temp; -} - -void __cpuinit clockevent_set_clock(struct clock_event_device *cd, - unsigned int clock) -{ - u64 temp; - u32 shift; - - /* Find a shift value */ - for (shift = 32; shift > 0; shift--) { - temp = (u64) clock << shift; - do_div(temp, NSEC_PER_SEC); - if ((temp >> 32) == 0) - break; - } - cd->shift = shift; - cd->mult = (u32) temp; -} - /* * This function exists in order to cause an error due to a duplicate * definition if platform code should have its own implementation. The hook -- cgit v1.2.3-59-g8ed1b From 529eaccd900a59724619b4a6ef6579fd518d5218 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Nov 2009 14:32:19 +0100 Subject: nohz: Type cast printk argument On some archs local_softirq_pending() has a data type of unsigned long on others its unsigned int. Type cast it to (unsigned int) in the printk to avoid the compiler warning. Signed-off-by: Thomas Gleixner LKML-Reference: --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3840f6dff7eb..c65ba0faa98f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -250,7 +250,7 @@ void tick_nohz_stop_sched_tick(int inidle) if (ratelimit < 10) { printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - local_softirq_pending()); + (unsigned int) local_softirq_pending()); ratelimit++; } goto end; -- cgit v1.2.3-59-g8ed1b From 98962465ed9e6ea99c38e0af63fe1dcb5a79dc25 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 18 Aug 2009 12:45:10 -0500 Subject: nohz: Prevent clocksource wrapping during idle The dynamic tick allows the kernel to sleep for periods longer than a single tick, but it does not limit the sleep time currently. In the worst case the kernel could sleep longer than the wrap around time of the time keeping clock source which would result in losing track of time. Prevent this by limiting it to the safe maximum sleep time of the current time keeping clock source. The value is calculated when the clock source is registered. [ tglx: simplified the code a bit and massaged the commit msg ] Signed-off-by: Jon Hunter Cc: John Stultz LKML-Reference: <1250617512-23567-2-git-send-email-jon-hunter@ti.com> Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 2 ++ include/linux/time.h | 1 + kernel/time/clocksource.c | 44 ++++++++++++++++++++++++++++++++++++++ kernel/time/tick-sched.c | 52 +++++++++++++++++++++++++++++++++------------ kernel/time/timekeeping.c | 11 ++++++++++ 5 files changed, 96 insertions(+), 14 deletions(-) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index f57f88250526..279c5478e8a6 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -151,6 +151,7 @@ extern u64 timecounter_cyc2time(struct timecounter *tc, * subtraction of non 64 bit counters * @mult: cycle to nanosecond multiplier * @shift: cycle to nanosecond divisor (power of two) + * @max_idle_ns: max idle time permitted by the clocksource (nsecs) * @flags: flags describing special properties * @vread: vsyscall based read * @resume: resume function for the clocksource, if necessary @@ -168,6 +169,7 @@ struct clocksource { cycle_t mask; u32 mult; u32 shift; + u64 max_idle_ns; unsigned long flags; cycle_t (*vread)(void); void (*resume)(void); diff --git a/include/linux/time.h b/include/linux/time.h index fe04e5ef6a59..6e026e45a179 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -148,6 +148,7 @@ extern void monotonic_to_bootbased(struct timespec *ts); extern struct timespec timespec_trunc(struct timespec t, unsigned gran); extern int timekeeping_valid_for_hres(void); +extern u64 timekeeping_max_deferment(void); extern void update_wall_time(void); extern void update_xtime_cache(u64 nsec); extern void timekeeping_leap_insert(int leapsecond); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 407c0894ef37..b65b242f04dd 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -468,6 +468,47 @@ void clocksource_touch_watchdog(void) #ifdef CONFIG_GENERIC_TIME +/** + * clocksource_max_deferment - Returns max time the clocksource can be deferred + * @cs: Pointer to clocksource + * + */ +static u64 clocksource_max_deferment(struct clocksource *cs) +{ + u64 max_nsecs, max_cycles; + + /* + * Calculate the maximum number of cycles that we can pass to the + * cyc2ns function without overflowing a 64-bit signed result. The + * maximum number of cycles is equal to ULLONG_MAX/cs->mult which + * is equivalent to the below. + * max_cycles < (2^63)/cs->mult + * max_cycles < 2^(log2((2^63)/cs->mult)) + * max_cycles < 2^(log2(2^63) - log2(cs->mult)) + * max_cycles < 2^(63 - log2(cs->mult)) + * max_cycles < 1 << (63 - log2(cs->mult)) + * Please note that we add 1 to the result of the log2 to account for + * any rounding errors, ensure the above inequality is satisfied and + * no overflow will occur. + */ + max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); + + /* + * The actual maximum number of cycles we can defer the clocksource is + * determined by the minimum of max_cycles and cs->mask. + */ + max_cycles = min_t(u64, max_cycles, (u64) cs->mask); + max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); + + /* + * To ensure that the clocksource does not wrap whilst we are idle, + * limit the time the clocksource can be deferred by 12.5%. Please + * note a margin of 12.5% is used because this can be computed with + * a shift, versus say 10% which would require division. + */ + return max_nsecs - (max_nsecs >> 5); +} + /** * clocksource_select - Select the best clocksource available * @@ -564,6 +605,9 @@ static void clocksource_enqueue(struct clocksource *cs) */ int clocksource_register(struct clocksource *cs) { + /* calculate max idle time permitted for this clocksource */ + cs->max_idle_ns = clocksource_max_deferment(cs); + mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); clocksource_select(); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c65ba0faa98f..a80b4644fe6b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -208,6 +208,7 @@ void tick_nohz_stop_sched_tick(int inidle) struct tick_sched *ts; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + u64 time_delta; int cpu; local_irq_save(flags); @@ -262,6 +263,17 @@ void tick_nohz_stop_sched_tick(int inidle) seq = read_seqbegin(&xtime_lock); last_update = last_jiffies_update; last_jiffies = jiffies; + + /* + * On SMP we really should only care for the CPU which + * has the do_timer duty assigned. All other CPUs can + * sleep as long as they want. + */ + if (cpu == tick_do_timer_cpu || + tick_do_timer_cpu == TICK_DO_TIMER_NONE) + time_delta = timekeeping_max_deferment(); + else + time_delta = KTIME_MAX; } while (read_seqretry(&xtime_lock, seq)); if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || @@ -284,11 +296,26 @@ void tick_nohz_stop_sched_tick(int inidle) if ((long)delta_jiffies >= 1) { /* - * calculate the expiry time for the next timer wheel - * timer - */ - expires = ktime_add_ns(last_update, tick_period.tv64 * - delta_jiffies); + * calculate the expiry time for the next timer wheel + * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals + * that there is no timer pending or at least extremely + * far into the future (12 days for HZ=1000). In this + * case we set the expiry to the end of time. + */ + if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { + /* + * Calculate the time delta for the next timer event. + * If the time delta exceeds the maximum time delta + * permitted by the current clocksource then adjust + * the time delta accordingly to ensure the + * clocksource does not wrap. + */ + time_delta = min_t(u64, time_delta, + tick_period.tv64 * delta_jiffies); + expires = ktime_add_ns(last_update, time_delta); + } else { + expires.tv64 = KTIME_MAX; + } /* * If this cpu is the one which updates jiffies, then @@ -332,22 +359,19 @@ void tick_nohz_stop_sched_tick(int inidle) ts->idle_sleeps++; + /* Mark expires */ + ts->idle_expires = expires; + /* - * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that - * there is no timer pending or at least extremly far - * into the future (12 days for HZ=1000). In this case - * we simply stop the tick timer: + * If the expiration time == KTIME_MAX, then + * in this case we simply stop the tick timer. */ - if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) { - ts->idle_expires.tv64 = KTIME_MAX; + if (unlikely(expires.tv64 == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); goto out; } - /* Mark expiries */ - ts->idle_expires = expires; - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 96b3f0dfa5dc..5d4d4239a0aa 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -477,6 +477,17 @@ int timekeeping_valid_for_hres(void) return ret; } +/** + * timekeeping_max_deferment - Returns max time the clocksource can be deferred + * + * Caller must observe xtime_lock via read_seqbegin/read_seqretry to + * ensure that the clocksource does not change! + */ +u64 timekeeping_max_deferment(void) +{ + return timekeeper.clock->max_idle_ns; +} + /** * read_persistent_clock - Return time from the persistent clock. * -- cgit v1.2.3-59-g8ed1b From 27185016b806d5a1181ff501cae120582b2b27dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Nov 2009 22:12:06 +0100 Subject: nohz: Track last do_timer() cpu The previous patch which limits the sleep time to the maximum deferment time of the time keeping clocksource has some limitations on SMP machines: if all CPUs are idle then for all CPUs the maximum sleep time is limited. Solve this by keeping track of which cpu had the do_timer() duty assigned last and limit the sleep time only for this cpu. Signed-off-by: Thomas Gleixner LKML-Reference: Cc: Jon Hunter Cc: John Stultz --- include/linux/tick.h | 2 ++ kernel/time/tick-sched.c | 52 ++++++++++++++++++++++++++---------------------- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/include/linux/tick.h b/include/linux/tick.h index 8dc082194b22..d2ae79e21be3 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -43,6 +43,7 @@ enum tick_nohz_mode { * @idle_exittime: Time when the idle state was left * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped * @sleep_length: Duration of the current idle sleep + * @do_timer_lst: CPU was the last one doing do_timer before going idle */ struct tick_sched { struct hrtimer sched_timer; @@ -64,6 +65,7 @@ struct tick_sched { unsigned long last_jiffies; unsigned long next_jiffies; ktime_t idle_expires; + int do_timer_last; }; extern void __init tick_init(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a80b4644fe6b..df133bc29f89 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -263,17 +263,7 @@ void tick_nohz_stop_sched_tick(int inidle) seq = read_seqbegin(&xtime_lock); last_update = last_jiffies_update; last_jiffies = jiffies; - - /* - * On SMP we really should only care for the CPU which - * has the do_timer duty assigned. All other CPUs can - * sleep as long as they want. - */ - if (cpu == tick_do_timer_cpu || - tick_do_timer_cpu == TICK_DO_TIMER_NONE) - time_delta = timekeeping_max_deferment(); - else - time_delta = KTIME_MAX; + time_delta = timekeeping_max_deferment(); } while (read_seqretry(&xtime_lock, seq)); if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || @@ -295,6 +285,29 @@ void tick_nohz_stop_sched_tick(int inidle) /* Schedule the tick, if we are at least one jiffie off */ if ((long)delta_jiffies >= 1) { + /* + * If this cpu is the one which updates jiffies, then + * give up the assignment and let it be taken by the + * cpu which runs the tick timer next, which might be + * this cpu as well. If we don't drop this here the + * jiffies might be stale and do_timer() never + * invoked. Keep track of the fact that it was the one + * which had the do_timer() duty last. If this cpu is + * the one which had the do_timer() duty last, we + * limit the sleep time to the timekeeping + * max_deferement value which we retrieved + * above. Otherwise we can sleep as long as we want. + */ + if (cpu == tick_do_timer_cpu) { + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + ts->do_timer_last = 1; + } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { + time_delta = KTIME_MAX; + ts->do_timer_last = 0; + } else if (!ts->do_timer_last) { + time_delta = KTIME_MAX; + } + /* * calculate the expiry time for the next timer wheel * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals @@ -312,21 +325,12 @@ void tick_nohz_stop_sched_tick(int inidle) */ time_delta = min_t(u64, time_delta, tick_period.tv64 * delta_jiffies); - expires = ktime_add_ns(last_update, time_delta); - } else { - expires.tv64 = KTIME_MAX; } - /* - * If this cpu is the one which updates jiffies, then - * give up the assignment and let it be taken by the - * cpu which runs the tick timer next, which might be - * this cpu as well. If we don't drop this here the - * jiffies might be stale and do_timer() never - * invoked. - */ - if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = TICK_DO_TIMER_NONE; + if (time_delta < KTIME_MAX) + expires = ktime_add_ns(last_update, time_delta); + else + expires.tv64 = KTIME_MAX; if (delta_jiffies > 1) cpumask_set_cpu(cpu, nohz_cpu_mask); -- cgit v1.2.3-59-g8ed1b From 97813f2fe77804a4464564c75ba8d8826377feea Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 18 Aug 2009 12:45:11 -0500 Subject: nohz: Allow 32-bit machines to sleep for more than 2.15 seconds In the dynamic tick code, "max_delta_ns" (member of the "clock_event_device" structure) represents the maximum sleep time that can occur between timer events in nanoseconds. The variable, "max_delta_ns", is defined as an unsigned long which is a 32-bit integer for 32-bit machines and a 64-bit integer for 64-bit machines (if -m64 option is used for gcc). The value of max_delta_ns is set by calling the function "clockevent_delta2ns()" which returns a maximum value of LONG_MAX. For a 32-bit machine LONG_MAX is equal to 0x7fffffff and in nanoseconds this equates to ~2.15 seconds. Hence, the maximum sleep time for a 32-bit machine is ~2.15 seconds, where as for a 64-bit machine it will be many years. This patch changes the type of max_delta_ns to be "u64" instead of "unsigned long" so that this variable is a 64-bit type for both 32-bit and 64-bit machines. It also changes the maximum value returned by clockevent_delta2ns() to KTIME_MAX. Hence this allows a 32-bit machine to sleep for longer than ~2.15 seconds. Please note that this patch also changes "min_delta_ns" to be "u64" too and although this is unnecessary, it makes the patch simpler as it avoids to fixup all callers of clockevent_delta2ns(). [ tglx: changed "unsigned long long" to u64 as we use this data type through out the time code ] Signed-off-by: Jon Hunter Cc: John Stultz LKML-Reference: <1250617512-23567-3-git-send-email-jon-hunter@ti.com> Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 8 ++++---- kernel/hrtimer.c | 3 ++- kernel/time/clockevents.c | 11 +++++------ kernel/time/tick-oneshot.c | 4 ++-- kernel/time/timer_list.c | 6 ++++-- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 4d438b0bc10a..0cf725bdd2a1 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -77,8 +77,8 @@ enum clock_event_nofitiers { struct clock_event_device { const char *name; unsigned int features; - unsigned long max_delta_ns; - unsigned long min_delta_ns; + u64 max_delta_ns; + u64 min_delta_ns; u32 mult; u32 shift; int rating; @@ -116,8 +116,8 @@ static inline unsigned long div_sc(unsigned long ticks, unsigned long nsec, } /* Clock event layer functions */ -extern unsigned long clockevent_delta2ns(unsigned long latch, - struct clock_event_device *evt); +extern u64 clockevent_delta2ns(unsigned long latch, + struct clock_event_device *evt); extern void clockevents_register_device(struct clock_event_device *dev); extern void clockevents_exchange_device(struct clock_event_device *old, diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 6d7020490f94..c215b74cd953 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1240,7 +1240,8 @@ hrtimer_interrupt_hanging(struct clock_event_device *dev, force_clock_reprogram = 1; dev->min_delta_ns = (unsigned long)try_time.tv64 * 3; printk(KERN_WARNING "hrtimer: interrupt too slow, " - "forcing clock min delta to %lu ns\n", dev->min_delta_ns); + "forcing clock min delta to %llu ns\n", + (unsigned long long) dev->min_delta_ns); } /* * High resolution timer interrupt diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 620b58abdc32..05e8aeedcdf3 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -37,10 +37,9 @@ static DEFINE_SPINLOCK(clockevents_lock); * * Math helper, returns latch value converted to nanoseconds (bound checked) */ -unsigned long clockevent_delta2ns(unsigned long latch, - struct clock_event_device *evt) +u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) { - u64 clc = ((u64) latch << evt->shift); + u64 clc = (u64) latch << evt->shift; if (unlikely(!evt->mult)) { evt->mult = 1; @@ -50,10 +49,10 @@ unsigned long clockevent_delta2ns(unsigned long latch, do_div(clc, evt->mult); if (clc < 1000) clc = 1000; - if (clc > LONG_MAX) - clc = LONG_MAX; + if (clc > KTIME_MAX) + clc = KTIME_MAX; - return (unsigned long) clc; + return clc; } EXPORT_SYMBOL_GPL(clockevent_delta2ns); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index a96c0e2b89cf..0a8a213016f0 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -50,9 +50,9 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, dev->min_delta_ns += dev->min_delta_ns >> 1; printk(KERN_WARNING - "CE: %s increasing min_delta_ns to %lu nsec\n", + "CE: %s increasing min_delta_ns to %llu nsec\n", dev->name ? dev->name : "?", - dev->min_delta_ns << 1); + (unsigned long long) dev->min_delta_ns << 1); i = 0; } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index fa00da108a14..665c76edbf17 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -204,8 +204,10 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) return; } SEQ_printf(m, "%s\n", dev->name); - SEQ_printf(m, " max_delta_ns: %lu\n", dev->max_delta_ns); - SEQ_printf(m, " min_delta_ns: %lu\n", dev->min_delta_ns); + SEQ_printf(m, " max_delta_ns: %llu\n", + (unsigned long long) dev->max_delta_ns); + SEQ_printf(m, " min_delta_ns: %llu\n", + (unsigned long long) dev->min_delta_ns); SEQ_printf(m, " mult: %u\n", dev->mult); SEQ_printf(m, " shift: %u\n", dev->shift); SEQ_printf(m, " mode: %d\n", dev->mode); -- cgit v1.2.3-59-g8ed1b From a362c638bdf052bf424bce7645d39b101090f6ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 14 Nov 2009 00:26:34 +0100 Subject: clocksource/events: Fix fallout of generic code changes powerpc grew a new warning due to the type change of clockevent->mult. The architectures which use parts of the generic time keeping infrastructure tripped over my wrong assumption that clocksource_register is only used when GENERIC_TIME=y. I should have looked and also I should have known better. These renitent Gaul villages are racking my nerves. Some serious deprecating is due. Signed-off-by: Thomas Gleixner --- arch/powerpc/kernel/time.c | 2 +- kernel/time/clocksource.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 92dc844299b6..60ceb2708948 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -905,7 +905,7 @@ static void register_decrementer_clockevent(int cpu) *dec = decrementer_clockevent; dec->cpumask = cpumask_of(cpu); - printk(KERN_DEBUG "clockevent: %s mult[%lx] shift[%d] cpu[%d]\n", + printk(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n", dec->name, dec->mult, dec->shift, cpu); clockevents_register_device(dec); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b65b242f04dd..72a2dcbd8b48 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -466,8 +466,6 @@ void clocksource_touch_watchdog(void) clocksource_resume_watchdog(); } -#ifdef CONFIG_GENERIC_TIME - /** * clocksource_max_deferment - Returns max time the clocksource can be deferred * @cs: Pointer to clocksource @@ -509,6 +507,8 @@ static u64 clocksource_max_deferment(struct clocksource *cs) return max_nsecs - (max_nsecs >> 5); } +#ifdef CONFIG_GENERIC_TIME + /** * clocksource_select - Select the best clocksource available * -- cgit v1.2.3-59-g8ed1b From 621a071f4323a5eb71ea4f289c3d8ce65a4758e4 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 16 Nov 2009 17:54:47 +1100 Subject: sparc: fix printk for change of variable type The clockevent mult field became a u32. Signed-off-by: Stephen Rothwell Cc: "David S. Miller" Cc: Peter Zijlstra LKML-Reference: <20091116180118.aa1bf1e4.sfr@canb.auug.org.au> Signed-off-by: Thomas Gleixner --- arch/sparc/kernel/time_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c index da1218e8ee87..63f73ae8a892 100644 --- a/arch/sparc/kernel/time_64.c +++ b/arch/sparc/kernel/time_64.c @@ -847,7 +847,7 @@ void __init time_init(void) sparc64_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &sparc64_clockevent); - printk("clockevent: mult[%lx] shift[%d]\n", + printk("clockevent: mult[%ux] shift[%d]\n", sparc64_clockevent.mult, sparc64_clockevent.shift); setup_sparc64_timer(); -- cgit v1.2.3-59-g8ed1b From 411462f62a65eeae7f451c6eb7a38b9d8759c61a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Nov 2009 11:52:39 +0100 Subject: x86: Fix printk format due to variable type change clockevents.mult became u32. Fix the printk format. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 894aa97f0717..cf4ee5195c5e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -662,7 +662,7 @@ static int __init calibrate_APIC_clock(void) calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); - apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", calibration_result); -- cgit v1.2.3-59-g8ed1b From 070e5c3f9989a72076e83fdd5ede3f0f3eb17264 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Nov 2009 12:27:47 +0100 Subject: x86: vmiclock: Fix printk format clockevents.mult became u32. Fix the printk format. Pointed-out-by: Randy Dunlap Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vmiclock_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 611b9e2360d3..74c92bb194df 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void) evt->min_delta_ns = clockevent_delta2ns(1, evt); evt->cpumask = cpumask_of(cpu); - printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", + printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n", evt->name, evt->mult, evt->shift); clockevents_register_device(evt); } -- cgit v1.2.3-59-g8ed1b From 8e1a928a2ed7e8d5cad97c8e985294b4caedd168 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 16 Oct 2009 18:19:01 -0400 Subject: clockevents: Add missing include to pacify sparse Include "tick-internal.h" in order to pick up the extern function prototype for clockevents_shutdown(). This quiets the following sparse build noise: warning: symbol 'clockevents_shutdown' was not declared. Should it be static? Signed-off-by: H Hartley Sweeten LKML-Reference: Reviewed-by: Yong Zhang Cc: johnstul@us.ibm.com Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 05e8aeedcdf3..20a8920029ee 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -20,6 +20,8 @@ #include #include +#include "tick-internal.h" + /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); -- cgit v1.2.3-59-g8ed1b From ba5ea951d0b5e5896180e21fe07f228d2b3b0e63 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 17 Nov 2009 14:14:13 -0800 Subject: posix-cpu-timers: optimize and document timer_create callback We have already new_timer initialized to all-zeros hence in function initializations are not needed. Document function expectation about new_timer argument as well. Signed-off-by: Stanislaw Gruszka Cc: johnstul@us.ibm.com Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-cpu-timers.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 5c9dc228747b..438ff4523513 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -384,7 +384,8 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) /* * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. - * This is called from sys_timer_create with the new timer already locked. + * This is called from sys_timer_create() and do_cpu_nanosleep() with the + * new timer already all-zeros initialized. */ int posix_cpu_timer_create(struct k_itimer *new_timer) { @@ -396,8 +397,6 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) return -EINVAL; INIT_LIST_HEAD(&new_timer->it.cpu.entry); - new_timer->it.cpu.incr.sched = 0; - new_timer->it.cpu.expires.sched = 0; read_lock(&tasklist_lock); if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { -- cgit v1.2.3-59-g8ed1b From feae3203d711db0a9965300ee6d592257fdaae4f Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 17 Nov 2009 18:22:13 -0600 Subject: timers, init: Limit the number of per cpu calibration bootup messages Limit the number of per cpu calibration messages by only printing out results for the first cpu to boot. Also, don't print "CPUx is down" as this is expected, and we don't need 4096 reminders... ;-) Signed-off-by: Mike Travis Cc: Heiko Carstens Cc: Roland Dreier Cc: Randy Dunlap Cc: Tejun Heo Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Yinghai Lu Cc: David Rientjes Cc: Steven Rostedt Cc: Rusty Russell Cc: Hidetoshi Seto Cc: Jack Steiner Cc: Frederic Weisbecker LKML-Reference: <20091118002219.889552000@alcatraz.americas.sgi.com> Signed-off-by: Ingo Molnar --- init/calibrate.c | 24 +++++++++++++++--------- kernel/cpu.c | 5 ++--- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/init/calibrate.c b/init/calibrate.c index a379c9061199..6eb48e53d61c 100644 --- a/init/calibrate.c +++ b/init/calibrate.c @@ -123,23 +123,26 @@ void __cpuinit calibrate_delay(void) { unsigned long ticks, loopbit; int lps_precision = LPS_PREC; + static bool printed; if (preset_lpj) { loops_per_jiffy = preset_lpj; - printk(KERN_INFO - "Calibrating delay loop (skipped) preset value.. "); - } else if ((smp_processor_id() == 0) && lpj_fine) { + if (!printed) + pr_info("Calibrating delay loop (skipped) " + "preset value.. "); + } else if ((!printed) && lpj_fine) { loops_per_jiffy = lpj_fine; - printk(KERN_INFO - "Calibrating delay loop (skipped), " + pr_info("Calibrating delay loop (skipped), " "value calculated using timer frequency.. "); } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { - printk(KERN_INFO - "Calibrating delay using timer specific routine.. "); + if (!printed) + pr_info("Calibrating delay using timer " + "specific routine.. "); } else { loops_per_jiffy = (1<<12); - printk(KERN_INFO "Calibrating delay loop... "); + if (!printed) + pr_info("Calibrating delay loop... "); while ((loops_per_jiffy <<= 1) != 0) { /* wait for "start of" clock tick */ ticks = jiffies; @@ -170,7 +173,10 @@ void __cpuinit calibrate_delay(void) loops_per_jiffy &= ~loopbit; } } - printk(KERN_CONT "%lu.%02lu BogoMIPS (lpj=%lu)\n", + if (!printed) + pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n", loops_per_jiffy/(500000/HZ), (loops_per_jiffy/(5000/HZ)) % 100, loops_per_jiffy); + + printed = true; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 6ba0f1ecb212..7c4e2713df0a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -392,10 +392,9 @@ int disable_nonboot_cpus(void) if (cpu == first_cpu) continue; error = _cpu_down(cpu, 1); - if (!error) { + if (!error) cpumask_set_cpu(cpu, frozen_cpus); - printk("CPU%d is down\n", cpu); - } else { + else { printk(KERN_ERR "Error taking CPU%d down: %d\n", cpu, error); break; -- cgit v1.2.3-59-g8ed1b