From 866e26115cba6b59cec669b6307599e3e4440491 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 20 Jul 2010 15:23:15 -0700
Subject: timers: Document meaning of deferrable timer

Steal some text from 6e453a67510 "Add support for deferrable timers".  A
reader shouldn't have to dig through the git logs for the basic
description of a deferrable timer.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Cc: johnstul@us.ibm.com
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/timer.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/timer.c b/kernel/timer.c
index ee305c8d4e18..ce98685cd1cb 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
 
 /*
  * Note that all tvec_bases are 2 byte aligned and lower bit of
- * base in timer_list is guaranteed to be zero. Use the LSB for
- * the new flag to indicate whether the timer is deferrable
+ * base in timer_list is guaranteed to be zero. Use the LSB to
+ * indicate whether the timer is deferrable.
+ *
+ * A deferrable timer will work normally when the system is busy, but
+ * will not cause a CPU to come out of idle just to service it; instead,
+ * the timer will be serviced when the CPU eventually wakes up with a
+ * subsequent non-deferrable timer.
  */
 #define TBASE_DEFERRABLE_FLAG		(0x1)
 
-- 
cgit v1.2.3-59-g8ed1b


From 22b8f15c2f7130bb0386f548428df2ffd4e81903 Mon Sep 17 00:00:00 2001
From: Patrick Pannuto <ppannuto@codeaurora.org>
Date: Mon, 19 Jul 2010 15:09:26 -0700
Subject: timer: Added usleep[_range] timer

usleep[_range] are finer precision implementations of msleep
and are designed to be drop-in replacements for udelay where
a precise sleep / busy-wait is unnecessary. They also allow
an easy interface to specify slack when a precise (ish)
wakeup is unnecessary to help minimize wakeups

Signed-off-by: Patrick Pannuto <ppannuto@codeaurora.org>
Cc: akinobu.mita@gmail.com
Cc: sboyd@codeaurora.org
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
LKML-Reference: <4C44CDD2.1070708@codeaurora.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/delay.h |  6 ++++++
 kernel/timer.c        | 22 ++++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/include/linux/delay.h b/include/linux/delay.h
index fd832c6d419e..0e303d1aacd8 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -45,6 +45,12 @@ extern unsigned long lpj_fine;
 void calibrate_delay(void);
 void msleep(unsigned int msecs);
 unsigned long msleep_interruptible(unsigned int msecs);
+void usleep_range(unsigned long min, unsigned long max);
+
+static inline void usleep(unsigned long usecs)
+{
+	usleep_range(usecs, usecs);
+}
 
 static inline void ssleep(unsigned int seconds)
 {
diff --git a/kernel/timer.c b/kernel/timer.c
index ce98685cd1cb..f110f241ab66 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1755,3 +1755,25 @@ unsigned long msleep_interruptible(unsigned int msecs)
 }
 
 EXPORT_SYMBOL(msleep_interruptible);
+
+static int __sched do_usleep_range(unsigned long min, unsigned long max)
+{
+	ktime_t kmin;
+	unsigned long delta;
+
+	kmin = ktime_set(0, min * NSEC_PER_USEC);
+	delta = max - min;
+	return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
+}
+
+/**
+ * usleep_range - Drop in replacement for udelay where wakeup is flexible
+ * @min: Minimum time in usecs to sleep
+ * @max: Maximum time in usecs to sleep
+ */
+void usleep_range(unsigned long min, unsigned long max)
+{
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	do_usleep_range(min, max);
+}
+EXPORT_SYMBOL(usleep_range);
-- 
cgit v1.2.3-59-g8ed1b


From 2b08de0073a5697cf84d6f448d6dbc6cf02fc6b5 Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Tue, 20 Jul 2010 15:23:14 -0700
Subject: posix_timer: Move copy_to_user(created_timer_id) down in
 timer_create()

According to Oleg Nesterov:
We can move copy_to_user(created_timer_id) down after
"if (timer_event_spec)" block too. (but before CLOCK_DISPATCH(),
of course).

Signed-off-by: Andrey Vagin <avagin@openvz.org>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Andrey Vagin <avagin@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/posix-timers.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ad723420acc3..9ca4973f736d 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -560,11 +560,6 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
 	new_timer->it_clock = which_clock;
 	new_timer->it_overrun = -1;
 
-	if (copy_to_user(created_timer_id,
-			 &new_timer_id, sizeof (new_timer_id))) {
-		error = -EFAULT;
-		goto out;
-	}
 	if (timer_event_spec) {
 		if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
 			error = -EFAULT;
@@ -590,6 +585,12 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
 	new_timer->sigq->info.si_tid   = new_timer->it_id;
 	new_timer->sigq->info.si_code  = SI_TIMER;
 
+	if (copy_to_user(created_timer_id,
+			 &new_timer_id, sizeof (new_timer_id))) {
+		error = -EFAULT;
+		goto out;
+	}
+
 	error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
 	if (error)
 		goto out;
-- 
cgit v1.2.3-59-g8ed1b


From af5ab277ded04bd9bc6b048c5a2f0e7d70ef0867 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Tue, 27 Jul 2010 21:02:10 -0700
Subject: clockevents: Remove the per cpu tick skew

Historically, Linux has tried to make the regular timer tick on the
various CPUs not happen at the same time, to avoid contention on
xtime_lock.

Nowadays, with the tickless kernel, this contention no longer happens
since time keeping and updating are done differently. In addition,
this skew is actually hurting power consumption in a measurable way on
many-core systems.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
LKML-Reference: <20100727210210.58d3118c@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-sched.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 813993b5fb61..74644ccd786c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -780,7 +780,6 @@ void tick_setup_sched_timer(void)
 {
 	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 	ktime_t now = ktime_get();
-	u64 offset;
 
 	/*
 	 * Emulate tick processing via per-CPU hrtimers:
@@ -790,10 +789,6 @@ void tick_setup_sched_timer(void)
 
 	/* Get the next period (per cpu) */
 	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
-	offset = ktime_to_ns(tick_period) >> 1;
-	do_div(offset, num_possible_cpus());
-	offset *= smp_processor_id();
-	hrtimer_add_expires_ns(&ts->sched_timer, offset);
 
 	for (;;) {
 		hrtimer_forward(&ts->sched_timer, now, tick_period);
-- 
cgit v1.2.3-59-g8ed1b


From e1b004c3ef9c59db5f013528628b51c8653155ec Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 4 Aug 2010 10:53:00 +0200
Subject: Revert "timer: Added usleep[_range] timer"

This reverts commit 22b8f15c2f7130bb0386f548428df2ffd4e81903 to merge
an advanced version.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/delay.h |  6 ------
 kernel/timer.c        | 22 ----------------------
 2 files changed, 28 deletions(-)

diff --git a/include/linux/delay.h b/include/linux/delay.h
index 0e303d1aacd8..fd832c6d419e 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -45,12 +45,6 @@ extern unsigned long lpj_fine;
 void calibrate_delay(void);
 void msleep(unsigned int msecs);
 unsigned long msleep_interruptible(unsigned int msecs);
-void usleep_range(unsigned long min, unsigned long max);
-
-static inline void usleep(unsigned long usecs)
-{
-	usleep_range(usecs, usecs);
-}
 
 static inline void ssleep(unsigned int seconds)
 {
diff --git a/kernel/timer.c b/kernel/timer.c
index f110f241ab66..ce98685cd1cb 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1755,25 +1755,3 @@ unsigned long msleep_interruptible(unsigned int msecs)
 }
 
 EXPORT_SYMBOL(msleep_interruptible);
-
-static int __sched do_usleep_range(unsigned long min, unsigned long max)
-{
-	ktime_t kmin;
-	unsigned long delta;
-
-	kmin = ktime_set(0, min * NSEC_PER_USEC);
-	delta = max - min;
-	return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
-}
-
-/**
- * usleep_range - Drop in replacement for udelay where wakeup is flexible
- * @min: Minimum time in usecs to sleep
- * @max: Maximum time in usecs to sleep
- */
-void usleep_range(unsigned long min, unsigned long max)
-{
-	__set_current_state(TASK_UNINTERRUPTIBLE);
-	do_usleep_range(min, max);
-}
-EXPORT_SYMBOL(usleep_range);
-- 
cgit v1.2.3-59-g8ed1b


From 5e7f5a178bba45c5aca3448fddecabd4e28f1f6b Mon Sep 17 00:00:00 2001
From: Patrick Pannuto <ppannuto@codeaurora.org>
Date: Mon, 2 Aug 2010 15:01:04 -0700
Subject: timer: Added usleep_range timer

usleep_range is a finer precision implementations of msleep
and is designed to be a drop-in replacement for udelay where
a precise sleep / busy-wait is unnecessary.

Since an easy interface to hrtimers could lead to an undesired
proliferation of interrupts, we provide only a "range" API,
forcing the caller to think about an acceptable tolerance on
both ends and hopefully avoiding introducing another interrupt.

INTRO

As discussed here ( http://lkml.org/lkml/2007/8/3/250 ), msleep(1) is not
precise enough for many drivers (yes, sleep precision is an unfair notion,
but consistently sleeping for ~an order of magnitude greater than requested
is worth fixing). This patch adds a usleep API so that udelay does not have
to be used. Obviously not every udelay can be replaced (those in atomic
contexts or being used for simple bitbanging come to mind), but there are
many, many examples of

mydriver_write(...)
/* Wait for hardware to latch */
udelay(100)

in various drivers where a busy-wait loop is neither beneficial nor
necessary, but msleep simply does not provide enough precision and people
are using a busy-wait loop instead.

CONCERNS FROM THE RFC

Why is udelay a problem / necessary? Most callers of udelay are in device/
driver initialization code, which is serial...

	As I see it, there is only benefit to sleeping over a delay; the
	notion of "refactoring" areas that use udelay was presented, but
	I see usleep as the refactoring. Consider i2c, if the bus is busy,
	you need to wait a bit (say 100us) before trying again, your
	current options are:

		* udelay(100)
		* msleep(1) <-- As noted above, actually as high as ~20ms
				on some platforms, so not really an option
		* Manually set up an hrtimer to try again in 100us (which
		  is what usleep does anyway...)

	People choose the udelay route because it is EASY; we need to
	provide a better easy route.

	Device / driver / boot code is *currently* serial, but every few
	months someone makes noise about parallelizing boot, and IMHO, a
	little forward-thinking now is one less thing to worry about
	if/when that ever happens

udelay's could be preempted

	Sure, but if udelay plans on looping 1000 times, and it gets
	preempted on loop 200, whenever it's scheduled again, it is
	going to do the next 800 loops.

Is the interruptible case needed?

	Probably not, but I see usleep as a very logical parallel to msleep,
	so it made sense to include the "full" API. Processors are getting
	faster (albeit not as quickly as they are becoming more parallel),
	so if someone wanted to be interruptible for a few usecs, why not
	let them? If this is a contentious point, I'm happy to remove it.

OTHER THOUGHTS

I believe there is also value in exposing the usleep_range option; it gives
the scheduler a lot more flexibility and allows the programmer to express
his intent much more clearly; it's something I would hope future driver
writers will take advantage of.

To get the results in the NUMBERS section below, I literally s/udelay/usleep
the kernel tree; I had to go in and undo the changes to the USB drivers, but
everything else booted successfully; I find that extremely telling in and
of itself -- many people are using a delay API where a sleep will suit them
just fine.

SOME ATTEMPTS AT NUMBERS

It turns out that calculating quantifiable benefit on this is challenging,
so instead I will simply present the current state of things, and I hope
this to be sufficient:

How many udelay calls are there in 2.6.35-rc5?

	udealy(ARG) >=	| COUNT
	1000		| 319
	500		| 414
	100		| 1146
	20		| 1832

I am working on Android, so that is my focus for this. The following table
is a modified usleep that simply printk's the amount of time requested to
sleep; these tests were run on a kernel with udelay >= 20 --> usleep

"boot" is power-on to lock screen
"power collapse" is when the power button is pushed and the device suspends
"resume" is when the power button is pushed and the lock screen is displayed
         (no touchscreen events or anything, just turning on the display)
"use device" is from the unlock swipe to clicking around a bit; there is no
	sd card in this phone, so fail loading music, video, camera

	ACTION		| TOTAL NUMBER OF USLEEP CALLS	| NET TIME (us)
	boot		| 22				| 1250
	power-collapse	| 9				| 1200
	resume		| 5				| 500
	use device	| 59				| 7700

The most interesting category to me is the "use device" field; 7700us of
busy-wait time that could be put towards better responsiveness, or at the
least less power usage.

Signed-off-by: Patrick Pannuto <ppannuto@codeaurora.org>
Cc: apw@canonical.com
Cc: corbet@lwn.net
Cc: arjan@linux.intel.com
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/delay.h |  1 +
 kernel/timer.c        | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/include/linux/delay.h b/include/linux/delay.h
index fd832c6d419e..a6ecb34cf547 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -45,6 +45,7 @@ extern unsigned long lpj_fine;
 void calibrate_delay(void);
 void msleep(unsigned int msecs);
 unsigned long msleep_interruptible(unsigned int msecs);
+void usleep_range(unsigned long min, unsigned long max);
 
 static inline void ssleep(unsigned int seconds)
 {
diff --git a/kernel/timer.c b/kernel/timer.c
index ce98685cd1cb..723a62e86dcb 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1755,3 +1755,25 @@ unsigned long msleep_interruptible(unsigned int msecs)
 }
 
 EXPORT_SYMBOL(msleep_interruptible);
+
+static int __sched do_usleep_range(unsigned long min, unsigned long max)
+{
+	ktime_t kmin;
+	unsigned long delta;
+
+	kmin = ktime_set(0, min * NSEC_PER_USEC);
+	delta = (max - min) * NSEC_PER_USEC;
+	return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
+}
+
+/**
+ * usleep_range - Drop in replacement for udelay where wakeup is flexible
+ * @min: Minimum time in usecs to sleep
+ * @max: Maximum time in usecs to sleep
+ */
+void usleep_range(unsigned long min, unsigned long max)
+{
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	do_usleep_range(min, max);
+}
+EXPORT_SYMBOL(usleep_range);
-- 
cgit v1.2.3-59-g8ed1b


From 0fcb80818bc3ade5befd409051089f710adcf7b0 Mon Sep 17 00:00:00 2001
From: Patrick Pannuto <ppannuto@codeaurora.org>
Date: Mon, 2 Aug 2010 15:01:05 -0700
Subject: Documentation: Add timers/timers-howto.txt

This file seeks to explain the nuances in various delays;
many driver writers are not necessarily familiar with the
various kernel timers, their shortfalls, and quirks. When
faced with

ndelay, udelay, mdelay, usleep_range, msleep, and msleep_interrubtible

the question "How do I just wait 1 ms for my hardware to
latch?" has the non-intuitive "best" answer:
	usleep_range(1000,1500)

This patch is followed by a series of checkpatch additions
that seek to help kernel hackers pick the best delay.

Signed-off-by: Patrick Pannuto <ppannuto@codeaurora.org>
Cc: apw@canonical.com
Cc: corbet@lwn.net
Cc: arjan@linux.intel.com
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1280786467-26999-3-git-send-email-ppannuto@codeaurora.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/timers/timers-howto.txt | 105 ++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 Documentation/timers/timers-howto.txt

diff --git a/Documentation/timers/timers-howto.txt b/Documentation/timers/timers-howto.txt
new file mode 100644
index 000000000000..c9ef29d2ede3
--- /dev/null
+++ b/Documentation/timers/timers-howto.txt
@@ -0,0 +1,105 @@
+delays - Information on the various kernel delay / sleep mechanisms
+-------------------------------------------------------------------
+
+This document seeks to answer the common question: "What is the
+RightWay (TM) to insert a delay?"
+
+This question is most often faced by driver writers who have to
+deal with hardware delays and who may not be the most intimately
+familiar with the inner workings of the Linux Kernel.
+
+
+Inserting Delays
+----------------
+
+The first, and most important, question you need to ask is "Is my
+code in an atomic context?"  This should be followed closely by "Does
+it really need to delay in atomic context?" If so...
+
+ATOMIC CONTEXT:
+	You must use the *delay family of functions. These
+	functions use the jiffie estimation of clock speed
+	and will busy wait for enough loop cycles to achieve
+	the desired delay:
+
+	ndelay(unsigned long nsecs)
+	udelay(unsigned long usecs)
+	mdelay(unsgined long msecs)
+
+	udelay is the generally preferred API; ndelay-level
+	precision may not actually exist on many non-PC devices.
+
+	mdelay is macro wrapper around udelay, to account for
+	possible overflow when passing large arguments to udelay.
+	In general, use of mdelay is discouraged and code should
+	be refactored to allow for the use of msleep.
+
+NON-ATOMIC CONTEXT:
+	You should use the *sleep[_range] family of functions.
+	There are a few more options here, while any of them may
+	work correctly, using the "right" sleep function will
+	help the scheduler, power management, and just make your
+	driver better :)
+
+	-- Backed by busy-wait loop:
+		udelay(unsigned long usecs)
+	-- Backed by hrtimers:
+		usleep_range(unsigned long min, unsigned long max)
+	-- Backed by jiffies / legacy_timers
+		msleep(unsigned long msecs)
+		msleep_interruptible(unsigned long msecs)
+
+	Unlike the *delay family, the underlying mechanism
+	driving each of these calls varies, thus there are
+	quirks you should be aware of.
+
+
+	SLEEPING FOR "A FEW" USECS ( < ~10us? ):
+		* Use udelay
+
+		- Why not usleep?
+			On slower systems, (embedded, OR perhaps a speed-
+			stepped PC!) the overhead of setting up the hrtimers
+			for usleep *may* not be worth it. Such an evaluation
+			will obviously depend on your specific situation, but
+			it is something to be aware of.
+
+	SLEEPING FOR ~USECS OR SMALL MSECS ( 10us - 20ms):
+		* Use usleep_range
+
+		- Why not msleep for (1ms - 20ms)?
+			Explained originally here:
+				http://lkml.org/lkml/2007/8/3/250
+			msleep(1~20) may not do what the caller intends, and
+			will often sleep longer (~20 ms actual sleep for any
+			value given in the 1~20ms range). In many cases this
+			is not the desired behavior.
+
+		- Why is there no "usleep" / What is a good range?
+			Since usleep_range is built on top of hrtimers, the
+			wakeup will be very precise (ish), thus a simple
+			usleep function would likely introduce a large number
+			of undesired interrupts.
+
+			With the introduction of a range, the scheduler is
+			free to coalesce your wakeup with any other wakeup
+			that may have happened for other reasons, or at the
+			worst case, fire an interrupt for your upper bound.
+
+			The larger a range you supply, the greater a chance
+			that you will not trigger an interrupt; this should
+			be balanced with what is an acceptable upper bound on
+			delay / performance for your specific code path. Exact
+			tolerances here are very situation specific, thus it
+			is left to the caller to determine a reasonable range.
+
+	SLEEPING FOR LARGER MSECS ( 10ms+ )
+		* Use msleep or possibly msleep_interruptible
+
+		- What's the difference?
+			msleep sets the current task to TASK_UNINTERRUPTIBLE
+			whereas msleep_interruptible sets the current task to
+			TASK_INTERRUPTIBLE before scheduling the sleep. In
+			short, the difference is whether the sleep can be ended
+			early by a signal. In general, just use msleep unless
+			you know you have a need for the interruptible variant.
-- 
cgit v1.2.3-59-g8ed1b