aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2011-01-05 14:14:42 +0100
committerIngo Molnar <mingo@elte.hu>2011-01-05 14:14:46 +0100
commit27066fd484a32c80630136aa2b91c980f3198f9d (patch)
tree78ddabdedbfd7525d13ecd62a745525843f1d0e8 /kernel
parentsched, autogroup: Fix reference leak (diff)
parentLinux 2.6.37 (diff)
downloadlinux-dev-27066fd484a32c80630136aa2b91c980f3198f9d.tar.xz
linux-dev-27066fd484a32c80630136aa2b91c980f3198f9d.zip
Merge commit 'v2.6.37' into sched/core
Merge reason: Merge the final .37 tree. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/kthread.c11
-rw-r--r--kernel/perf_event.c37
-rw-r--r--kernel/power/swap.c2
-rw-r--r--kernel/power/user.c2
-rw-r--r--kernel/resource.c104
-rw-r--r--kernel/sched.c287
-rw-r--r--kernel/taskstats.c57
-rw-r--r--kernel/timer.c8
-rw-r--r--kernel/trace/ring_buffer.c9
-rw-r--r--kernel/trace/trace.c10
-rw-r--r--kernel/user.c1
-rw-r--r--kernel/watchdog.c3
13 files changed, 361 insertions, 171 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index 067244495966..7d164e25b0f0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -275,6 +275,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
+ clear_tsk_need_resched(tsk);
stackend = end_of_stack(tsk);
*stackend = STACK_END_MAGIC; /* for overflow detection */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 74cf6f5e7ade..5355cfd44a3f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -265,6 +265,17 @@ int kthreadd(void *unused)
return 0;
}
+void __init_kthread_worker(struct kthread_worker *worker,
+ const char *name,
+ struct lock_class_key *key)
+{
+ spin_lock_init(&worker->lock);
+ lockdep_set_class_and_name(&worker->lock, key, name);
+ INIT_LIST_HEAD(&worker->work_list);
+ worker->task = NULL;
+}
+EXPORT_SYMBOL_GPL(__init_kthread_worker);
+
/**
* kthread_worker_fn - kthread function to process kthread_worker
* @worker_ptr: pointer to initialized kthread_worker
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index eac7e3364335..2870feee81dd 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3824,6 +3824,8 @@ static void perf_event_task_event(struct perf_task_event *task_event)
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->active_pmu != pmu)
+ goto next;
perf_event_task_ctx(&cpuctx->ctx, task_event);
ctx = task_event->task_ctx;
@@ -3959,6 +3961,8 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->active_pmu != pmu)
+ goto next;
perf_event_comm_ctx(&cpuctx->ctx, comm_event);
ctxn = pmu->task_ctx_nr;
@@ -4144,6 +4148,8 @@ got_name:
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->active_pmu != pmu)
+ goto next;
perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
vma->vm_flags & VM_EXEC);
@@ -4713,7 +4719,7 @@ static int perf_swevent_init(struct perf_event *event)
break;
}
- if (event_id > PERF_COUNT_SW_MAX)
+ if (event_id >= PERF_COUNT_SW_MAX)
return -ENOENT;
if (!event->parent) {
@@ -5145,20 +5151,36 @@ static void *find_pmu_context(int ctxn)
return NULL;
}
-static void free_pmu_context(void * __percpu cpu_context)
+static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
{
- struct pmu *pmu;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;
+
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+
+ if (cpuctx->active_pmu == old_pmu)
+ cpuctx->active_pmu = pmu;
+ }
+}
+
+static void free_pmu_context(struct pmu *pmu)
+{
+ struct pmu *i;
mutex_lock(&pmus_lock);
/*
* Like a real lame refcount.
*/
- list_for_each_entry(pmu, &pmus, entry) {
- if (pmu->pmu_cpu_context == cpu_context)
+ list_for_each_entry(i, &pmus, entry) {
+ if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
+ update_pmu_context(i, pmu);
goto out;
+ }
}
- free_percpu(cpu_context);
+ free_percpu(pmu->pmu_cpu_context);
out:
mutex_unlock(&pmus_lock);
}
@@ -5190,6 +5212,7 @@ int perf_pmu_register(struct pmu *pmu)
cpuctx->ctx.pmu = pmu;
cpuctx->jiffies_interval = 1;
INIT_LIST_HEAD(&cpuctx->rotation_list);
+ cpuctx->active_pmu = pmu;
}
got_cpu_context:
@@ -5241,7 +5264,7 @@ void perf_pmu_unregister(struct pmu *pmu)
synchronize_rcu();
free_percpu(pmu->pmu_disable_count);
- free_pmu_context(pmu->pmu_cpu_context);
+ free_pmu_context(pmu);
}
struct pmu *perf_init_event(struct perf_event *event)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index baf667bb2794..8c7e4832b9be 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -30,7 +30,7 @@
#include "power.h"
-#define HIBERNATE_SIG "LINHIB0001"
+#define HIBERNATE_SIG "S1SUSPEND"
/*
* The swap map is a data structure used for keeping track of each page
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 1b2ea31e6bd8..c36c3b9e8a84 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -137,7 +137,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
free_all_swap_pages(data->swap);
if (data->frozen)
thaw_processes();
- pm_notifier_call_chain(data->mode == O_WRONLY ?
+ pm_notifier_call_chain(data->mode == O_RDONLY ?
PM_POST_HIBERNATION : PM_POST_RESTORE);
atomic_inc(&snapshot_device_available);
diff --git a/kernel/resource.c b/kernel/resource.c
index 9fad33efd0db..798e2fae2a06 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -40,23 +40,6 @@ EXPORT_SYMBOL(iomem_resource);
static DEFINE_RWLOCK(resource_lock);
-/*
- * By default, we allocate free space bottom-up. The architecture can request
- * top-down by clearing this flag. The user can override the architecture's
- * choice with the "resource_alloc_from_bottom" kernel boot option, but that
- * should only be a debugging tool.
- */
-int resource_alloc_from_bottom = 1;
-
-static __init int setup_alloc_from_bottom(char *s)
-{
- printk(KERN_INFO
- "resource: allocating from bottom-up; please report a bug\n");
- resource_alloc_from_bottom = 1;
- return 0;
-}
-early_param("resource_alloc_from_bottom", setup_alloc_from_bottom);
-
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
struct resource *p = v;
@@ -374,6 +357,10 @@ int __weak page_is_ram(unsigned long pfn)
return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
}
+void __weak arch_remove_reservations(struct resource *avail)
+{
+}
+
static resource_size_t simple_align_resource(void *data,
const struct resource *avail,
resource_size_t size,
@@ -397,74 +384,7 @@ static bool resource_contains(struct resource *res1, struct resource *res2)
}
/*
- * Find the resource before "child" in the sibling list of "root" children.
- */
-static struct resource *find_sibling_prev(struct resource *root, struct resource *child)
-{
- struct resource *this;
-
- for (this = root->child; this; this = this->sibling)
- if (this->sibling == child)
- return this;
-
- return NULL;
-}
-
-/*
* Find empty slot in the resource tree given range and alignment.
- * This version allocates from the end of the root resource first.
- */
-static int find_resource_from_top(struct resource *root, struct resource *new,
- resource_size_t size, resource_size_t min,
- resource_size_t max, resource_size_t align,
- resource_size_t (*alignf)(void *,
- const struct resource *,
- resource_size_t,
- resource_size_t),
- void *alignf_data)
-{
- struct resource *this;
- struct resource tmp, avail, alloc;
-
- tmp.start = root->end;
- tmp.end = root->end;
-
- this = find_sibling_prev(root, NULL);
- for (;;) {
- if (this) {
- if (this->end < root->end)
- tmp.start = this->end + 1;
- } else
- tmp.start = root->start;
-
- resource_clip(&tmp, min, max);
-
- /* Check for overflow after ALIGN() */
- avail = *new;
- avail.start = ALIGN(tmp.start, align);
- avail.end = tmp.end;
- if (avail.start >= tmp.start) {
- alloc.start = alignf(alignf_data, &avail, size, align);
- alloc.end = alloc.start + size - 1;
- if (resource_contains(&avail, &alloc)) {
- new->start = alloc.start;
- new->end = alloc.end;
- return 0;
- }
- }
-
- if (!this || this->start == root->start)
- break;
-
- tmp.end = this->start - 1;
- this = find_sibling_prev(root, this);
- }
- return -EBUSY;
-}
-
-/*
- * Find empty slot in the resource tree given range and alignment.
- * This version allocates from the beginning of the root resource first.
*/
static int find_resource(struct resource *root, struct resource *new,
resource_size_t size, resource_size_t min,
@@ -478,23 +398,24 @@ static int find_resource(struct resource *root, struct resource *new,
struct resource *this = root->child;
struct resource tmp = *new, avail, alloc;
+ tmp.flags = new->flags;
tmp.start = root->start;
/*
- * Skip past an allocated resource that starts at 0, since the
- * assignment of this->start - 1 to tmp->end below would cause an
- * underflow.
+ * Skip past an allocated resource that starts at 0, since the assignment
+ * of this->start - 1 to tmp->end below would cause an underflow.
*/
if (this && this->start == 0) {
tmp.start = this->end + 1;
this = this->sibling;
}
- for (;;) {
+ for(;;) {
if (this)
tmp.end = this->start - 1;
else
tmp.end = root->end;
resource_clip(&tmp, min, max);
+ arch_remove_reservations(&tmp);
/* Check for overflow after ALIGN() */
avail = *new;
@@ -509,10 +430,8 @@ static int find_resource(struct resource *root, struct resource *new,
return 0;
}
}
-
if (!this)
break;
-
tmp.start = this->end + 1;
this = this->sibling;
}
@@ -545,10 +464,7 @@ int allocate_resource(struct resource *root, struct resource *new,
alignf = simple_align_resource;
write_lock(&resource_lock);
- if (resource_alloc_from_bottom)
- err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
- else
- err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data);
+ err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
if (err >= 0 && __request_resource(root, new))
err = -EBUSY;
write_unlock(&resource_lock);
diff --git a/kernel/sched.c b/kernel/sched.c
index 9f9dd8dda53c..f2f914e0c47c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -642,22 +642,18 @@ static inline struct task_group *task_group(struct task_struct *p)
#endif /* CONFIG_CGROUP_SCHED */
-static u64 irq_time_cpu(int cpu);
-static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+static void update_rq_clock_task(struct rq *rq, s64 delta);
-inline void update_rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)
{
- if (!rq->skip_clock_update) {
- int cpu = cpu_of(rq);
- u64 irq_time;
+ s64 delta;
- rq->clock = sched_clock_cpu(cpu);
- irq_time = irq_time_cpu(cpu);
- if (rq->clock - irq_time > rq->clock_task)
- rq->clock_task = rq->clock - irq_time;
+ if (rq->skip_clock_update)
+ return;
- sched_irq_time_avg_update(rq, irq_time);
- }
+ delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ rq->clock += delta;
+ update_rq_clock_task(rq, delta);
}
/*
@@ -1795,10 +1791,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
* They are read and saved off onto struct rq in update_rq_clock().
* This may result in other CPU reading this CPU's irq time and can
* race with irq/account_system_vtime on this CPU. We would either get old
- * or new value (or semi updated value on 32 bit) with a side effect of
- * accounting a slice of irq time to wrong task when irq is in progress
- * while we read rq->clock. That is a worthy compromise in place of having
- * locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
*/
static DEFINE_PER_CPU(u64, cpu_hardirq_time);
static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1816,19 +1811,58 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}
-static u64 irq_time_cpu(int cpu)
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
{
- if (!sched_clock_irqtime)
- return 0;
+ __this_cpu_inc(irq_time_seq.sequence);
+ smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+ smp_wmb();
+ __this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+ u64 irq_time;
+ unsigned seq;
+ do {
+ seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+ irq_time = per_cpu(cpu_softirq_time, cpu) +
+ per_cpu(cpu_hardirq_time, cpu);
+ } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+ return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
}
+#endif /* CONFIG_64BIT */
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
void account_system_vtime(struct task_struct *curr)
{
unsigned long flags;
+ s64 delta;
int cpu;
- u64 now, delta;
if (!sched_clock_irqtime)
return;
@@ -1836,9 +1870,10 @@ void account_system_vtime(struct task_struct *curr)
local_irq_save(flags);
cpu = smp_processor_id();
- now = sched_clock_cpu(cpu);
- delta = now - per_cpu(irq_start_time, cpu);
- per_cpu(irq_start_time, cpu) = now;
+ delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ __this_cpu_add(irq_start_time, delta);
+
+ irq_time_write_begin();
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
@@ -1846,33 +1881,55 @@ void account_system_vtime(struct task_struct *curr)
* that do not consume any time, but still wants to run.
*/
if (hardirq_count())
- per_cpu(cpu_hardirq_time, cpu) += delta;
+ __this_cpu_add(cpu_hardirq_time, delta);
else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
- per_cpu(cpu_softirq_time, cpu) += delta;
+ __this_cpu_add(cpu_softirq_time, delta);
+ irq_time_write_end();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(account_system_vtime);
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
{
- if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
- u64 delta_irq = curr_irq_time - rq->prev_irq_time;
- rq->prev_irq_time = curr_irq_time;
- sched_rt_avg_update(rq, delta_irq);
- }
+ s64 irq_delta;
+
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}irq region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}irq
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
+
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+ rq->clock_task += delta;
+
+ if (irq_delta && sched_feat(NONIRQ_POWER))
+ sched_rt_avg_update(rq, irq_delta);
}
-#else
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static u64 irq_time_cpu(int cpu)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
{
- return 0;
+ rq->clock_task += delta;
}
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
-
-#endif
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#include "sched_idletask.c"
#include "sched_fair.c"
@@ -2001,7 +2058,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
- if (test_tsk_need_resched(rq->curr))
+ if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
rq->skip_clock_update = 1;
}
@@ -2988,6 +3045,15 @@ static long calc_load_fold_active(struct rq *this_rq)
return delta;
}
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ load += 1UL << (FSHIFT - 1);
+ return load >> FSHIFT;
+}
+
#ifdef CONFIG_NO_HZ
/*
* For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3017,6 +3083,128 @@ static long calc_load_fold_idle(void)
return delta;
}
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+ unsigned long result = 1UL << frac_bits;
+
+ if (n) for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
+ }
+
+ return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ * = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ * ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ * = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ * n 1 - x^(n+1)
+ * S_n := \Sum x^i = -------------
+ * i=0 1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n)
+{
+
+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(unsigned long ticks)
+{
+ long delta, active, n;
+
+ if (time_before(jiffies, calc_load_update))
+ return;
+
+ /*
+ * If we crossed a calc_load_update boundary, make sure to fold
+ * any pending idle changes, the respective CPUs might have
+ * missed the tick driven calc_load_account_active() update
+ * due to NO_HZ.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ /*
+ * If we were idle for multiple load cycles, apply them.
+ */
+ if (ticks >= LOAD_FREQ) {
+ n = ticks / LOAD_FREQ;
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+ calc_load_update += n * LOAD_FREQ;
+ }
+
+ /*
+ * Its possible the remainder of the above division also crosses
+ * a LOAD_FREQ period, the regular check in calc_global_load()
+ * which comes after this will take care of that.
+ *
+ * Consider us being 11 ticks before a cycle completion, and us
+ * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+ * age us 4 cycles, and the test in calc_global_load() will
+ * pick up the final one.
+ */
+}
#else
static void calc_load_account_idle(struct rq *this_rq)
{
@@ -3026,6 +3214,10 @@ static inline long calc_load_fold_idle(void)
{
return 0;
}
+
+static void calc_global_nohz(unsigned long ticks)
+{
+}
#endif
/**
@@ -3043,24 +3235,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
loads[2] = (avenrun[2] + offset) << shift;
}
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
- load *= exp;
- load += active * (FIXED_1 - exp);
- return load >> FSHIFT;
-}
-
/*
* calc_load - update the avenrun load estimates 10 ticks after the
* CPUs have updated calc_load_tasks.
*/
-void calc_global_load(void)
+void calc_global_load(unsigned long ticks)
{
- unsigned long upd = calc_load_update + 10;
long active;
- if (time_before(jiffies, upd))
+ calc_global_nohz(ticks);
+
+ if (time_before(jiffies, calc_load_update + 10))
return;
active = atomic_long_read(&calc_load_tasks);
@@ -3714,7 +3899,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
if (prev->se.on_rq)
update_rq_clock(rq);
- rq->skip_clock_update = 0;
prev->sched_class->put_prev_task(rq, prev);
}
@@ -3772,7 +3956,6 @@ need_resched_nonpreemptible:
hrtick_clear(rq);
raw_spin_lock_irq(&rq->lock);
- clear_tsk_need_resched(prev);
switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -3804,6 +3987,8 @@ need_resched_nonpreemptible:
put_prev_task(rq, prev);
next = pick_next_task(rq);
+ clear_tsk_need_resched(prev);
+ rq->skip_clock_update = 0;
if (likely(prev != next)) {
sched_info_switch(prev, next);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index c8231fb15708..3308fd7f1b52 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -349,25 +349,47 @@ static int parse(struct nlattr *na, struct cpumask *mask)
return ret;
}
+#ifdef CONFIG_IA64
+#define TASKSTATS_NEEDS_PADDING 1
+#endif
+
static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
{
struct nlattr *na, *ret;
int aggr;
- /* If we don't pad, we end up with alignment on a 4 byte boundary.
- * This causes lots of runtime warnings on systems requiring 8 byte
- * alignment */
- u32 pids[2] = { pid, 0 };
- int pid_size = ALIGN(sizeof(pid), sizeof(long));
-
aggr = (type == TASKSTATS_TYPE_PID)
? TASKSTATS_TYPE_AGGR_PID
: TASKSTATS_TYPE_AGGR_TGID;
+ /*
+ * The taskstats structure is internally aligned on 8 byte
+ * boundaries but the layout of the aggregrate reply, with
+ * two NLA headers and the pid (each 4 bytes), actually
+ * force the entire structure to be unaligned. This causes
+ * the kernel to issue unaligned access warnings on some
+ * architectures like ia64. Unfortunately, some software out there
+ * doesn't properly unroll the NLA packet and assumes that the start
+ * of the taskstats structure will always be 20 bytes from the start
+ * of the netlink payload. Aligning the start of the taskstats
+ * structure breaks this software, which we don't want. So, for now
+ * the alignment only happens on architectures that require it
+ * and those users will have to update to fixed versions of those
+ * packages. Space is reserved in the packet only when needed.
+ * This ifdef should be removed in several years e.g. 2012 once
+ * we can be confident that fixed versions are installed on most
+ * systems. We add the padding before the aggregate since the
+ * aggregate is already a defined type.
+ */
+#ifdef TASKSTATS_NEEDS_PADDING
+ if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0)
+ goto err;
+#endif
na = nla_nest_start(skb, aggr);
if (!na)
goto err;
- if (nla_put(skb, type, pid_size, pids) < 0)
+
+ if (nla_put(skb, type, sizeof(pid), &pid) < 0)
goto err;
ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
if (!ret)
@@ -456,6 +478,18 @@ out:
return rc;
}
+static size_t taskstats_packet_size(void)
+{
+ size_t size;
+
+ size = nla_total_size(sizeof(u32)) +
+ nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+#ifdef TASKSTATS_NEEDS_PADDING
+ size += nla_total_size(0); /* Padding for alignment */
+#endif
+ return size;
+}
+
static int cmd_attr_pid(struct genl_info *info)
{
struct taskstats *stats;
@@ -464,8 +498,7 @@ static int cmd_attr_pid(struct genl_info *info)
u32 pid;
int rc;
- size = nla_total_size(sizeof(u32)) +
- nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+ size = taskstats_packet_size();
rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
if (rc < 0)
@@ -494,8 +527,7 @@ static int cmd_attr_tgid(struct genl_info *info)
u32 tgid;
int rc;
- size = nla_total_size(sizeof(u32)) +
- nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+ size = taskstats_packet_size();
rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
if (rc < 0)
@@ -570,8 +602,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
/*
* Size includes space for nested attributes
*/
- size = nla_total_size(sizeof(u32)) +
- nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+ size = taskstats_packet_size();
is_thread_group = !!taskstats_tgid_alloc(tsk);
if (is_thread_group) {
diff --git a/kernel/timer.c b/kernel/timer.c
index 68a9ae7679b7..353b9227c2ec 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1252,6 +1252,12 @@ unsigned long get_next_timer_interrupt(unsigned long now)
struct tvec_base *base = __get_cpu_var(tvec_bases);
unsigned long expires;
+ /*
+ * Pretend that there is no timer pending if the cpu is offline.
+ * Possible pending timers will be migrated later to an active cpu.
+ */
+ if (cpu_is_offline(smp_processor_id()))
+ return now + NEXT_TIMER_MAX_DELTA;
spin_lock(&base->lock);
if (time_before_eq(base->next_timer, base->timer_jiffies))
base->next_timer = __next_timer_interrupt(base);
@@ -1319,7 +1325,7 @@ void do_timer(unsigned long ticks)
{
jiffies_64 += ticks;
update_wall_time();
- calc_global_load();
+ calc_global_load(ticks);
}
#ifdef __ARCH_WANT_SYS_ALARM
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9ed509a015d8..bd1c35a4fbcc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3853,6 +3853,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
/* Need to copy one event at a time */
do {
+ /* We need the size of one event, because
+ * rb_advance_reader only advances by one event,
+ * whereas rb_event_ts_length may include the size of
+ * one or two events.
+ * We have already ensured there's enough space if this
+ * is a time extend. */
+ size = rb_event_length(event);
memcpy(bpage->data + pos, rpage->data + rpos, size);
len -= size;
@@ -3867,7 +3874,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
event = rb_reader_event(cpu_buffer);
/* Always keep the time extend and data together */
size = rb_event_ts_length(event);
- } while (len > size);
+ } while (len >= size);
/* update bpage */
local_set(&bpage->commit, pos);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c380612273bf..f8cf959bad45 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2338,11 +2338,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf,
return count;
}
+static loff_t tracing_seek(struct file *file, loff_t offset, int origin)
+{
+ if (file->f_mode & FMODE_READ)
+ return seq_lseek(file, offset, origin);
+ else
+ return 0;
+}
+
static const struct file_operations tracing_fops = {
.open = tracing_open,
.read = seq_read,
.write = tracing_write_stub,
- .llseek = seq_lseek,
+ .llseek = tracing_seek,
.release = tracing_release,
};
diff --git a/kernel/user.c b/kernel/user.c
index 2c7d8d5914b1..5c598ca781df 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -158,6 +158,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
+ put_user_ns(ns);
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 14b8120d5232..c812c4927cab 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -364,7 +364,8 @@ static int watchdog_nmi_enable(int cpu)
goto out_save;
}
- printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
+ printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n",
+ cpu, PTR_ERR(event));
return PTR_ERR(event);
/* success path */