From 9cfb38a7ba5a9c27c1af8093fb1af4b699c0a441 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Sun, 9 Oct 2016 08:04:03 +0800
Subject: sched/fair: Fix sched domains NULL dereference in
 select_idle_sibling()

Commit:

  10e2f1acd01 ("sched/core: Rewrite and improve select_idle_siblings()")

... improved select_idle_sibling(), but also triggered a regression (crash)
during CPU-hotplug:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000078
  IP: [<ffffffffb10cd332>] select_idle_sibling+0x1c2/0x4f0
  Call Trace:
   <IRQ>
    select_task_rq_fair+0x749/0x930
    ? select_task_rq_fair+0xb4/0x930
    ? __lock_is_held+0x54/0x70
    try_to_wake_up+0x19a/0x5b0
    default_wake_function+0x12/0x20
    autoremove_wake_function+0x12/0x40
    __wake_up_common+0x55/0x90
    __wake_up+0x39/0x50
    wake_up_klogd_work_func+0x40/0x60
    irq_work_run_list+0x57/0x80
    irq_work_run+0x2c/0x30
    smp_irq_work_interrupt+0x2e/0x40
    irq_work_interrupt+0x96/0xa0
   <EOI>
    ? _raw_spin_unlock_irqrestore+0x45/0x80
    try_to_wake_up+0x4a/0x5b0
    wake_up_state+0x10/0x20
    __kthread_unpark+0x67/0x70
    kthread_unpark+0x22/0x30
    cpuhp_online_idle+0x3e/0x70
    cpu_startup_entry+0x6a/0x450
    start_secondary+0x154/0x180

This can be reproduced by running the ftrace test case of kselftest, the
test case will hot-unplug the CPU and the CPU will attach to the NULL
sched-domain during scheduler teardown.

The step 2 for the rewrite select_idle_siblings():

  | Step 2) tracks the average cost of the scan and compares this to the
  | average idle time guestimate for the CPU doing the wakeup.

If the CPU which doing the wakeup is the going hot-unplug CPU, then NULL
sched domain will be dereferenced to acquire the average cost of the scan.

This patch fix it by failing the search of an idle CPU in the LLC process
if this sched domain is NULL.

Tested-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1475971443-3187-1-git-send-email-wanpeng.li@hotmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 502e95a6e927..8b03fb5d1b9e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5471,13 +5471,18 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
  */
 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
 {
-	struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
-	u64 avg_idle = this_rq()->avg_idle;
-	u64 avg_cost = this_sd->avg_scan_cost;
+	struct sched_domain *this_sd;
+	u64 avg_cost, avg_idle = this_rq()->avg_idle;
 	u64 time, cost;
 	s64 delta;
 	int cpu, wrap;
 
+	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+	if (!this_sd)
+		return -1;
+
+	avg_cost = this_sd->avg_scan_cost;
+
 	/*
 	 * Due to large variance we need a large fuzz factor; hackbench in
 	 * particularly is sensitive here.
-- 
cgit v1.3-14-g43fede


From a705e07b9c80df27b6bb12f7a4cd4cf4ed2f728b Mon Sep 17 00:00:00 2001
From: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Date: Wed, 12 Oct 2016 13:18:56 +0300
Subject: cpu/hotplug: Use distinct name for cpu_hotplug.dep_map

Use distinctive name for cpu_hotplug.dep_map to avoid the actual
cpu_hotplug.lock appearing as cpu_hotplug.lock#2 in lockdep splats.

Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Acked-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Gautham R . Shenoy <ego@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: intel-gfx@lists.freedesktop.org
Cc: trivial@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5df20d6d1520..29de1a9352c0 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -228,7 +228,7 @@ static struct {
 	.wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
 	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-	.dep_map = {.name = "cpu_hotplug.lock" },
+	.dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map),
 #endif
 };
 
-- 
cgit v1.3-14-g43fede


From 54e23845e965898f65f76aba79fa9db76d830fa9 Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Mon, 17 Oct 2016 11:47:02 +0200
Subject: alarmtimer: Remove unused but set variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the set but unused variable base in alarm_clock_get to fix the
following warning when building with 'W=1':

  kernel/time/alarmtimer.c: In function ‘alarm_timer_create’:
  kernel/time/alarmtimer.c:545:21: warning: variable ‘base’ set but not used [-Wunused-but-set-variable]

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20161017094702.10873-1-tklauser@distanz.ch
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/alarmtimer.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index c3aad685bbc0..12dd190634ab 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -542,7 +542,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
 static int alarm_timer_create(struct k_itimer *new_timer)
 {
 	enum  alarmtimer_type type;
-	struct alarm_base *base;
 
 	if (!alarmtimer_get_rtcdev())
 		return -ENOTSUPP;
@@ -551,7 +550,6 @@ static int alarm_timer_create(struct k_itimer *new_timer)
 		return -EPERM;
 
 	type = clock2alarm(new_timer->it_clock);
-	base = &alarm_bases[type];
 	alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer);
 	return 0;
 }
-- 
cgit v1.3-14-g43fede


From b5a9b340789b2b24c6896bcf7a065c31a4db671c Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Wed, 19 Oct 2016 14:45:23 +0200
Subject: sched/fair: Fix incorrect task group ->load_avg

A scheduler performance regression has been reported by Joseph Salisbury,
which he bisected back to:

  3d30544f0212 ("sched/fair: Apply more PELT fixes)

The regression triggers when several levels of task groups are involved
(read: SystemD) and cpu_possible_mask != cpu_present_mask.

The root cause is that group entity's load (tg_child->se[i]->avg.load_avg)
is initialized to scale_load_down(se->load.weight). During the creation of
a child task group, its group entities on possible CPUs are attached to
parent's cfs_rq (tg_parent) and their loads are added to the parent's load
(tg_parent->load_avg) with update_tg_load_avg().

But only the load on online CPUs will then be updated to reflect real load,
whereas load on other CPUs will stay at the initial value.

The result is a tg_parent->load_avg that is higher than the real load, the
weight of group entities (tg_parent->se[i]->load.weight) on online CPUs is
smaller than it should be, and the task group gets a less running time than
what it could expect.

( This situation can be detected with /proc/sched_debug. The ".tg_load_avg"
  of the task group will be much higher than sum of ".tg_load_avg_contrib"
  of online cfs_rqs of the task group. )

The load of group entities don't have to be intialized to something else
than 0 because their load will increase when an entity is attached.

Reported-by: Joseph Salisbury <joseph.salisbury@canonical.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@vger.kernel.org> # 4.8.x
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: joonwoop@codeaurora.org
Fixes: 3d30544f0212 ("sched/fair: Apply more PELT fixes)
Link: http://lkml.kernel.org/r/1476881123-10159-1-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 76ee7de1859d..d941c97dfbc3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -690,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se)
 	 * will definitely be update (after enqueue).
 	 */
 	sa->period_contrib = 1023;
-	sa->load_avg = scale_load_down(se->load.weight);
+	/*
+	 * Tasks are intialized with full load to be seen as heavy tasks until
+	 * they get a chance to stabilize to their real load level.
+	 * Group entities are intialized with zero load to reflect the fact that
+	 * nothing has been attached to the task group yet.
+	 */
+	if (entity_is_task(se))
+		sa->load_avg = scale_load_down(se->load.weight);
 	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
 	/*
 	 * At this point, util_avg won't be used in select_task_rq_fair anyway
-- 
cgit v1.3-14-g43fede


From 9beae1ea89305a9667ceaab6d0bf46a045ad71e7 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Thu, 13 Oct 2016 01:20:17 +0100
Subject: mm: replace get_user_pages_remote() write/force parameters with
 gup_flags

This removes the 'write' and 'force' from get_user_pages_remote() and
replaces them with 'gup_flags' to make the use of FOLL_FORCE explicit in
callers as use of this flag can result in surprising behaviour (and
hence bugs) within the mm subsystem.

Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpu/drm/etnaviv/etnaviv_gem.c   |  7 +++++--
 drivers/gpu/drm/i915/i915_gem_userptr.c |  6 +++++-
 drivers/infiniband/core/umem_odp.c      |  7 +++++--
 fs/exec.c                               |  9 +++++++--
 include/linux/mm.h                      |  2 +-
 kernel/events/uprobes.c                 |  6 ++++--
 mm/gup.c                                | 22 +++++++---------------
 mm/memory.c                             |  6 +++++-
 security/tomoyo/domain.c                |  2 +-
 9 files changed, 40 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
index 5ce3603e6eac..0370b842d9cc 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
@@ -748,19 +748,22 @@ static struct page **etnaviv_gem_userptr_do_get_pages(
 	int ret = 0, pinned, npages = etnaviv_obj->base.size >> PAGE_SHIFT;
 	struct page **pvec;
 	uintptr_t ptr;
+	unsigned int flags = 0;
 
 	pvec = drm_malloc_ab(npages, sizeof(struct page *));
 	if (!pvec)
 		return ERR_PTR(-ENOMEM);
 
+	if (!etnaviv_obj->userptr.ro)
+		flags |= FOLL_WRITE;
+
 	pinned = 0;
 	ptr = etnaviv_obj->userptr.ptr;
 
 	down_read(&mm->mmap_sem);
 	while (pinned < npages) {
 		ret = get_user_pages_remote(task, mm, ptr, npages - pinned,
-					    !etnaviv_obj->userptr.ro, 0,
-					    pvec + pinned, NULL);
+					    flags, pvec + pinned, NULL);
 		if (ret < 0)
 			break;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index e537930c64b5..c6f780f5abc9 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -508,6 +508,10 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
 	pvec = drm_malloc_gfp(npages, sizeof(struct page *), GFP_TEMPORARY);
 	if (pvec != NULL) {
 		struct mm_struct *mm = obj->userptr.mm->mm;
+		unsigned int flags = 0;
+
+		if (!obj->userptr.read_only)
+			flags |= FOLL_WRITE;
 
 		ret = -EFAULT;
 		if (atomic_inc_not_zero(&mm->mm_users)) {
@@ -517,7 +521,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
 					(work->task, mm,
 					 obj->userptr.ptr + pinned * PAGE_SIZE,
 					 npages - pinned,
-					 !obj->userptr.read_only, 0,
+					 flags,
 					 pvec + pinned, NULL);
 				if (ret < 0)
 					break;
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 75077a018675..1f0fe3217f23 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -527,6 +527,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
 	u64 off;
 	int j, k, ret = 0, start_idx, npages = 0;
 	u64 base_virt_addr;
+	unsigned int flags = 0;
 
 	if (access_mask == 0)
 		return -EINVAL;
@@ -556,6 +557,9 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
 		goto out_put_task;
 	}
 
+	if (access_mask & ODP_WRITE_ALLOWED_BIT)
+		flags |= FOLL_WRITE;
+
 	start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT;
 	k = start_idx;
 
@@ -574,8 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
 		 */
 		npages = get_user_pages_remote(owning_process, owning_mm,
 				user_virt, gup_num_pages,
-				access_mask & ODP_WRITE_ALLOWED_BIT,
-				0, local_page_list, NULL);
+				flags, local_page_list, NULL);
 		up_read(&owning_mm->mmap_sem);
 
 		if (npages < 0)
diff --git a/fs/exec.c b/fs/exec.c
index 6fcfb3f7b137..4e497b9ee71e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -191,6 +191,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 {
 	struct page *page;
 	int ret;
+	unsigned int gup_flags = FOLL_FORCE;
 
 #ifdef CONFIG_STACK_GROWSUP
 	if (write) {
@@ -199,12 +200,16 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 			return NULL;
 	}
 #endif
+
+	if (write)
+		gup_flags |= FOLL_WRITE;
+
 	/*
 	 * We are doing an exec().  'current' is the process
 	 * doing the exec and bprm->mm is the new process's mm.
 	 */
-	ret = get_user_pages_remote(current, bprm->mm, pos, 1, write,
-			1, &page, NULL);
+	ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags,
+			&page, NULL);
 	if (ret <= 0)
 		return NULL;
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 30bb5d9631bb..ecc4be7b67e0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1276,7 +1276,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		      struct vm_area_struct **vmas, int *nonblocking);
 long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
 			    unsigned long start, unsigned long nr_pages,
-			    int write, int force, struct page **pages,
+			    unsigned int gup_flags, struct page **pages,
 			    struct vm_area_struct **vmas);
 long get_user_pages(unsigned long start, unsigned long nr_pages,
 			    unsigned int gup_flags, struct page **pages,
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index d4129bb05e5d..f9ec9add2164 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
 
 retry:
 	/* Read the page with vaddr into memory */
-	ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
+	ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page,
+			&vma);
 	if (ret <= 0)
 		return ret;
 
@@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
 	 * but we treat this as a 'remote' access since it is
 	 * essentially a kernel access to the memory.
 	 */
-	result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
+	result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
+			NULL);
 	if (result < 0)
 		return result;
 
diff --git a/mm/gup.c b/mm/gup.c
index bbc2e76cdd77..7aa113c2d373 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -915,9 +915,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
  * @mm:		mm_struct of target mm
  * @start:	starting user address
  * @nr_pages:	number of pages from start to pin
- * @write:	whether pages will be written to by the caller
- * @force:	whether to force access even when user mapping is currently
- *		protected (but never forces write access to shared mapping).
+ * @gup_flags:	flags modifying lookup behaviour
  * @pages:	array that receives pointers to the pages pinned.
  *		Should be at least nr_pages long. Or NULL, if caller
  *		only intends to ensure the pages are faulted in.
@@ -946,9 +944,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
  * or similar operation cannot guarantee anything stronger anyway because
  * locks can't be held over the syscall boundary.
  *
- * If write=0, the page must not be written to. If the page is written to,
- * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
- * after the page is finished with, and before put_page is called.
+ * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
+ * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
+ * be called after the page is finished with, and before put_page is called.
  *
  * get_user_pages is typically used for fewer-copy IO operations, to get a
  * handle on the memory by some means other than accesses via the user virtual
@@ -965,18 +963,12 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
  */
 long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, unsigned long nr_pages,
-		int write, int force, struct page **pages,
+		unsigned int gup_flags, struct page **pages,
 		struct vm_area_struct **vmas)
 {
-	unsigned int flags = FOLL_TOUCH | FOLL_REMOTE;
-
-	if (write)
-		flags |= FOLL_WRITE;
-	if (force)
-		flags |= FOLL_FORCE;
-
 	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
-				       NULL, false, flags);
+				       NULL, false,
+				       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
 }
 EXPORT_SYMBOL(get_user_pages_remote);
 
diff --git a/mm/memory.c b/mm/memory.c
index fc1987dfd8cc..20a9adb7b36e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3873,6 +3873,10 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
 {
 	struct vm_area_struct *vma;
 	void *old_buf = buf;
+	unsigned int flags = FOLL_FORCE;
+
+	if (write)
+		flags |= FOLL_WRITE;
 
 	down_read(&mm->mmap_sem);
 	/* ignore errors, just check how much was successfully transferred */
@@ -3882,7 +3886,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
 		struct page *page = NULL;
 
 		ret = get_user_pages_remote(tsk, mm, addr, 1,
-				write, 1, &page, &vma);
+				flags, &page, &vma);
 		if (ret <= 0) {
 #ifndef CONFIG_HAVE_IOREMAP_PROT
 			break;
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index ade7c6cad172..682b73af7766 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -881,7 +881,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
 	 * the execve().
 	 */
 	if (get_user_pages_remote(current, bprm->mm, pos, 1,
-				0, 1, &page, NULL) <= 0)
+				FOLL_FORCE, &page, NULL) <= 0)
 		return false;
 #else
 	page = bprm->page[pos / PAGE_SIZE];
-- 
cgit v1.3-14-g43fede


From f307ab6dcea03f9d8e4d70508fd7d1ca57cfa7f9 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Thu, 13 Oct 2016 01:20:20 +0100
Subject: mm: replace access_process_vm() write parameter with gup_flags

This removes the 'write' argument from access_process_vm() and replaces
it with 'gup_flags' as use of this function previously silently implied
FOLL_FORCE, whereas after this patch callers explicitly pass this flag.

We make this explicit as use of FOLL_FORCE can result in surprising
behaviour (and hence bugs) within the mm subsystem.

Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Acked-by: Jesper Nilsson <jesper.nilsson@axis.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/ptrace.c         |  9 ++++++---
 arch/blackfin/kernel/ptrace.c      |  5 +++--
 arch/cris/arch-v32/kernel/ptrace.c |  4 ++--
 arch/ia64/kernel/ptrace.c          | 14 +++++++++-----
 arch/m32r/kernel/ptrace.c          | 15 ++++++++++-----
 arch/mips/kernel/ptrace32.c        |  5 +++--
 arch/powerpc/kernel/ptrace32.c     |  5 +++--
 arch/score/kernel/ptrace.c         | 10 ++++++----
 arch/sparc/kernel/ptrace_64.c      | 24 ++++++++++++++++--------
 arch/x86/kernel/step.c             |  3 ++-
 arch/x86/um/ptrace_32.c            |  3 ++-
 arch/x86/um/ptrace_64.c            |  3 ++-
 include/linux/mm.h                 |  3 ++-
 kernel/ptrace.c                    | 16 ++++++++++------
 mm/memory.c                        |  8 ++------
 mm/nommu.c                         |  6 +++---
 mm/util.c                          |  5 +++--
 17 files changed, 84 insertions(+), 54 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c
index d9ee81769899..940dfb406591 100644
--- a/arch/alpha/kernel/ptrace.c
+++ b/arch/alpha/kernel/ptrace.c
@@ -157,14 +157,16 @@ put_reg(struct task_struct *task, unsigned long regno, unsigned long data)
 static inline int
 read_int(struct task_struct *task, unsigned long addr, int * data)
 {
-	int copied = access_process_vm(task, addr, data, sizeof(int), 0);
+	int copied = access_process_vm(task, addr, data, sizeof(int),
+			FOLL_FORCE);
 	return (copied == sizeof(int)) ? 0 : -EIO;
 }
 
 static inline int
 write_int(struct task_struct *task, unsigned long addr, int data)
 {
-	int copied = access_process_vm(task, addr, &data, sizeof(int), 1);
+	int copied = access_process_vm(task, addr, &data, sizeof(int),
+			FOLL_FORCE | FOLL_WRITE);
 	return (copied == sizeof(int)) ? 0 : -EIO;
 }
 
@@ -281,7 +283,8 @@ long arch_ptrace(struct task_struct *child, long request,
 	/* When I and D space are separate, these will need to be fixed.  */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */
 	case PTRACE_PEEKDATA:
-		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
+		copied = access_process_vm(child, addr, &tmp, sizeof(tmp),
+				FOLL_FORCE);
 		ret = -EIO;
 		if (copied != sizeof(tmp))
 			break;
diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c
index 8b8fe671b1a6..8d79286ee4e8 100644
--- a/arch/blackfin/kernel/ptrace.c
+++ b/arch/blackfin/kernel/ptrace.c
@@ -271,7 +271,7 @@ long arch_ptrace(struct task_struct *child, long request,
 			case BFIN_MEM_ACCESS_CORE:
 			case BFIN_MEM_ACCESS_CORE_ONLY:
 				copied = access_process_vm(child, addr, &tmp,
-				                           to_copy, 0);
+							   to_copy, FOLL_FORCE);
 				if (copied)
 					break;
 
@@ -324,7 +324,8 @@ long arch_ptrace(struct task_struct *child, long request,
 			case BFIN_MEM_ACCESS_CORE:
 			case BFIN_MEM_ACCESS_CORE_ONLY:
 				copied = access_process_vm(child, addr, &data,
-				                           to_copy, 1);
+				                           to_copy,
+							   FOLL_FORCE | FOLL_WRITE);
 				break;
 			case BFIN_MEM_ACCESS_DMA:
 				if (safe_dma_memcpy(paddr, &data, to_copy))
diff --git a/arch/cris/arch-v32/kernel/ptrace.c b/arch/cris/arch-v32/kernel/ptrace.c
index f085229cf870..f0df654ac6fc 100644
--- a/arch/cris/arch-v32/kernel/ptrace.c
+++ b/arch/cris/arch-v32/kernel/ptrace.c
@@ -147,7 +147,7 @@ long arch_ptrace(struct task_struct *child, long request,
 				/* The trampoline page is globally mapped, no page table to traverse.*/
 				tmp = *(unsigned long*)addr;
 			} else {
-				copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
+				copied = access_process_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE);
 
 				if (copied != sizeof(tmp))
 					break;
@@ -279,7 +279,7 @@ static int insn_size(struct task_struct *child, unsigned long pc)
   int opsize = 0;
 
   /* Read the opcode at pc (do what PTRACE_PEEKTEXT would do). */
-  copied = access_process_vm(child, pc, &opcode, sizeof(opcode), 0);
+  copied = access_process_vm(child, pc, &opcode, sizeof(opcode), FOLL_FORCE);
   if (copied != sizeof(opcode))
     return 0;
 
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 6f54d511cc50..31aa8c0f68e1 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -453,7 +453,7 @@ ia64_peek (struct task_struct *child, struct switch_stack *child_stack,
 			return 0;
 		}
 	}
-	copied = access_process_vm(child, addr, &ret, sizeof(ret), 0);
+	copied = access_process_vm(child, addr, &ret, sizeof(ret), FOLL_FORCE);
 	if (copied != sizeof(ret))
 		return -EIO;
 	*val = ret;
@@ -489,7 +489,8 @@ ia64_poke (struct task_struct *child, struct switch_stack *child_stack,
 				*ia64_rse_skip_regs(krbs, regnum) = val;
 			}
 		}
-	} else if (access_process_vm(child, addr, &val, sizeof(val), 1)
+	} else if (access_process_vm(child, addr, &val, sizeof(val),
+				FOLL_FORCE | FOLL_WRITE)
 		   != sizeof(val))
 		return -EIO;
 	return 0;
@@ -543,7 +544,8 @@ ia64_sync_user_rbs (struct task_struct *child, struct switch_stack *sw,
 		ret = ia64_peek(child, sw, user_rbs_end, addr, &val);
 		if (ret < 0)
 			return ret;
-		if (access_process_vm(child, addr, &val, sizeof(val), 1)
+		if (access_process_vm(child, addr, &val, sizeof(val),
+				FOLL_FORCE | FOLL_WRITE)
 		    != sizeof(val))
 			return -EIO;
 	}
@@ -559,7 +561,8 @@ ia64_sync_kernel_rbs (struct task_struct *child, struct switch_stack *sw,
 
 	/* now copy word for word from user rbs to kernel rbs: */
 	for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) {
-		if (access_process_vm(child, addr, &val, sizeof(val), 0)
+		if (access_process_vm(child, addr, &val, sizeof(val),
+				FOLL_FORCE)
 				!= sizeof(val))
 			return -EIO;
 
@@ -1156,7 +1159,8 @@ arch_ptrace (struct task_struct *child, long request,
 	case PTRACE_PEEKTEXT:
 	case PTRACE_PEEKDATA:
 		/* read word at location addr */
-		if (access_process_vm(child, addr, &data, sizeof(data), 0)
+		if (access_process_vm(child, addr, &data, sizeof(data),
+				FOLL_FORCE)
 		    != sizeof(data))
 			return -EIO;
 		/* ensure return value is not mistaken for error code */
diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c
index 51f5e9aa4901..c145605a981f 100644
--- a/arch/m32r/kernel/ptrace.c
+++ b/arch/m32r/kernel/ptrace.c
@@ -493,7 +493,8 @@ unregister_all_debug_traps(struct task_struct *child)
 	int i;
 
 	for (i = 0; i < p->nr_trap; i++)
-		access_process_vm(child, p->addr[i], &p->insn[i], sizeof(p->insn[i]), 1);
+		access_process_vm(child, p->addr[i], &p->insn[i], sizeof(p->insn[i]),
+				FOLL_FORCE | FOLL_WRITE);
 	p->nr_trap = 0;
 }
 
@@ -537,7 +538,8 @@ embed_debug_trap(struct task_struct *child, unsigned long next_pc)
 	unsigned long next_insn, code;
 	unsigned long addr = next_pc & ~3;
 
-	if (access_process_vm(child, addr, &next_insn, sizeof(next_insn), 0)
+	if (access_process_vm(child, addr, &next_insn, sizeof(next_insn),
+			FOLL_FORCE)
 	    != sizeof(next_insn)) {
 		return -1; /* error */
 	}
@@ -546,7 +548,8 @@ embed_debug_trap(struct task_struct *child, unsigned long next_pc)
 	if (register_debug_trap(child, next_pc, next_insn, &code)) {
 		return -1; /* error */
 	}
-	if (access_process_vm(child, addr, &code, sizeof(code), 1)
+	if (access_process_vm(child, addr, &code, sizeof(code),
+			FOLL_FORCE | FOLL_WRITE)
 	    != sizeof(code)) {
 		return -1; /* error */
 	}
@@ -562,7 +565,8 @@ withdraw_debug_trap(struct pt_regs *regs)
  	addr = (regs->bpc - 2) & ~3;
 	regs->bpc -= 2;
 	if (unregister_debug_trap(current, addr, &code)) {
-	    access_process_vm(current, addr, &code, sizeof(code), 1);
+	    access_process_vm(current, addr, &code, sizeof(code),
+		    FOLL_FORCE | FOLL_WRITE);
 	    invalidate_cache();
 	}
 }
@@ -589,7 +593,8 @@ void user_enable_single_step(struct task_struct *child)
 	/* Compute next pc.  */
 	pc = get_stack_long(child, PT_BPC);
 
-	if (access_process_vm(child, pc&~3, &insn, sizeof(insn), 0)
+	if (access_process_vm(child, pc&~3, &insn, sizeof(insn),
+			FOLL_FORCE)
 	    != sizeof(insn))
 		return;
 
diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c
index 283b5a1967d1..7e71a4e0281b 100644
--- a/arch/mips/kernel/ptrace32.c
+++ b/arch/mips/kernel/ptrace32.c
@@ -70,7 +70,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 			break;
 
 		copied = access_process_vm(child, (u64)addrOthers, &tmp,
-				sizeof(tmp), 0);
+				sizeof(tmp), FOLL_FORCE);
 		if (copied != sizeof(tmp))
 			break;
 		ret = put_user(tmp, (u32 __user *) (unsigned long) data);
@@ -179,7 +179,8 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 			break;
 		ret = 0;
 		if (access_process_vm(child, (u64)addrOthers, &data,
-					sizeof(data), 1) == sizeof(data))
+					sizeof(data),
+					FOLL_FORCE | FOLL_WRITE) == sizeof(data))
 			break;
 		ret = -EIO;
 		break;
diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c
index f52b7db327c8..010b7b310237 100644
--- a/arch/powerpc/kernel/ptrace32.c
+++ b/arch/powerpc/kernel/ptrace32.c
@@ -74,7 +74,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 			break;
 
 		copied = access_process_vm(child, (u64)addrOthers, &tmp,
-				sizeof(tmp), 0);
+				sizeof(tmp), FOLL_FORCE);
 		if (copied != sizeof(tmp))
 			break;
 		ret = put_user(tmp, (u32 __user *)data);
@@ -179,7 +179,8 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 			break;
 		ret = 0;
 		if (access_process_vm(child, (u64)addrOthers, &tmp,
-					sizeof(tmp), 1) == sizeof(tmp))
+					sizeof(tmp),
+					FOLL_FORCE | FOLL_WRITE) == sizeof(tmp))
 			break;
 		ret = -EIO;
 		break;
diff --git a/arch/score/kernel/ptrace.c b/arch/score/kernel/ptrace.c
index 55836188b217..4f7314d5f334 100644
--- a/arch/score/kernel/ptrace.c
+++ b/arch/score/kernel/ptrace.c
@@ -131,7 +131,7 @@ read_tsk_long(struct task_struct *child,
 {
 	int copied;
 
-	copied = access_process_vm(child, addr, res, sizeof(*res), 0);
+	copied = access_process_vm(child, addr, res, sizeof(*res), FOLL_FORCE);
 
 	return copied != sizeof(*res) ? -EIO : 0;
 }
@@ -142,7 +142,7 @@ read_tsk_short(struct task_struct *child,
 {
 	int copied;
 
-	copied = access_process_vm(child, addr, res, sizeof(*res), 0);
+	copied = access_process_vm(child, addr, res, sizeof(*res), FOLL_FORCE);
 
 	return copied != sizeof(*res) ? -EIO : 0;
 }
@@ -153,7 +153,8 @@ write_tsk_short(struct task_struct *child,
 {
 	int copied;
 
-	copied = access_process_vm(child, addr, &val, sizeof(val), 1);
+	copied = access_process_vm(child, addr, &val, sizeof(val),
+			FOLL_FORCE | FOLL_WRITE);
 
 	return copied != sizeof(val) ? -EIO : 0;
 }
@@ -164,7 +165,8 @@ write_tsk_long(struct task_struct *child,
 {
 	int copied;
 
-	copied = access_process_vm(child, addr, &val, sizeof(val), 1);
+	copied = access_process_vm(child, addr, &val, sizeof(val),
+			FOLL_FORCE | FOLL_WRITE);
 
 	return copied != sizeof(val) ? -EIO : 0;
 }
diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c
index 9ddc4928a089..ac082dd8c67d 100644
--- a/arch/sparc/kernel/ptrace_64.c
+++ b/arch/sparc/kernel/ptrace_64.c
@@ -127,7 +127,8 @@ static int get_from_target(struct task_struct *target, unsigned long uaddr,
 		if (copy_from_user(kbuf, (void __user *) uaddr, len))
 			return -EFAULT;
 	} else {
-		int len2 = access_process_vm(target, uaddr, kbuf, len, 0);
+		int len2 = access_process_vm(target, uaddr, kbuf, len,
+				FOLL_FORCE);
 		if (len2 != len)
 			return -EFAULT;
 	}
@@ -141,7 +142,8 @@ static int set_to_target(struct task_struct *target, unsigned long uaddr,
 		if (copy_to_user((void __user *) uaddr, kbuf, len))
 			return -EFAULT;
 	} else {
-		int len2 = access_process_vm(target, uaddr, kbuf, len, 1);
+		int len2 = access_process_vm(target, uaddr, kbuf, len,
+				FOLL_FORCE | FOLL_WRITE);
 		if (len2 != len)
 			return -EFAULT;
 	}
@@ -505,7 +507,8 @@ static int genregs32_get(struct task_struct *target,
 				if (access_process_vm(target,
 						      (unsigned long)
 						      &reg_window[pos],
-						      k, sizeof(*k), 0)
+						      k, sizeof(*k),
+						      FOLL_FORCE)
 				    != sizeof(*k))
 					return -EFAULT;
 				k++;
@@ -531,12 +534,14 @@ static int genregs32_get(struct task_struct *target,
 				if (access_process_vm(target,
 						      (unsigned long)
 						      &reg_window[pos],
-						      &reg, sizeof(reg), 0)
+						      &reg, sizeof(reg),
+						      FOLL_FORCE)
 				    != sizeof(reg))
 					return -EFAULT;
 				if (access_process_vm(target,
 						      (unsigned long) u,
-						      &reg, sizeof(reg), 1)
+						      &reg, sizeof(reg),
+						      FOLL_FORCE | FOLL_WRITE)
 				    != sizeof(reg))
 					return -EFAULT;
 				pos++;
@@ -615,7 +620,8 @@ static int genregs32_set(struct task_struct *target,
 						      (unsigned long)
 						      &reg_window[pos],
 						      (void *) k,
-						      sizeof(*k), 1)
+						      sizeof(*k),
+						      FOLL_FORCE | FOLL_WRITE)
 				    != sizeof(*k))
 					return -EFAULT;
 				k++;
@@ -642,13 +648,15 @@ static int genregs32_set(struct task_struct *target,
 				if (access_process_vm(target,
 						      (unsigned long)
 						      u,
-						      &reg, sizeof(reg), 0)
+						      &reg, sizeof(reg),
+						      FOLL_FORCE)
 				    != sizeof(reg))
 					return -EFAULT;
 				if (access_process_vm(target,
 						      (unsigned long)
 						      &reg_window[pos],
-						      &reg, sizeof(reg), 1)
+						      &reg, sizeof(reg),
+						      FOLL_FORCE | FOLL_WRITE)
 				    != sizeof(reg))
 					return -EFAULT;
 				pos++;
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index c9a073866ca7..a23ce84a3f6c 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -57,7 +57,8 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
 	unsigned char opcode[15];
 	unsigned long addr = convert_ip_to_linear(child, regs);
 
-	copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
+	copied = access_process_vm(child, addr, opcode, sizeof(opcode),
+			FOLL_FORCE);
 	for (i = 0; i < copied; i++) {
 		switch (opcode[i]) {
 		/* popf and iret */
diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c
index 5766ead6fdb9..60a5a5a85505 100644
--- a/arch/x86/um/ptrace_32.c
+++ b/arch/x86/um/ptrace_32.c
@@ -36,7 +36,8 @@ int is_syscall(unsigned long addr)
 		 * slow, but that doesn't matter, since it will be called only
 		 * in case of singlestepping, if copy_from_user failed.
 		 */
-		n = access_process_vm(current, addr, &instr, sizeof(instr), 0);
+		n = access_process_vm(current, addr, &instr, sizeof(instr),
+				FOLL_FORCE);
 		if (n != sizeof(instr)) {
 			printk(KERN_ERR "is_syscall : failed to read "
 			       "instruction from 0x%lx\n", addr);
diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c
index 0b5c184dd5b3..e30202b1716e 100644
--- a/arch/x86/um/ptrace_64.c
+++ b/arch/x86/um/ptrace_64.c
@@ -212,7 +212,8 @@ int is_syscall(unsigned long addr)
 		 * slow, but that doesn't matter, since it will be called only
 		 * in case of singlestepping, if copy_from_user failed.
 		 */
-		n = access_process_vm(current, addr, &instr, sizeof(instr), 0);
+		n = access_process_vm(current, addr, &instr, sizeof(instr),
+				FOLL_FORCE);
 		if (n != sizeof(instr)) {
 			printk("is_syscall : failed to read instruction from "
 			       "0x%lx\n", addr);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f31bf9058587..ffbd72979ee7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1266,7 +1266,8 @@ static inline int fixup_user_fault(struct task_struct *tsk,
 }
 #endif
 
-extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
+extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len,
+		unsigned int gup_flags);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
 		void *buf, int len, unsigned int gup_flags);
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2a99027312a6..e6474f7272ec 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -537,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
 		int this_len, retval;
 
 		this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
-		retval = access_process_vm(tsk, src, buf, this_len, 0);
+		retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE);
 		if (!retval) {
 			if (copied)
 				break;
@@ -564,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
 		this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
 		if (copy_from_user(buf, src, this_len))
 			return -EFAULT;
-		retval = access_process_vm(tsk, dst, buf, this_len, 1);
+		retval = access_process_vm(tsk, dst, buf, this_len,
+				FOLL_FORCE | FOLL_WRITE);
 		if (!retval) {
 			if (copied)
 				break;
@@ -1127,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
 	unsigned long tmp;
 	int copied;
 
-	copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0);
+	copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);
 	if (copied != sizeof(tmp))
 		return -EIO;
 	return put_user(tmp, (unsigned long __user *)data);
@@ -1138,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
 {
 	int copied;
 
-	copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
+	copied = access_process_vm(tsk, addr, &data, sizeof(data),
+			FOLL_FORCE | FOLL_WRITE);
 	return (copied == sizeof(data)) ? 0 : -EIO;
 }
 
@@ -1155,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
 	switch (request) {
 	case PTRACE_PEEKTEXT:
 	case PTRACE_PEEKDATA:
-		ret = access_process_vm(child, addr, &word, sizeof(word), 0);
+		ret = access_process_vm(child, addr, &word, sizeof(word),
+				FOLL_FORCE);
 		if (ret != sizeof(word))
 			ret = -EIO;
 		else
@@ -1164,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
 
 	case PTRACE_POKETEXT:
 	case PTRACE_POKEDATA:
-		ret = access_process_vm(child, addr, &data, sizeof(data), 1);
+		ret = access_process_vm(child, addr, &data, sizeof(data),
+				FOLL_FORCE | FOLL_WRITE);
 		ret = (ret != sizeof(data) ? -EIO : 0);
 		break;
 
diff --git a/mm/memory.c b/mm/memory.c
index bac2d994850e..e18c57bdc75c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3951,20 +3951,16 @@ int access_remote_vm(struct mm_struct *mm, unsigned long addr,
  * Do not walk the page table directly, use get_user_pages
  */
 int access_process_vm(struct task_struct *tsk, unsigned long addr,
-		void *buf, int len, int write)
+		void *buf, int len, unsigned int gup_flags)
 {
 	struct mm_struct *mm;
 	int ret;
-	unsigned int flags = FOLL_FORCE;
 
 	mm = get_task_mm(tsk);
 	if (!mm)
 		return 0;
 
-	if (write)
-		flags |= FOLL_WRITE;
-
-	ret = __access_remote_vm(tsk, mm, addr, buf, len, flags);
+	ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
 
 	mmput(mm);
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 93d5bb53fc63..db5fd1795298 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1861,7 +1861,8 @@ int access_remote_vm(struct mm_struct *mm, unsigned long addr,
  * Access another process' address space.
  * - source/target buffer must be kernel space
  */
-int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len,
+		unsigned int gup_flags)
 {
 	struct mm_struct *mm;
 
@@ -1872,8 +1873,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
 	if (!mm)
 		return 0;
 
-	len = __access_remote_vm(tsk, mm, addr, buf, len,
-			write ? FOLL_WRITE : 0);
+	len = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
 
 	mmput(mm);
 	return len;
diff --git a/mm/util.c b/mm/util.c
index 4c685bde5ebc..952cbe7ad7b7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -624,7 +624,7 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen)
 	if (len > buflen)
 		len = buflen;
 
-	res = access_process_vm(task, arg_start, buffer, len, 0);
+	res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);
 
 	/*
 	 * If the nul at the end of args has been overwritten, then
@@ -639,7 +639,8 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen)
 			if (len > buflen - res)
 				len = buflen - res;
 			res += access_process_vm(task, env_start,
-						 buffer+res, len, 0);
+						 buffer+res, len,
+						 FOLL_FORCE);
 			res = strnlen(buffer, res);
 		}
 	}
-- 
cgit v1.3-14-g43fede


From 8835ca59dac2bc1e0136791abf3ccd51588803ce Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 19 Oct 2016 09:11:24 -0700
Subject: printk: suppress empty continuation lines

We have a fairly common pattern where you print several things as
continuations on one single line in a loop, and then at the end you do

	printk(KERN_CONT "\n");

to flush the buffered output.

But if the output was flushed by something else (concurrent printk
activity, or just system logging), we don't want that final flushing to
just print an empty line.

So just suppress empty continuation lines when they couldn't be merged
into the line they are a continuation of.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/printk.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index d5e397315473..de08fc90baaf 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1769,6 +1769,10 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c
 		cont_flush();
 	}
 
+	/* Skip empty continuation lines that couldn't be added - they just flush */
+	if (!text_len && (lflags & LOG_CONT))
+		return 0;
+
 	/* If it doesn't end in a newline, try to buffer the current line */
 	if (!(lflags & LOG_NEWLINE)) {
 		if (cont_add(facility, level, lflags, text, text_len))
-- 
cgit v1.3-14-g43fede


From 3118dac501bc0317de099db81618d589503351e1 Mon Sep 17 00:00:00 2001
From: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Date: Thu, 6 Oct 2016 23:06:43 +0530
Subject: kernel/irq: Export irq_set_parent()

The TPS65217 driver grew interrupt support which uses
irq_set_parent(). While it's not yet clear why this is used in the first
place, building the driver as a module fails with:

 ERROR: ".irq_set_parent" [drivers/mfd/tps65217.ko] undefined!

The correctness of the driver change is still investigated, but for now
it's less trouble to export irq_set_parent() than dealing with the build
wreckage.

[ tglx: Rewrote changelog and made the export GPL ]

Fixes: 6556bdacf646 ("mfd: tps65217: Add support for IRQs")
Signed-off-by: Sudip Mukherjee <sudip.mukherjee@codethink.co.uk>
Cc: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Cc: Marcin Niestroj <m.niestroj@grinn-global.com>
Cc: Grygorii Strashko <grygorii.strashko@ti.com>
Cc: Tony Lindgren <tony@atomide.com>
Cc: Lee Jones <lee.jones@linaro.org>
Link: http://lkml.kernel.org/r/1475775403-27207-1-git-send-email-sudipm.mukherjee@gmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/manage.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0c5f1a5db654..9c4d30483264 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -721,6 +721,7 @@ int irq_set_parent(int irq, int parent_irq)
 	irq_put_desc_unlock(desc, flags);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(irq_set_parent);
 #endif
 
 /*
-- 
cgit v1.3-14-g43fede


From f660f6066716b700148f60dba3461e65efff3123 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Mon, 10 Oct 2016 15:10:51 +0300
Subject: softirq: Display IRQ_POLL for irq-poll statistics

This library was moved to the generic area and was
renamed to irq-poll. Hence, update proc/softirqs output accordingly.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 kernel/softirq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 1bf81ef91375..744fa611cae0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
 DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
 
 const char * const softirq_to_name[NR_SOFTIRQS] = {
-	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
+	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
 	"TASKLET", "SCHED", "HRTIMER", "RCU"
 };
 
-- 
cgit v1.3-14-g43fede


From 1adb469b9b76276d7e5ea36a20a24c47d6618a0b Mon Sep 17 00:00:00 2001
From: Jon Hunter <jonathanh@nvidia.com>
Date: Fri, 21 Oct 2016 16:24:09 +0100
Subject: PM / suspend: Fix missing KERN_CONT for suspend message

Commit 4bcc595ccd80 (printk: reinstate KERN_CONT for printing
continuation lines) exposed a missing KERN_CONT from one of the
messages shown on entering suspend. With v4.9-rc1, the 'done.' shown
after syncing the filesystems no longer appears as a continuation but
a new message with its own timestamp.

[    9.259566] PM: Syncing filesystems ... [    9.264119] done.

Fix this by adding the KERN_CONT log level for the 'done.' part of the
message seen after syncing filesystems. While we are at it, convert
these suspend printks to pr_info and pr_cont, respectively.

Signed-off-by: Jon Hunter <jonathanh@nvidia.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/suspend.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 1e7f5da648d9..6ccb08f57fcb 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -498,9 +498,9 @@ static int enter_state(suspend_state_t state)
 
 #ifndef CONFIG_SUSPEND_SKIP_SYNC
 	trace_suspend_resume(TPS("sync_filesystems"), 0, true);
-	printk(KERN_INFO "PM: Syncing filesystems ... ");
+	pr_info("PM: Syncing filesystems ... ");
 	sys_sync();
-	printk("done.\n");
+	pr_cont("done.\n");
 	trace_suspend_resume(TPS("sync_filesystems"), 0, false);
 #endif
 
-- 
cgit v1.3-14-g43fede


From b831275a3553c32091222ac619cfddd73a5553fb Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 24 Oct 2016 11:41:56 +0200
Subject: timers: Plug locking race vs. timer migration

Linus noticed that lock_timer_base() lacks a READ_ONCE() for accessing the
timer flags. As a consequence the compiler is allowed to reload the flags
between the initial check for TIMER_MIGRATION and the following timer base
computation and the spin lock of the base.

While this has not been observed (yet), we need to make sure that it never
happens.

Fixes: 0eeda71bc30d ("timer: Replace timer base by a cpu index")
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1610241711220.4983@nanos
Cc: stable@vger.kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/time/timer.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2d47980a1bc4..0d4b91c5a374 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -943,7 +943,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
 {
 	for (;;) {
 		struct timer_base *base;
-		u32 tf = timer->flags;
+		u32 tf;
+
+		/*
+		 * We need to use READ_ONCE() here, otherwise the compiler
+		 * might re-read @tf between the check for TIMER_MIGRATING
+		 * and spin_lock().
+		 */
+		tf = READ_ONCE(timer->flags);
 
 		if (!(tf & TIMER_MIGRATING)) {
 			base = get_timer_base(tf);
-- 
cgit v1.3-14-g43fede


From 4da9152a4308dcbf611cde399c695c359fc9145f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 24 Oct 2016 11:55:10 +0200
Subject: timers: Lock base for same bucket optimization

Linus stumbled over the unlocked modification of the timer expiry value in
mod_timer() which is an optimization for timers which stay in the same
bucket - due to the bucket granularity - despite their expiry time getting
updated.

The optimization itself still makes sense even if we take the lock, because
in case that the bucket stays the same, we avoid the pointless
queue/enqueue dance.

Make the check and the modification of timer->expires protected by the base
lock and shuffle the remaining code around so we can keep the lock held
when we actually have to requeue the timer to a different bucket.

Fixes: f00c0afdfa62 ("timers: Implement optimization for same expiry time in mod_timer()")
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1610241711220.4983@nanos
Cc: stable@vger.kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/time/timer.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 0d4b91c5a374..ccf913038f9f 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -971,6 +971,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	unsigned long clk = 0, flags;
 	int ret = 0;
 
+	BUG_ON(!timer->function);
+
 	/*
 	 * This is a common optimization triggered by the networking code - if
 	 * the timer is re-modified to have the same timeout or ends up in the
@@ -979,13 +981,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	if (timer_pending(timer)) {
 		if (timer->expires == expires)
 			return 1;
+
 		/*
-		 * Take the current timer_jiffies of base, but without holding
-		 * the lock!
+		 * We lock timer base and calculate the bucket index right
+		 * here. If the timer ends up in the same bucket, then we
+		 * just update the expiry time and avoid the whole
+		 * dequeue/enqueue dance.
 		 */
-		base = get_timer_base(timer->flags);
-		clk = base->clk;
+		base = lock_timer_base(timer, &flags);
 
+		clk = base->clk;
 		idx = calc_wheel_index(expires, clk);
 
 		/*
@@ -995,14 +1000,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		 */
 		if (idx == timer_get_idx(timer)) {
 			timer->expires = expires;
-			return 1;
+			ret = 1;
+			goto out_unlock;
 		}
+	} else {
+		base = lock_timer_base(timer, &flags);
 	}
 
 	timer_stats_timer_set_start_info(timer);
-	BUG_ON(!timer->function);
-
-	base = lock_timer_base(timer, &flags);
 
 	ret = detach_if_pending(timer, base, false);
 	if (!ret && pending_only)
@@ -1035,9 +1040,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	timer->expires = expires;
 	/*
 	 * If 'idx' was calculated above and the base time did not advance
-	 * between calculating 'idx' and taking the lock, only enqueue_timer()
-	 * and trigger_dyntick_cpu() is required. Otherwise we need to
-	 * (re)calculate the wheel index via internal_add_timer().
+	 * between calculating 'idx' and possibly switching the base, only
+	 * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise
+	 * we need to (re)calculate the wheel index via
+	 * internal_add_timer().
 	 */
 	if (idx != UINT_MAX && clk == base->clk) {
 		enqueue_timer(base, timer, idx);
-- 
cgit v1.3-14-g43fede


From 041ad7bc758db259bb960ef795197dd14aab19a6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 22 Oct 2016 11:07:35 +0000
Subject: timers: Prevent base clock rewind when forwarding clock

Ashton and Michael reported, that kernel versions 4.8 and later suffer from
USB timeouts which are caused by the timer wheel rework.

This is caused by a bug in the base clock forwarding mechanism, which leads
to timers expiring early. The scenario which leads to this is:

run_timers()
  while (jiffies >= base->clk) {
    collect_expired_timers();
    base->clk++;
    expire_timers();
  }

So base->clk = jiffies + 1. Now the cpu goes idle:

idle()
  get_next_timer_interrupt()
    nextevt = __next_time_interrupt();
    if (time_after(nextevt, base->clk))
       	base->clk = jiffies;

jiffies has not advanced since run_timers(), so this assignment effectively
decrements base->clk by one.

base->clk is the index into the timer wheel arrays. So let's assume the
following state after the base->clk increment in run_timers():

 jiffies = 0
 base->clk = 1

A timer gets enqueued with an expiry delta of 63 ticks (which is the case
with the USB timeout and HZ=250) so the resulting bucket index is:

  base->clk + delta = 1 + 63 = 64

The timer goes into the first wheel level. The array size is 64 so it ends
up in bucket 0, which is correct as it takes 63 ticks to advance base->clk
to index into bucket 0 again.

If the cpu goes idle before jiffies advance, then the bug in the forwarding
mechanism sets base->clk back to 0, so the next invocation of run_timers()
at the next tick will index into bucket 0 and therefore expire the timer 62
ticks too early.

Instead of blindly setting base->clk to jiffies we must make the forwarding
conditional on jiffies > base->clk, but we cannot use jiffies for this as
we might run into the following issue:

  if (time_after(jiffies, base->clk) {
    if (time_after(nextevt, base->clk))
       base->clk = jiffies;

jiffies can increment between the check and the assigment far enough to
advance beyond nextevt. So we need to use a stable value for checking.

get_next_timer_interrupt() has the basej argument which is the jiffies
value snapshot taken in the calling code. So we can just that.

Thanks to Ashton for bisecting and providing trace data!

Fixes: a683f390b93f ("timers: Forward the wheel clock whenever possible")
Reported-by: Ashton Holmes <scoopta@gmail.com>
Reported-by: Michael Thayer <michael.thayer@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Michal Necasek <michal.necasek@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: knut.osmundsen@oracle.com
Cc: stable@vger.kernel.org
Cc: stern@rowland.harvard.edu
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20161022110552.175308322@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ccf913038f9f..7c446fb5163a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1523,12 +1523,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
 	base->next_expiry = nextevt;
 	/*
-	 * We have a fresh next event. Check whether we can forward the base:
+	 * We have a fresh next event. Check whether we can forward the
+	 * base. We can only do that when @basej is past base->clk
+	 * otherwise we might rewind base->clk.
 	 */
-	if (time_after(nextevt, jiffies))
-		base->clk = jiffies;
-	else if (time_after(nextevt, base->clk))
-		base->clk = nextevt;
+	if (time_after(basej, base->clk)) {
+		if (time_after(nextevt, basej))
+			base->clk = basej;
+		else if (time_after(nextevt, base->clk))
+			base->clk = nextevt;
+	}
 
 	if (time_before_eq(nextevt, basej)) {
 		expires = basem;
-- 
cgit v1.3-14-g43fede


From 6bad6bccf2d717f652d37e63cf261eaa23466009 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 22 Oct 2016 11:07:37 +0000
Subject: timers: Prevent base clock corruption when forwarding

When a timer is enqueued we try to forward the timer base clock. This
mechanism has two issues:

1) Forwarding a remote base unlocked

The forwarding function is called from get_target_base() with the current
timer base lock held. But if the new target base is a different base than
the current base (can happen with NOHZ, sigh!) then the forwarding is done
on an unlocked base. This can lead to corruption of base->clk.

Solution is simple: Invoke the forwarding after the target base is locked.

2) Possible corruption due to jiffies advancing

This is similar to the issue in get_net_timer_interrupt() which was fixed
in the previous patch. jiffies can advance between check and assignement
and therefore advancing base->clk beyond the next expiry value.

So we need to read jiffies into a local variable once and do the checks and
assignment with the local copy.

Fixes: a683f390b93f("timers: Forward the wheel clock whenever possible")
Reported-by: Ashton Holmes <scoopta@gmail.com>
Reported-by: Michael Thayer <michael.thayer@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Michal Necasek <michal.necasek@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: knut.osmundsen@oracle.com
Cc: stable@vger.kernel.org
Cc: stern@rowland.harvard.edu
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20161022110552.253640125@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 7c446fb5163a..c611c47de884 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -878,7 +878,7 @@ static inline struct timer_base *get_timer_base(u32 tflags)
 
 #ifdef CONFIG_NO_HZ_COMMON
 static inline struct timer_base *
-__get_target_base(struct timer_base *base, unsigned tflags)
+get_target_base(struct timer_base *base, unsigned tflags)
 {
 #ifdef CONFIG_SMP
 	if ((tflags & TIMER_PINNED) || !base->migration_enabled)
@@ -891,25 +891,27 @@ __get_target_base(struct timer_base *base, unsigned tflags)
 
 static inline void forward_timer_base(struct timer_base *base)
 {
+	unsigned long jnow = READ_ONCE(jiffies);
+
 	/*
 	 * We only forward the base when it's idle and we have a delta between
 	 * base clock and jiffies.
 	 */
-	if (!base->is_idle || (long) (jiffies - base->clk) < 2)
+	if (!base->is_idle || (long) (jnow - base->clk) < 2)
 		return;
 
 	/*
 	 * If the next expiry value is > jiffies, then we fast forward to
 	 * jiffies otherwise we forward to the next expiry value.
 	 */
-	if (time_after(base->next_expiry, jiffies))
-		base->clk = jiffies;
+	if (time_after(base->next_expiry, jnow))
+		base->clk = jnow;
 	else
 		base->clk = base->next_expiry;
 }
 #else
 static inline struct timer_base *
-__get_target_base(struct timer_base *base, unsigned tflags)
+get_target_base(struct timer_base *base, unsigned tflags)
 {
 	return get_timer_this_cpu_base(tflags);
 }
@@ -917,14 +919,6 @@ __get_target_base(struct timer_base *base, unsigned tflags)
 static inline void forward_timer_base(struct timer_base *base) { }
 #endif
 
-static inline struct timer_base *
-get_target_base(struct timer_base *base, unsigned tflags)
-{
-	struct timer_base *target = __get_target_base(base, tflags);
-
-	forward_timer_base(target);
-	return target;
-}
 
 /*
  * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
@@ -1037,6 +1031,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		}
 	}
 
+	/* Try to forward a stale timer base clock */
+	forward_timer_base(base);
+
 	timer->expires = expires;
 	/*
 	 * If 'idx' was calculated above and the base time did not advance
-- 
cgit v1.3-14-g43fede


From f5d6d2da0d9098a4aa0ebcc187aa0fc167045d6b Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Wed, 26 Oct 2016 13:37:04 +0200
Subject: sched/fair: Remove unused but set variable 'rq'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit:

  8663e24d56dc ("sched/fair: Reorder cgroup creation code")

... the variable 'rq' in alloc_fair_sched_group() is set but no longer used.
Remove it to fix the following GCC warning when building with 'W=1':

  kernel/sched/fair.c:8842:13: warning: variable ‘rq’ set but not used [-Wunused-but-set-variable]

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20161026113704.8981-1-tklauser@distanz.ch
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d941c97dfbc3..c242944f5cbd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8839,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
-	struct rq *rq;
 	int i;
 
 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8854,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
 	for_each_possible_cpu(i) {
-		rq = cpu_rq(i);
-
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
-- 
cgit v1.3-14-g43fede


From 9dcb8b685fc30813b35ab4b4bf39244430753190 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 26 Oct 2016 10:15:30 -0700
Subject: mm: remove per-zone hashtable of bitlock waitqueues

The per-zone waitqueues exist because of a scalability issue with the
page waitqueues on some NUMA machines, but it turns out that they hurt
normal loads, and now with the vmalloced stacks they also end up
breaking gfs2 that uses a bit_wait on a stack object:

     wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE)

where 'gh' can be a reference to the local variable 'mount_gh' on the
stack of fill_super().

The reason the per-zone hash table breaks for this case is that there is
no "zone" for virtual allocations, and trying to look up the physical
page to get at it will fail (with a BUG_ON()).

It turns out that I actually complained to the mm people about the
per-zone hash table for another reason just a month ago: the zone lookup
also hurts the regular use of "unlock_page()" a lot, because the zone
lookup ends up forcing several unnecessary cache misses and generates
horrible code.

As part of that earlier discussion, we had a much better solution for
the NUMA scalability issue - by just making the page lock have a
separate contention bit, the waitqueue doesn't even have to be looked at
for the normal case.

Peter Zijlstra already has a patch for that, but let's see if anybody
even notices.  In the meantime, let's fix the actual gfs2 breakage by
simplifying the bitlock waitqueues and removing the per-zone issue.

Reported-by: Andreas Gruenbacher <agruenba@redhat.com>
Tested-by: Bob Peterson <rpeterso@redhat.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |  30 +------------
 kernel/sched/core.c    |  16 +++++++
 kernel/sched/wait.c    |  10 -----
 mm/filemap.c           |   4 +-
 mm/memory_hotplug.c    |  28 ------------
 mm/page_alloc.c        | 115 +------------------------------------------------
 6 files changed, 21 insertions(+), 182 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7f2ae99e5daf..0f088f3a2fed 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -440,33 +440,7 @@ struct zone {
 	seqlock_t		span_seqlock;
 #endif
 
-	/*
-	 * wait_table		-- the array holding the hash table
-	 * wait_table_hash_nr_entries	-- the size of the hash table array
-	 * wait_table_bits	-- wait_table_size == (1 << wait_table_bits)
-	 *
-	 * The purpose of all these is to keep track of the people
-	 * waiting for a page to become available and make them
-	 * runnable again when possible. The trouble is that this
-	 * consumes a lot of space, especially when so few things
-	 * wait on pages at a given time. So instead of using
-	 * per-page waitqueues, we use a waitqueue hash table.
-	 *
-	 * The bucket discipline is to sleep on the same queue when
-	 * colliding and wake all in that wait queue when removing.
-	 * When something wakes, it must check to be sure its page is
-	 * truly available, a la thundering herd. The cost of a
-	 * collision is great, but given the expected load of the
-	 * table, they should be so rare as to be outweighed by the
-	 * benefits from the saved space.
-	 *
-	 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
-	 * primary users of these fields, and in mm/page_alloc.c
-	 * free_area_init_core() performs the initialization of them.
-	 */
-	wait_queue_head_t	*wait_table;
-	unsigned long		wait_table_hash_nr_entries;
-	unsigned long		wait_table_bits;
+	int initialized;
 
 	/* Write-intensive fields used from the page allocator */
 	ZONE_PADDING(_pad1_)
@@ -546,7 +520,7 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
 
 static inline bool zone_is_initialized(struct zone *zone)
 {
-	return !!zone->wait_table;
+	return zone->initialized;
 }
 
 static inline bool zone_is_empty(struct zone *zone)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 94732d1ab00a..42d4027f9e26 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7515,11 +7515,27 @@ static struct kmem_cache *task_group_cache __read_mostly;
 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
 DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
 
+#define WAIT_TABLE_BITS 8
+#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
+static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
+
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+	const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+	unsigned long val = (unsigned long)word << shift | bit;
+
+	return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
+}
+EXPORT_SYMBOL(bit_waitqueue);
+
 void __init sched_init(void)
 {
 	int i, j;
 	unsigned long alloc_size = 0, ptr;
 
+	for (i = 0; i < WAIT_TABLE_SIZE; i++)
+		init_waitqueue_head(bit_wait_table + i);
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 #endif
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 4f7053579fe3..9453efe9b25a 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit)
 }
 EXPORT_SYMBOL(wake_up_bit);
 
-wait_queue_head_t *bit_waitqueue(void *word, int bit)
-{
-	const int shift = BITS_PER_LONG == 32 ? 5 : 6;
-	const struct zone *zone = page_zone(virt_to_page(word));
-	unsigned long val = (unsigned long)word << shift | bit;
-
-	return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
-}
-EXPORT_SYMBOL(bit_waitqueue);
-
 /*
  * Manipulate the atomic_t address to produce a better bit waitqueue table hash
  * index (we're keying off bit -1, but that would produce a horrible hash
diff --git a/mm/filemap.c b/mm/filemap.c
index 849f459ad078..c7fe2f16503f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -790,9 +790,7 @@ EXPORT_SYMBOL(__page_cache_alloc);
  */
 wait_queue_head_t *page_waitqueue(struct page *page)
 {
-	const struct zone *zone = page_zone(page);
-
-	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
+	return bit_waitqueue(page, 0);
 }
 EXPORT_SYMBOL(page_waitqueue);
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 962927309b6e..b18dab401be6 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -268,7 +268,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
 	unsigned long i, pfn, end_pfn, nr_pages;
 	int node = pgdat->node_id;
 	struct page *page;
-	struct zone *zone;
 
 	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
 	page = virt_to_page(pgdat);
@@ -276,19 +275,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
 	for (i = 0; i < nr_pages; i++, page++)
 		get_page_bootmem(node, page, NODE_INFO);
 
-	zone = &pgdat->node_zones[0];
-	for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
-		if (zone_is_initialized(zone)) {
-			nr_pages = zone->wait_table_hash_nr_entries
-				* sizeof(wait_queue_head_t);
-			nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
-			page = virt_to_page(zone->wait_table);
-
-			for (i = 0; i < nr_pages; i++, page++)
-				get_page_bootmem(node, page, NODE_INFO);
-		}
-	}
-
 	pfn = pgdat->node_start_pfn;
 	end_pfn = pgdat_end_pfn(pgdat);
 
@@ -2158,20 +2144,6 @@ void try_offline_node(int nid)
 	 */
 	node_set_offline(nid);
 	unregister_one_node(nid);
-
-	/* free waittable in each zone */
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		struct zone *zone = pgdat->node_zones + i;
-
-		/*
-		 * wait_table may be allocated from boot memory,
-		 * here only free if it's allocated by vmalloc.
-		 */
-		if (is_vmalloc_addr(zone->wait_table)) {
-			vfree(zone->wait_table);
-			zone->wait_table = NULL;
-		}
-	}
 }
 EXPORT_SYMBOL(try_offline_node);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2b3bf6767d54..de7c6e43b1c9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4976,72 +4976,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
 #endif
 }
 
-/*
- * Helper functions to size the waitqueue hash table.
- * Essentially these want to choose hash table sizes sufficiently
- * large so that collisions trying to wait on pages are rare.
- * But in fact, the number of active page waitqueues on typical
- * systems is ridiculously low, less than 200. So this is even
- * conservative, even though it seems large.
- *
- * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
- * waitqueues, i.e. the size of the waitq table given the number of pages.
- */
-#define PAGES_PER_WAITQUEUE	256
-
-#ifndef CONFIG_MEMORY_HOTPLUG
-static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
-{
-	unsigned long size = 1;
-
-	pages /= PAGES_PER_WAITQUEUE;
-
-	while (size < pages)
-		size <<= 1;
-
-	/*
-	 * Once we have dozens or even hundreds of threads sleeping
-	 * on IO we've got bigger problems than wait queue collision.
-	 * Limit the size of the wait table to a reasonable size.
-	 */
-	size = min(size, 4096UL);
-
-	return max(size, 4UL);
-}
-#else
-/*
- * A zone's size might be changed by hot-add, so it is not possible to determine
- * a suitable size for its wait_table.  So we use the maximum size now.
- *
- * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
- *
- *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
- *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
- *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
- *
- * The maximum entries are prepared when a zone's memory is (512K + 256) pages
- * or more by the traditional way. (See above).  It equals:
- *
- *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
- *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
- *    powerpc (64K page size)             : =  (32G +16M)byte.
- */
-static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
-{
-	return 4096UL;
-}
-#endif
-
-/*
- * This is an integer logarithm so that shifts can be used later
- * to extract the more random high bits from the multiplicative
- * hash function before the remainder is taken.
- */
-static inline unsigned long wait_table_bits(unsigned long size)
-{
-	return ffz(~size);
-}
-
 /*
  * Initially all pages are reserved - free ones are freed
  * up by free_all_bootmem() once the early boot process is
@@ -5304,49 +5238,6 @@ void __init setup_per_cpu_pageset(void)
 			alloc_percpu(struct per_cpu_nodestat);
 }
 
-static noinline __ref
-int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
-{
-	int i;
-	size_t alloc_size;
-
-	/*
-	 * The per-page waitqueue mechanism uses hashed waitqueues
-	 * per zone.
-	 */
-	zone->wait_table_hash_nr_entries =
-		 wait_table_hash_nr_entries(zone_size_pages);
-	zone->wait_table_bits =
-		wait_table_bits(zone->wait_table_hash_nr_entries);
-	alloc_size = zone->wait_table_hash_nr_entries
-					* sizeof(wait_queue_head_t);
-
-	if (!slab_is_available()) {
-		zone->wait_table = (wait_queue_head_t *)
-			memblock_virt_alloc_node_nopanic(
-				alloc_size, zone->zone_pgdat->node_id);
-	} else {
-		/*
-		 * This case means that a zone whose size was 0 gets new memory
-		 * via memory hot-add.
-		 * But it may be the case that a new node was hot-added.  In
-		 * this case vmalloc() will not be able to use this new node's
-		 * memory - this wait_table must be initialized to use this new
-		 * node itself as well.
-		 * To use this new node's memory, further consideration will be
-		 * necessary.
-		 */
-		zone->wait_table = vmalloc(alloc_size);
-	}
-	if (!zone->wait_table)
-		return -ENOMEM;
-
-	for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
-		init_waitqueue_head(zone->wait_table + i);
-
-	return 0;
-}
-
 static __meminit void zone_pcp_init(struct zone *zone)
 {
 	/*
@@ -5367,10 +5258,7 @@ int __meminit init_currently_empty_zone(struct zone *zone,
 					unsigned long size)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
-	int ret;
-	ret = zone_wait_table_init(zone, size);
-	if (ret)
-		return ret;
+
 	pgdat->nr_zones = zone_idx(zone) + 1;
 
 	zone->zone_start_pfn = zone_start_pfn;
@@ -5382,6 +5270,7 @@ int __meminit init_currently_empty_zone(struct zone *zone,
 			zone_start_pfn, (zone_start_pfn + size));
 
 	zone_init_free_lists(zone);
+	zone->initialized = 1;
 
 	return 0;
 }
-- 
cgit v1.3-14-g43fede


From b274c0bb394c6a69ac12feac7c2db81f5aff5a55 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 27 Oct 2016 17:46:21 -0700
Subject: kcov: properly check if we are in an interrupt

in_interrupt() returns a nonzero value when we are either in an
interrupt or have bh disabled via local_bh_disable().  Since we are
interested in only ignoring coverage from actual interrupts, do a proper
check instead of just calling in_interrupt().

As a result of this change, kcov will start to collect coverage from
within local_bh_disable()/local_bh_enable() sections.

Link: http://lkml.kernel.org/r/1476115803-20712-1-git-send-email-andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Acked-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Nicolai Stange <nicstange@gmail.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: James Morse <james.morse@arm.com>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kcov.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kcov.c b/kernel/kcov.c
index 8d44b3fea9d0..30e6d05aa5a9 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -53,8 +53,15 @@ void notrace __sanitizer_cov_trace_pc(void)
 	/*
 	 * We are interested in code coverage as a function of a syscall inputs,
 	 * so we ignore code executed in interrupts.
+	 * The checks for whether we are in an interrupt are open-coded, because
+	 * 1. We can't use in_interrupt() here, since it also returns true
+	 *    when we are inside local_bh_disable() section.
+	 * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()),
+	 *    since that leads to slower generated code (three separate tests,
+	 *    one for each of the flags).
 	 */
-	if (!t || in_interrupt())
+	if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET
+							| NMI_MASK)))
 		return;
 	mode = READ_ONCE(t->kcov_mode);
 	if (mode == KCOV_MODE_TRACE) {
-- 
cgit v1.3-14-g43fede


From 0933840acf7b65d6d30a5b6089d882afea57aca3 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 20 Oct 2016 13:10:11 +0200
Subject: perf/core: Protect PMU device removal with a 'pmu_bus_running' check,
 to fix CONFIG_DEBUG_TEST_DRIVER_REMOVE=y kernel panic

CAI Qian reported a crash in the PMU uncore device removal code,
enabled by the CONFIG_DEBUG_TEST_DRIVER_REMOVE=y option:

  https://marc.info/?l=linux-kernel&m=147688837328451

The reason for the crash is that perf_pmu_unregister() tries to remove
a PMU device which is not added at this point. We add PMU devices
only after pmu_bus is registered, which happens in the
perf_event_sysfs_init() call and sets the 'pmu_bus_running' flag.

The fix is to get the 'pmu_bus_running' flag state at the point
the PMU is taken out of the PMU list and remove the device
later only if it's set.

Reported-by: CAI Qian <caiqian@redhat.com>
Tested-by: CAI Qian <caiqian@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20161020111011.GA13361@krava
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index c6e47e97b33f..a5d2e62faf7e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8855,7 +8855,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register);
 
 void perf_pmu_unregister(struct pmu *pmu)
 {
+	int remove_device;
+
 	mutex_lock(&pmus_lock);
+	remove_device = pmu_bus_running;
 	list_del_rcu(&pmu->entry);
 	mutex_unlock(&pmus_lock);
 
@@ -8869,10 +8872,12 @@ void perf_pmu_unregister(struct pmu *pmu)
 	free_percpu(pmu->pmu_disable_count);
 	if (pmu->type >= PERF_TYPE_MAX)
 		idr_remove(&pmu_idr, pmu->type);
-	if (pmu->nr_addr_filters)
-		device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
-	device_del(pmu->dev);
-	put_device(pmu->dev);
+	if (remove_device) {
+		if (pmu->nr_addr_filters)
+			device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
+		device_del(pmu->dev);
+		put_device(pmu->dev);
+	}
 	free_pmu_context(pmu);
 }
 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
-- 
cgit v1.3-14-g43fede


From 5aab90ce1ec449912a2ebc4d45e0c85dac29e9dd Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Wed, 26 Oct 2016 11:48:24 +0200
Subject: perf/powerpc: Don't call perf_event_disable() from atomic context

The trinity syscall fuzzer triggered following WARN() on powerpc:

  WARNING: CPU: 9 PID: 2998 at arch/powerpc/kernel/hw_breakpoint.c:278
  ...
  NIP [c00000000093aedc] .hw_breakpoint_handler+0x28c/0x2b0
  LR [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0
  Call Trace:
  [c0000002f7933580] [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0 (unreliable)
  [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0
  [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0
  [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0
  [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100
  [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48

Followed by a lockdep warning:

  ===============================
  [ INFO: suspicious RCU usage. ]
  4.8.0-rc5+ #7 Tainted: G        W
  -------------------------------
  ./include/linux/rcupdate.h:556 Illegal context switch in RCU read-side critical section!

  other info that might help us debug this:

  rcu_scheduler_active = 1, debug_locks = 0
  2 locks held by ls/2998:
   #0:  (rcu_read_lock){......}, at: [<c0000000000f6a00>] .__atomic_notifier_call_chain+0x0/0x1c0
   #1:  (rcu_read_lock){......}, at: [<c00000000093ac50>] .hw_breakpoint_handler+0x0/0x2b0

  stack backtrace:
  CPU: 9 PID: 2998 Comm: ls Tainted: G        W       4.8.0-rc5+ #7
  Call Trace:
  [c0000002f7933150] [c00000000094b1f8] .dump_stack+0xe0/0x14c (unreliable)
  [c0000002f79331e0] [c00000000013c468] .lockdep_rcu_suspicious+0x138/0x180
  [c0000002f7933270] [c0000000001005d8] .___might_sleep+0x278/0x2e0
  [c0000002f7933300] [c000000000935584] .mutex_lock_nested+0x64/0x5a0
  [c0000002f7933410] [c00000000023084c] .perf_event_ctx_lock_nested+0x16c/0x380
  [c0000002f7933500] [c000000000230a80] .perf_event_disable+0x20/0x60
  [c0000002f7933580] [c00000000093aeec] .hw_breakpoint_handler+0x29c/0x2b0
  [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0
  [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0
  [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0
  [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100
  [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48

While it looks like the first WARN() is probably valid, the other one is
triggered by disabling event via perf_event_disable() from atomic context.

The event is disabled here in case we were not able to emulate
the instruction that hit the breakpoint. By disabling the event
we unschedule the event and make sure it's not scheduled back.

But we can't call perf_event_disable() from atomic context, instead
we need to use the event's pending_disable irq_work method to disable it.

Reported-by: Jan Stancek <jstancek@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20161026094824.GA21397@krava
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/powerpc/kernel/hw_breakpoint.c |  2 +-
 include/linux/perf_event.h          |  1 +
 kernel/events/core.c                | 10 ++++++++--
 3 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index 9781c69eae57..03d089b3ed72 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -275,7 +275,7 @@ int hw_breakpoint_handler(struct die_args *args)
 	if (!stepped) {
 		WARN(1, "Unable to handle hardware breakpoint. Breakpoint at "
 			"0x%lx will be disabled.", info->address);
-		perf_event_disable(bp);
+		perf_event_disable_inatomic(bp);
 		goto out;
 	}
 	/*
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 060d0ede88df..4741ecdb9817 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1257,6 +1257,7 @@ extern u64 perf_swevent_set_period(struct perf_event *event);
 extern void perf_event_enable(struct perf_event *event);
 extern void perf_event_disable(struct perf_event *event);
 extern void perf_event_disable_local(struct perf_event *event);
+extern void perf_event_disable_inatomic(struct perf_event *event);
 extern void perf_event_task_tick(void);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a5d2e62faf7e..0e292132efac 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1960,6 +1960,12 @@ void perf_event_disable(struct perf_event *event)
 }
 EXPORT_SYMBOL_GPL(perf_event_disable);
 
+void perf_event_disable_inatomic(struct perf_event *event)
+{
+	event->pending_disable = 1;
+	irq_work_queue(&event->pending);
+}
+
 static void perf_set_shadow_time(struct perf_event *event,
 				 struct perf_event_context *ctx,
 				 u64 tstamp)
@@ -7075,8 +7081,8 @@ static int __perf_event_overflow(struct perf_event *event,
 	if (events && atomic_dec_and_test(&event->event_limit)) {
 		ret = 1;
 		event->pending_kill = POLL_HUP;
-		event->pending_disable = 1;
-		irq_work_queue(&event->pending);
+
+		perf_event_disable_inatomic(event);
 	}
 
 	READ_ONCE(event->overflow_handler)(event, data, regs);
-- 
cgit v1.3-14-g43fede


From 405c0759712f57b680f66aee9c55cd06ad1cbdef Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 31 Oct 2016 08:11:43 -0700
Subject: fork: Add task stack refcounting sanity check and prevent premature
 task stack freeing

If something goes wrong with task stack refcounting and a stack
refcount hits zero too early, warn and leak it rather than
potentially freeing it early (and silently).

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/f29119c783a9680a4b4656e751b6123917ace94b.1477926663.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/fork.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 623259fc794d..997ac1d584f7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -315,6 +315,9 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
 
 static void release_task_stack(struct task_struct *tsk)
 {
+	if (WARN_ON(tsk->state != TASK_DEAD))
+		return;  /* Better to leak the stack than to free prematurely */
+
 	account_kernel_stack(tsk, -1);
 	arch_release_thread_stack(tsk->stack);
 	free_thread_stack(tsk);
@@ -1862,6 +1865,7 @@ bad_fork_cleanup_count:
 	atomic_dec(&p->cred->user->processes);
 	exit_creds(p);
 bad_fork_free:
+	p->state = TASK_DEAD;
 	put_task_stack(p);
 	free_task(p);
 fork_out:
-- 
cgit v1.3-14-g43fede


From ceb75787bc75d0a7b88519ab8a68067ac690f55a Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Tue, 1 Nov 2016 11:49:56 +0100
Subject: PM / sleep: fix device reference leak in test_suspend

Make sure to drop the reference taken by class_find_device() after
opening the RTC device.

Fixes: 77437fd4e61f (pm: boot time suspend selftest)
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/suspend_test.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 084452e34a12..bdff5ed57f10 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -203,8 +203,10 @@ static int __init test_suspend(void)
 
 	/* RTCs have initialized by now too ... can we use one? */
 	dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
-	if (dev)
+	if (dev) {
 		rtc = rtc_class_open(dev_name(dev));
+		put_device(dev);
+	}
 	if (!rtc) {
 		printk(warn_no_rtc);
 		return 0;
-- 
cgit v1.3-14-g43fede


From 382005027fedc50b28d40ae64ef1461cca38953e Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 2 Nov 2016 19:50:29 +0900
Subject: sched/core: Fix oops in sched_show_task()

When CONFIG_THREAD_INFO_IN_TASK=y, it is possible that an exited thread
remains in the task list after its stack pointer was already set to NULL.

Therefore, thread_saved_pc() and stack_not_used() in sched_show_task()
will trigger NULL pointer dereference if an attempt to dump such thread's
traces (e.g. SysRq-t, khungtaskd) is made.

Since show_stack() in sched_show_task() calls try_get_task_stack() and
sched_show_task() is called from interrupt context, calling
try_get_task_stack() from sched_show_task() will be safe as well.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bp@alien8.de
Cc: brgerst@gmail.com
Cc: jann@thejh.net
Cc: keescook@chromium.org
Cc: linux-api@vger.kernel.org
Cc: tycho.andersen@canonical.com
Link: http://lkml.kernel.org/r/201611021950.FEJ34368.HFFJOOMLtQOVSF@I-love.SAKURA.ne.jp
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 42d4027f9e26..9abf66b6c271 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5192,6 +5192,8 @@ void sched_show_task(struct task_struct *p)
 	int ppid;
 	unsigned long state = p->state;
 
+	if (!try_get_task_stack(p))
+		return;
 	if (state)
 		state = __ffs(state) + 1;
 	printk(KERN_INFO "%-15.15s %c", p->comm,
@@ -5221,6 +5223,7 @@ void sched_show_task(struct task_struct *p)
 
 	print_worker_info(KERN_INFO, p);
 	show_stack(p, NULL);
+	put_task_stack(p);
 }
 
 void show_state_filter(unsigned long state_filter)
-- 
cgit v1.3-14-g43fede


From 8243d5597793b5e85143c9a935e1b971c59740a9 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 1 Nov 2016 17:47:18 -0600
Subject: sched/core: Remove pointless printout in sched_show_task()

In sched_show_task() we print out a useless hex number, not even a
symbol, and there's a big question mark whether this even makes sense
anyway, I suspect we should just remove it all.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bp@alien8.de
Cc: brgerst@gmail.com
Cc: jann@thejh.net
Cc: keescook@chromium.org
Cc: linux-api@vger.kernel.org
Cc: tycho.andersen@canonical.com
Link: http://lkml.kernel.org/r/CA+55aFzphURPFzAvU4z6Moy7ZmimcwPuUdYU8bj9z0J+S8X1rw@mail.gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9abf66b6c271..154fd689fe02 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5198,17 +5198,8 @@ void sched_show_task(struct task_struct *p)
 		state = __ffs(state) + 1;
 	printk(KERN_INFO "%-15.15s %c", p->comm,
 		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
-#if BITS_PER_LONG == 32
-	if (state == TASK_RUNNING)
-		printk(KERN_CONT " running  ");
-	else
-		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
-#else
 	if (state == TASK_RUNNING)
 		printk(KERN_CONT "  running task    ");
-	else
-		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
-#endif
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
-- 
cgit v1.3-14-g43fede


From 243d52126184b072a18fe2130ce0008f8aa3a340 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Thu, 3 Nov 2016 09:42:36 -0700
Subject: taskstats: fix the length of cgroupstats_cmd_get_policy

cgroupstats_cmd_get_policy is [CGROUPSTATS_CMD_ATTR_MAX+1],
taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1],
but their family.maxattr is TASKSTATS_CMD_ATTR_MAX.
CGROUPSTATS_CMD_ATTR_MAX is less than TASKSTATS_CMD_ATTR_MAX,
so we could end up accessing out-of-bound.

Change cgroupstats_cmd_get_policy to TASKSTATS_CMD_ATTR_MAX+1,
this is safe because the rest are initialized to 0's.

Reported-by: Andrey Konovalov <andreyknvl@google.com>
Tested-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/taskstats.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b3f05ee20d18..cbb387a265db 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -54,7 +54,11 @@ static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1
 	[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
 	[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
 
-static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
+/*
+ * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family.
+ * Make sure they are always aligned.
+ */
+static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
 	[CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
 };
 
-- 
cgit v1.3-14-g43fede


From 483bed2b0ddd12ec33fc9407e0c6e1088e77a97c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Nov 2016 00:01:19 +0100
Subject: bpf: fix htab map destruction when extra reserve is in use

Commit a6ed3ea65d98 ("bpf: restore behavior of bpf_map_update_elem")
added an extra per-cpu reserve to the hash table map to restore old
behaviour from pre prealloc times. When non-prealloc is in use for a
map, then problem is that once a hash table extra element has been
linked into the hash-table, and the hash table is destroyed due to
refcount dropping to zero, then htab_map_free() -> delete_all_elements()
will walk the whole hash table and drop all elements via htab_elem_free().
The problem is that the element from the extra reserve is first fed
to the wrong backend allocator and eventually freed twice.

Fixes: a6ed3ea65d98 ("bpf: restore behavior of bpf_map_update_elem")
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/hashtab.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 570eeca7bdfa..ad1bc67aff1b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -687,7 +687,8 @@ static void delete_all_elements(struct bpf_htab *htab)
 
 		hlist_for_each_entry_safe(l, n, head, hash_node) {
 			hlist_del_rcu(&l->hash_node);
-			htab_elem_free(htab, l);
+			if (l->state != HTAB_EXTRA_ELEM_USED)
+				htab_elem_free(htab, l);
 		}
 	}
 }
-- 
cgit v1.3-14-g43fede


From 20b2b24f91f70e7d3f0918c077546cb21bd73a87 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Nov 2016 00:56:31 +0100
Subject: bpf: fix map not being uncharged during map creation failure

In map_create(), we first find and create the map, then once that
suceeded, we charge it to the user's RLIMIT_MEMLOCK, and then fetch
a new anon fd through anon_inode_getfd(). The problem is, once the
latter fails f.e. due to RLIMIT_NOFILE limit, then we only destruct
the map via map->ops->map_free(), but without uncharging the previously
locked memory first. That means that the user_struct allocation is
leaked as well as the accounted RLIMIT_MEMLOCK memory not released.
Make the label names in the fix consistent with bpf_prog_load().

Fixes: aaac3ba95e4c ("bpf: charge user for creation of BPF maps and programs")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 228f962447a5..237f3d6a7ddc 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -194,7 +194,7 @@ static int map_create(union bpf_attr *attr)
 
 	err = bpf_map_charge_memlock(map);
 	if (err)
-		goto free_map;
+		goto free_map_nouncharge;
 
 	err = bpf_map_new_fd(map);
 	if (err < 0)
@@ -204,6 +204,8 @@ static int map_create(union bpf_attr *attr)
 	return err;
 
 free_map:
+	bpf_map_uncharge_memlock(map);
+free_map_nouncharge:
 	map->ops->map_free(map);
 	return err;
 }
-- 
cgit v1.3-14-g43fede


From 7ee7e87dfb158e79019ea1d5ea1b0e6f2bc93ee4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Nov 2016 19:57:00 +0100
Subject: genirq: Use irq type from irqdata instead of irqdesc

The type flags in the irq descriptor are there for historical reasons and
only updated via irq_modify_status() or irq_set_type(). Both functions also
update the type flags in irqdata. __setup_irq() is the only left over user
of the type flags in the irq descriptor.

If __setup_irq() is called with empty irq type flags, then the type flags
are retrieved from irqdata. If an interrupt is shared, then the type flags
are compared with the type flags stored in the irq descriptor.

On x86 the ioapic does not have a irq_set_type() callback because the type
is defined in the BIOS tables and cannot be changed. The type is stored in
irqdata at setup time without updating the type data in the irq
descriptor. As a result the comparison described above fails.

There is no point in updating the irq descriptor flags because the only
relevant storage is irqdata. Use the type flags from irqdata for both
retrieval and comparison in __setup_irq() instead.

Aside of that the print out in case of non matching type flags has the old
and new type flags arguments flipped. Fix that as well.

For correctness sake the flags stored in the irq descriptor should be
removed, but this is beyond the scope of this bugfix and will be done in a
later patch.

Fixes: 4b357daed698 ("genirq: Look-up trigger type if not specified by caller")
Reported-and-tested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Jon Hunter <jonathanh@nvidia.com>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1611072020360.3501@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/manage.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9c4d30483264..6b669593e7eb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1341,12 +1341,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 
 	} else if (new->flags & IRQF_TRIGGER_MASK) {
 		unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
-		unsigned int omsk = irq_settings_get_trigger_mask(desc);
+		unsigned int omsk = irqd_get_trigger_type(&desc->irq_data);
 
 		if (nmsk != omsk)
 			/* hope the handler works with current  trigger mode */
 			pr_warn("irq %d uses trigger mode %u; requested %u\n",
-				irq, nmsk, omsk);
+				irq, omsk, nmsk);
 	}
 
 	*old_ptr = new;
-- 
cgit v1.3-14-g43fede


From c6c7d83b9c9e6a8b3e6d84c820ac61fbffc9e396 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Thu, 10 Nov 2016 10:46:26 -0800
Subject: Revert "console: don't prefer first registered if DT specifies
 stdout-path"

This reverts commit 05fd007e4629 ("console: don't prefer first
registered if DT specifies stdout-path").

The reverted commit changes existing behavior on which many ARM boards
rely.  Many ARM small-board-computers, like e.g.  the Raspberry Pi have
both a video output and a serial console.  Depending on whether the user
is using the device as a more regular computer; or as a headless device
we need to have the console on either one or the other.

Many users rely on the kernel behavior of the console being present on
both outputs, before the reverted commit the console setup with no
console= kernel arguments on an ARM board which sets stdout-path in dt
would look like this:

  [root@localhost ~]# cat /proc/consoles
  ttyS0                -W- (EC p a)    4:64
  tty0                 -WU (E  p  )    4:1

Where as after the reverted commit, it looks like this:

  [root@localhost ~]# cat /proc/consoles
  ttyS0                -W- (EC p a)    4:64

This commit reverts commit 05fd007e4629 ("console: don't prefer first
registered if DT specifies stdout-path") restoring the original
behavior.

Fixes: 05fd007e4629 ("console: don't prefer first registered if DT specifies stdout-path")
Link: http://lkml.kernel.org/r/20161104121135.4780-2-hdegoede@redhat.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Cc: Paul Burton <paul.burton@imgtec.com>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Frank Rowand <frowand.list@gmail.com>
Cc: Thorsten Leemhuis <regressions@leemhuis.info>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/of/base.c       |  2 --
 include/linux/console.h |  6 ------
 kernel/printk/printk.c  | 13 +------------
 3 files changed, 1 insertion(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index d687e6de24a0..a0bccb54a9bd 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -2077,8 +2077,6 @@ void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align))
 			name = of_get_property(of_aliases, "stdout", NULL);
 		if (name)
 			of_stdout = of_find_node_opts_by_path(name, &of_stdout_options);
-		if (of_stdout)
-			console_set_by_of();
 	}
 
 	if (!of_aliases)
diff --git a/include/linux/console.h b/include/linux/console.h
index 3672809234a7..d530c4627e54 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -173,12 +173,6 @@ static inline void console_sysfs_notify(void)
 #endif
 extern bool console_suspend_enabled;
 
-#ifdef CONFIG_OF
-extern void console_set_by_of(void);
-#else
-static inline void console_set_by_of(void) {}
-#endif
-
 /* Suspend and resume console messages over PM events */
 extern void suspend_console(void);
 extern void resume_console(void);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index de08fc90baaf..5028f4fd504a 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -253,17 +253,6 @@ static int preferred_console = -1;
 int console_set_on_cmdline;
 EXPORT_SYMBOL(console_set_on_cmdline);
 
-#ifdef CONFIG_OF
-static bool of_specified_console;
-
-void console_set_by_of(void)
-{
-	of_specified_console = true;
-}
-#else
-# define of_specified_console false
-#endif
-
 /* Flag: console code may call schedule() */
 static int console_may_schedule;
 
@@ -2657,7 +2646,7 @@ void register_console(struct console *newcon)
 	 *	didn't select a console we take the first one
 	 *	that registers here.
 	 */
-	if (preferred_console < 0 && !of_specified_console) {
+	if (preferred_console < 0) {
 		if (newcon->index < 0)
 			newcon->index = 0;
 		if (newcon->setup == NULL ||
-- 
cgit v1.3-14-g43fede


From f5c9f9c72395c3291c2e35c905dedae2b98475a4 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 14 Nov 2016 09:31:52 -0800
Subject: Revert "printk: make reading the kernel log flush pending lines"

This reverts commit bfd8d3f23b51018388be0411ccbc2d56277fe294.

It turns out that this flushes things much too aggressiverly, and causes
lines to break up when the system logger races with new continuation
lines being printed.

There's a pending patch to make printk() flushing much more
straightforward, but it's too invasive for 4.9, so in the meantime let's
just not make the system message logging flush continuation lines.
They'll be flushed by the final newline anyway.

Suggested-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/printk.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 5028f4fd504a..f7a55e9ff2f7 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -783,8 +783,6 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
 	return ret;
 }
 
-static void cont_flush(void);
-
 static ssize_t devkmsg_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
 {
@@ -800,7 +798,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 	if (ret)
 		return ret;
 	raw_spin_lock_irq(&logbuf_lock);
-	cont_flush();
 	while (user->seq == log_next_seq) {
 		if (file->f_flags & O_NONBLOCK) {
 			ret = -EAGAIN;
@@ -863,7 +860,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
 		return -ESPIPE;
 
 	raw_spin_lock_irq(&logbuf_lock);
-	cont_flush();
 	switch (whence) {
 	case SEEK_SET:
 		/* the first record */
@@ -902,7 +898,6 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &log_wait, wait);
 
 	raw_spin_lock_irq(&logbuf_lock);
-	cont_flush();
 	if (user->seq < log_next_seq) {
 		/* return error when data has vanished underneath us */
 		if (user->seq < log_first_seq)
@@ -1289,7 +1284,6 @@ static int syslog_print(char __user *buf, int size)
 		size_t skip;
 
 		raw_spin_lock_irq(&logbuf_lock);
-		cont_flush();
 		if (syslog_seq < log_first_seq) {
 			/* messages are gone, move to first one */
 			syslog_seq = log_first_seq;
@@ -1349,7 +1343,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 		return -ENOMEM;
 
 	raw_spin_lock_irq(&logbuf_lock);
-	cont_flush();
 	if (buf) {
 		u64 next_seq;
 		u64 seq;
@@ -1511,7 +1504,6 @@ int do_syslog(int type, char __user *buf, int len, int source)
 	/* Number of chars in the log buffer */
 	case SYSLOG_ACTION_SIZE_UNREAD:
 		raw_spin_lock_irq(&logbuf_lock);
-		cont_flush();
 		if (syslog_seq < log_first_seq) {
 			/* messages are gone, move to first one */
 			syslog_seq = log_first_seq;
@@ -3028,7 +3020,6 @@ void kmsg_dump(enum kmsg_dump_reason reason)
 		dumper->active = true;
 
 		raw_spin_lock_irqsave(&logbuf_lock, flags);
-		cont_flush();
 		dumper->cur_seq = clear_seq;
 		dumper->cur_idx = clear_idx;
 		dumper->next_seq = log_next_seq;
@@ -3119,7 +3110,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
 	bool ret;
 
 	raw_spin_lock_irqsave(&logbuf_lock, flags);
-	cont_flush();
 	ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
 	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 
@@ -3162,7 +3152,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 		goto out;
 
 	raw_spin_lock_irqsave(&logbuf_lock, flags);
-	cont_flush();
 	if (dumper->cur_seq < log_first_seq) {
 		/* messages are gone, move to first available one */
 		dumper->cur_seq = log_first_seq;
-- 
cgit v1.3-14-g43fede


From 977c1f9c8c022d0173181766b34a0db3705265a4 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 7 Nov 2016 15:14:20 -0800
Subject: ftrace: Ignore FTRACE_FL_DISABLED while walking dyn_ftrace records

ftrace_shutdown() checks for sanity of ftrace records
and if dyn_ftrace->flags is not zero, it will warn.
It can happen that 'flags' are set to FTRACE_FL_DISABLED at this point,
since some module was loaded, but before ftrace_module_enable()
cleared the flags for this module.

In other words the module.c is doing:
ftrace_module_init(mod); // calls ftrace_update_code() that sets flags=FTRACE_FL_DISABLED
... // here ftrace_shutdown() is called that warns, since
err = prepare_coming_module(mod); // didn't have a chance to clear FTRACE_FL_DISABLED

Fix it by ignoring disabled records.
It's similar to what __ftrace_hash_rec_update() is already doing.

Link: http://lkml.kernel.org/r/1478560460-3818619-1-git-send-email-ast@fb.com

Cc: stable@vger.kernel.org
Fixes: b7ffffbb46f2 "ftrace: Add infrastructure for delayed enabling of module functions"
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2050a7652a86..326498baab83 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2763,7 +2763,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
 		struct dyn_ftrace *rec;
 
 		do_for_each_ftrace_rec(pg, rec) {
-			if (FTRACE_WARN_ON_ONCE(rec->flags))
+			if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED))
 				pr_warn("  %pS flags:%lx\n",
 					(void *)rec->ip, rec->flags);
 		} while_for_each_ftrace_rec();
-- 
cgit v1.3-14-g43fede


From 546fece4eae871f033925ccf0ff2b740725ae915 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Mon, 14 Nov 2016 16:31:49 -0500
Subject: ftrace: Add more checks for FTRACE_FL_DISABLED in processing ip
 records

When a module is first loaded and its function ip records are added to the
ftrace list of functions to modify, they are set to DISABLED, as their text
is still in a read only state. When the module is fully loaded, and can be
updated, the flag is cleared, and if their's any functions that should be
tracing them, it is updated at that moment.

But there's several locations that do record accounting and should ignore
records that are marked as disabled, or they can cause issues.

Alexei already fixed one location, but others need to be addressed.

Cc: stable@vger.kernel.org
Fixes: b7ffffbb46f2 "ftrace: Add infrastructure for delayed enabling of module functions"
Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 326498baab83..da87b3cba5b3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1862,6 +1862,10 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
 
 	/* Update rec->flags */
 	do_for_each_ftrace_rec(pg, rec) {
+
+		if (rec->flags & FTRACE_FL_DISABLED)
+			continue;
+
 		/* We need to update only differences of filter_hash */
 		in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
 		in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
@@ -1884,6 +1888,10 @@ rollback:
 
 	/* Roll back what we did above */
 	do_for_each_ftrace_rec(pg, rec) {
+
+		if (rec->flags & FTRACE_FL_DISABLED)
+			continue;
+
 		if (rec == end)
 			goto err_out;
 
@@ -2397,6 +2405,10 @@ void __weak ftrace_replace_code(int enable)
 		return;
 
 	do_for_each_ftrace_rec(pg, rec) {
+
+		if (rec->flags & FTRACE_FL_DISABLED)
+			continue;
+
 		failed = __ftrace_replace_code(rec, enable);
 		if (failed) {
 			ftrace_bug(failed, rec);
@@ -3598,6 +3610,10 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
 		goto out_unlock;
 
 	do_for_each_ftrace_rec(pg, rec) {
+
+		if (rec->flags & FTRACE_FL_DISABLED)
+			continue;
+
 		if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) {
 			ret = enter_record(hash, rec, clear_filter);
 			if (ret < 0) {
@@ -3793,6 +3809,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 
 	do_for_each_ftrace_rec(pg, rec) {
 
+		if (rec->flags & FTRACE_FL_DISABLED)
+			continue;
+
 		if (!ftrace_match_record(rec, &func_g, NULL, 0))
 			continue;
 
@@ -4685,6 +4704,9 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
 
 	do_for_each_ftrace_rec(pg, rec) {
 
+		if (rec->flags & FTRACE_FL_DISABLED)
+			continue;
+
 		if (ftrace_match_record(rec, &func_g, NULL, 0)) {
 			/* if it is in the array */
 			exists = false;
-- 
cgit v1.3-14-g43fede


From 864c2357ca898c6171fe5284f5ecc795c8ce27a8 Mon Sep 17 00:00:00 2001
From: David Carrillo-Cisneros <davidcc@google.com>
Date: Tue, 1 Nov 2016 11:52:58 -0700
Subject: perf/core: Do not set cpuctx->cgrp for unscheduled cgroups

Commit:

  db4a835601b7 ("perf/core: Set cgroup in CPU contexts for new cgroup events")

failed to verify that event->cgrp is actually the scheduled cgroup
in a CPU before setting cpuctx->cgrp. This patch fixes that.

Now that there is a different path for scheduled and unscheduled
cgroup, add a warning to catch when cpuctx->cgrp is still set after
the last cgroup event has been unsheduled.

To verify the bug:

  # Create 2 cgroups.
  mkdir /dev/cgroups/devices/g1
  mkdir /dev/cgroups/devices/g2

  # launch a task, bind it to a cpu and move it to g1
  CPU=2
  while :; do : ; done &
  P=$!

  taskset -pc $CPU $P
  echo $P > /dev/cgroups/devices/g1/tasks

  # monitor g2 (it runs no tasks) and observe output
  perf stat -e cycles -I 1000 -C $CPU -G g2

  #           time             counts unit events
     1.000091408          7,579,527      cycles                    g2
     2.000350111      <not counted>      cycles                    g2
     3.000589181      <not counted>      cycles                    g2
     4.000771428      <not counted>      cycles                    g2

  # note first line that displays that a task run in g2, despite
  # g2 having no tasks. This is because cpuctx->cgrp was wrongly
  # set when context of new event was installed.
  # After applying the fix we obtain the right output:

  perf stat -e cycles -I 1000 -C $CPU -G g2
  #           time             counts unit events
     1.000119615      <not counted>      cycles                    g2
     2.000389430      <not counted>      cycles                    g2
     3.000590962      <not counted>      cycles                    g2

Signed-off-by: David Carrillo-Cisneros <davidcc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nilay Vaish <nilayvaish@gmail.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Link: http://lkml.kernel.org/r/1478026378-86083-1-git-send-email-davidcc@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0e292132efac..ff230bb4a02e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -902,6 +902,17 @@ list_update_cgroup_event(struct perf_event *event,
 	 * this will always be called from the right CPU.
 	 */
 	cpuctx = __get_cpu_context(ctx);
+
+	/* Only set/clear cpuctx->cgrp if current task uses event->cgrp. */
+	if (perf_cgroup_from_task(current, ctx) != event->cgrp) {
+		/*
+		 * We are removing the last cpu event in this context.
+		 * If that event is not active in this cpu, cpuctx->cgrp
+		 * should've been cleared by perf_cgroup_switch.
+		 */
+		WARN_ON_ONCE(!add && cpuctx->cgrp);
+		return;
+	}
 	cpuctx->cgrp = add ? event->cgrp : NULL;
 }
 
-- 
cgit v1.3-14-g43fede


From f23cc643f9baec7f71f2b74692da3cf03abbbfda Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Mon, 14 Nov 2016 15:45:36 -0500
Subject: bpf: fix range arithmetic for bpf map access

I made some invalid assumptions with BPF_AND and BPF_MOD that could result in
invalid accesses to bpf map entries.  Fix this up by doing a few things

1) Kill BPF_MOD support.  This doesn't actually get used by the compiler in real
life and just adds extra complexity.

2) Fix the logic for BPF_AND, don't allow AND of negative numbers and set the
minimum value to 0 for positive AND's.

3) Don't do operations on the ranges if they are set to the limits, as they are
by definition undefined, and allowing arithmetic operations on those values
could make them appear valid when they really aren't.

This fixes the testcase provided by Jann as well as a few other theoretical
problems.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h |  5 ++--
 kernel/bpf/verifier.c        | 70 +++++++++++++++++++++++++++++---------------
 2 files changed, 50 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7035b997aaa5..6aaf425cebc3 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -14,7 +14,7 @@
   * are obviously wrong for any sort of memory access.
   */
 #define BPF_REGISTER_MAX_RANGE (1024 * 1024 * 1024)
-#define BPF_REGISTER_MIN_RANGE -(1024 * 1024 * 1024)
+#define BPF_REGISTER_MIN_RANGE -1
 
 struct bpf_reg_state {
 	enum bpf_reg_type type;
@@ -22,7 +22,8 @@ struct bpf_reg_state {
 	 * Used to determine if any memory access using this register will
 	 * result in a bad access.
 	 */
-	u64 min_value, max_value;
+	s64 min_value;
+	u64 max_value;
 	union {
 		/* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */
 		s64 imm;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 99a7e5b388f2..6a936159c6e0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -216,8 +216,8 @@ static void print_verifier_state(struct bpf_verifier_state *state)
 				reg->map_ptr->key_size,
 				reg->map_ptr->value_size);
 		if (reg->min_value != BPF_REGISTER_MIN_RANGE)
-			verbose(",min_value=%llu",
-				(unsigned long long)reg->min_value);
+			verbose(",min_value=%lld",
+				(long long)reg->min_value);
 		if (reg->max_value != BPF_REGISTER_MAX_RANGE)
 			verbose(",max_value=%llu",
 				(unsigned long long)reg->max_value);
@@ -758,7 +758,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 			 * index'es we need to make sure that whatever we use
 			 * will have a set floor within our range.
 			 */
-			if ((s64)reg->min_value < 0) {
+			if (reg->min_value < 0) {
 				verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
 					regno);
 				return -EACCES;
@@ -1468,7 +1468,8 @@ static void check_reg_overflow(struct bpf_reg_state *reg)
 {
 	if (reg->max_value > BPF_REGISTER_MAX_RANGE)
 		reg->max_value = BPF_REGISTER_MAX_RANGE;
-	if ((s64)reg->min_value < BPF_REGISTER_MIN_RANGE)
+	if (reg->min_value < BPF_REGISTER_MIN_RANGE ||
+	    reg->min_value > BPF_REGISTER_MAX_RANGE)
 		reg->min_value = BPF_REGISTER_MIN_RANGE;
 }
 
@@ -1476,7 +1477,8 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 				    struct bpf_insn *insn)
 {
 	struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
-	u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE;
+	s64 min_val = BPF_REGISTER_MIN_RANGE;
+	u64 max_val = BPF_REGISTER_MAX_RANGE;
 	bool min_set = false, max_set = false;
 	u8 opcode = BPF_OP(insn->code);
 
@@ -1512,22 +1514,43 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 		return;
 	}
 
+	/* If one of our values was at the end of our ranges then we can't just
+	 * do our normal operations to the register, we need to set the values
+	 * to the min/max since they are undefined.
+	 */
+	if (min_val == BPF_REGISTER_MIN_RANGE)
+		dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+	if (max_val == BPF_REGISTER_MAX_RANGE)
+		dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+
 	switch (opcode) {
 	case BPF_ADD:
-		dst_reg->min_value += min_val;
-		dst_reg->max_value += max_val;
+		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+			dst_reg->min_value += min_val;
+		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+			dst_reg->max_value += max_val;
 		break;
 	case BPF_SUB:
-		dst_reg->min_value -= min_val;
-		dst_reg->max_value -= max_val;
+		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+			dst_reg->min_value -= min_val;
+		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+			dst_reg->max_value -= max_val;
 		break;
 	case BPF_MUL:
-		dst_reg->min_value *= min_val;
-		dst_reg->max_value *= max_val;
+		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+			dst_reg->min_value *= min_val;
+		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+			dst_reg->max_value *= max_val;
 		break;
 	case BPF_AND:
-		/* & is special since it could end up with 0 bits set. */
-		dst_reg->min_value &= min_val;
+		/* Disallow AND'ing of negative numbers, ain't nobody got time
+		 * for that.  Otherwise the minimum is 0 and the max is the max
+		 * value we could AND against.
+		 */
+		if (min_val < 0)
+			dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+		else
+			dst_reg->min_value = 0;
 		dst_reg->max_value = max_val;
 		break;
 	case BPF_LSH:
@@ -1537,24 +1560,25 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 		 */
 		if (min_val > ilog2(BPF_REGISTER_MAX_RANGE))
 			dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
-		else
+		else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
 			dst_reg->min_value <<= min_val;
 
 		if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))
 			dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
-		else
+		else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
 			dst_reg->max_value <<= max_val;
 		break;
 	case BPF_RSH:
-		dst_reg->min_value >>= min_val;
-		dst_reg->max_value >>= max_val;
-		break;
-	case BPF_MOD:
-		/* % is special since it is an unsigned modulus, so the floor
-		 * will always be 0.
+		/* RSH by a negative number is undefined, and the BPF_RSH is an
+		 * unsigned shift, so make the appropriate casts.
 		 */
-		dst_reg->min_value = 0;
-		dst_reg->max_value = max_val - 1;
+		if (min_val < 0 || dst_reg->min_value < 0)
+			dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+		else
+			dst_reg->min_value =
+				(u64)(dst_reg->min_value) >> min_val;
+		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+			dst_reg->max_value >>= max_val;
 		break;
 	default:
 		reset_reg_range_values(regs, insn->dst_reg);
-- 
cgit v1.3-14-g43fede


From e245d99e6cc4a0b904b87b46b4f60d46fb405987 Mon Sep 17 00:00:00 2001
From: Babu Moger <babu.moger@oracle.com>
Date: Wed, 2 Nov 2016 09:36:33 -0700
Subject: lockdep: Limit static allocations if PROVE_LOCKING_SMALL is defined

Reduce the size of data structure for lockdep entries by half if
PROVE_LOCKING_SMALL if defined. This is used only for sparc.

Signed-off-by: Babu Moger <babu.moger@oracle.com>
Acked-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/locking/lockdep_internals.h | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 51c4b24b6328..c2b88490d857 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -45,6 +45,14 @@ enum {
 #define LOCKF_USED_IN_IRQ_READ \
 		(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
 
+/*
+ * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text,
+ * .data and .bss to fit in required 32MB limit for the kernel. With
+ * PROVE_LOCKING we could go over this limit and cause system boot-up problems.
+ * So, reduce the static allocations for lockdeps related structures so that
+ * everything fits in current required size limit.
+ */
+#ifdef CONFIG_PROVE_LOCKING_SMALL
 /*
  * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
  * we track.
@@ -54,18 +62,24 @@ enum {
  * table (if it's not there yet), and we check it for lock order
  * conflicts and deadlocks.
  */
+#define MAX_LOCKDEP_ENTRIES	16384UL
+#define MAX_LOCKDEP_CHAINS_BITS	15
+#define MAX_STACK_TRACE_ENTRIES	262144UL
+#else
 #define MAX_LOCKDEP_ENTRIES	32768UL
 
 #define MAX_LOCKDEP_CHAINS_BITS	16
-#define MAX_LOCKDEP_CHAINS	(1UL << MAX_LOCKDEP_CHAINS_BITS)
-
-#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
 
 /*
  * Stack-trace: tightly packed array of stack backtrace
  * addresses. Protected by the hash_lock.
  */
 #define MAX_STACK_TRACE_ENTRIES	524288UL
+#endif
+
+#define MAX_LOCKDEP_CHAINS	(1UL << MAX_LOCKDEP_CHAINS_BITS)
+
+#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
 
 extern struct list_head all_lock_classes;
 extern struct lock_chain lock_chains[];
-- 
cgit v1.3-14-g43fede


From e96271f3ed7e702fa36dd0605c0c5b5f065af816 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Fri, 18 Nov 2016 13:38:43 +0200
Subject: perf/core: Fix address filter parser

The token table passed into match_token() must be null-terminated, which
it currently is not in the perf's address filter string parser, as caught
by Vince's perf_fuzzer and KASAN.

It doesn't blow up otherwise because of the alignment padding of the table
to the next element in the .rodata, which is luck.

Fixing by adding a null-terminator to the token table.

Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Tested-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: stable@vger.kernel.org # v4.7+
Fixes: 375637bc524 ("perf/core: Introduce address range filtering")
Link: http://lkml.kernel.org/r/877f81f264.fsf@ashishki-desk.ger.corp.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index ff230bb4a02e..6ee1febdf6ff 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8029,6 +8029,7 @@ restart:
  * if <size> is not specified, the range is treated as a single address.
  */
 enum {
+	IF_ACT_NONE = -1,
 	IF_ACT_FILTER,
 	IF_ACT_START,
 	IF_ACT_STOP,
@@ -8052,6 +8053,7 @@ static const match_table_t if_tokens = {
 	{ IF_SRC_KERNEL,	"%u/%u" },
 	{ IF_SRC_FILEADDR,	"%u@%s" },
 	{ IF_SRC_KERNELADDR,	"%u" },
+	{ IF_ACT_NONE,		NULL },
 };
 
 /*
-- 
cgit v1.3-14-g43fede


From 18f649ef344127ef6de23a5a4272dbe2fdb73dde Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 14 Nov 2016 19:46:09 +0100
Subject: sched/autogroup: Fix autogroup_move_group() to never skip
 sched_move_task()

The PF_EXITING check in task_wants_autogroup() is no longer needed. Remove
it, but see the next patch.

However the comment is correct in that autogroup_move_group() must always
change task_group() for every thread so the sysctl_ check is very wrong;
we can race with cgroups and even sys_setsid() is not safe because a task
running with task_group() == ag->tg must participate in refcounting:

	int main(void)
	{
		int sctl = open("/proc/sys/kernel/sched_autogroup_enabled", O_WRONLY);

		assert(sctl > 0);
		if (fork()) {
			wait(NULL); // destroy the child's ag/tg
			pause();
		}

		assert(pwrite(sctl, "1\n", 2, 0) == 2);
		assert(setsid() > 0);
		if (fork())
			pause();

		kill(getppid(), SIGKILL);
		sleep(1);

		// The child has gone, the grandchild runs with kref == 1
		assert(pwrite(sctl, "0\n", 2, 0) == 2);
		assert(setsid() > 0);

		// runs with the freed ag/tg
		for (;;)
			sleep(1);

		return 0;
	}

crashes the kernel. It doesn't really need sleep(1), it doesn't matter if
autogroup_move_group() actually frees the task_group or this happens later.

Reported-by: Vern Lovejoy <vlovejoy@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: hartsjc@redhat.com
Cc: vbendel@redhat.com
Link: http://lkml.kernel.org/r/20161114184609.GA15965@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/auto_group.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index a5d966cb8891..ad2b19ad6ca0 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -111,14 +111,11 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
 {
 	if (tg != &root_task_group)
 		return false;
-
 	/*
-	 * We can only assume the task group can't go away on us if
-	 * autogroup_move_group() can see us on ->thread_group list.
+	 * If we race with autogroup_move_group() the caller can use the old
+	 * value of signal->autogroup but in this case sched_move_task() will
+	 * be called again before autogroup_kref_put().
 	 */
-	if (p->flags & PF_EXITING)
-		return false;
-
 	return true;
 }
 
@@ -138,13 +135,17 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 	}
 
 	p->signal->autogroup = autogroup_kref_get(ag);
-
-	if (!READ_ONCE(sysctl_sched_autogroup_enabled))
-		goto out;
-
+	/*
+	 * We can't avoid sched_move_task() after we changed signal->autogroup,
+	 * this process can already run with task_group() == prev->tg or we can
+	 * race with cgroup code which can read autogroup = prev under rq->lock.
+	 * In the latter case for_each_thread() can not miss a migrating thread,
+	 * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
+	 * can't be removed from thread list, we hold ->siglock.
+	 */
 	for_each_thread(p, t)
 		sched_move_task(t);
-out:
+
 	unlock_task_sighand(p, &flags);
 	autogroup_kref_put(prev);
 }
-- 
cgit v1.3-14-g43fede


From 8e5bfa8c1f8471aa4a2d30be631ef2b50e10abaf Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 14 Nov 2016 19:46:12 +0100
Subject: sched/autogroup: Do not use autogroup->tg in zombie threads

Exactly because for_each_thread() in autogroup_move_group() can't see it
and update its ->sched_task_group before _put() and possibly free().

So the exiting task needs another sched_move_task() before exit_notify()
and we need to re-introduce the PF_EXITING (or similar) check removed by
the previous change for another reason.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: hartsjc@redhat.com
Cc: vbendel@redhat.com
Cc: vlovejoy@redhat.com
Link: http://lkml.kernel.org/r/20161114184612.GA15968@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h     |  2 ++
 kernel/exit.c             |  1 +
 kernel/sched/auto_group.c | 19 +++++++++++++++++++
 3 files changed, 22 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 348f51b0ec92..e9c009dc3a4a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2567,6 +2567,7 @@ extern void sched_autogroup_create_attach(struct task_struct *p);
 extern void sched_autogroup_detach(struct task_struct *p);
 extern void sched_autogroup_fork(struct signal_struct *sig);
 extern void sched_autogroup_exit(struct signal_struct *sig);
+extern void sched_autogroup_exit_task(struct task_struct *p);
 #ifdef CONFIG_PROC_FS
 extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
 extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice);
@@ -2576,6 +2577,7 @@ static inline void sched_autogroup_create_attach(struct task_struct *p) { }
 static inline void sched_autogroup_detach(struct task_struct *p) { }
 static inline void sched_autogroup_fork(struct signal_struct *sig) { }
 static inline void sched_autogroup_exit(struct signal_struct *sig) { }
+static inline void sched_autogroup_exit_task(struct task_struct *p) { }
 #endif
 
 extern int yield_to(struct task_struct *p, bool preempt);
diff --git a/kernel/exit.c b/kernel/exit.c
index 9d68c45ebbe3..3076f3089919 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -836,6 +836,7 @@ void __noreturn do_exit(long code)
 	 */
 	perf_event_exit_task(tsk);
 
+	sched_autogroup_exit_task(tsk);
 	cgroup_exit(tsk);
 
 	/*
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index ad2b19ad6ca0..f1c8fd566246 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -115,10 +115,26 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
 	 * If we race with autogroup_move_group() the caller can use the old
 	 * value of signal->autogroup but in this case sched_move_task() will
 	 * be called again before autogroup_kref_put().
+	 *
+	 * However, there is no way sched_autogroup_exit_task() could tell us
+	 * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case.
 	 */
+	if (p->flags & PF_EXITING)
+		return false;
+
 	return true;
 }
 
+void sched_autogroup_exit_task(struct task_struct *p)
+{
+	/*
+	 * We are going to call exit_notify() and autogroup_move_group() can't
+	 * see this thread after that: we can no longer use signal->autogroup.
+	 * See the PF_EXITING check in task_wants_autogroup().
+	 */
+	sched_move_task(p);
+}
+
 static void
 autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 {
@@ -142,6 +158,9 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 	 * In the latter case for_each_thread() can not miss a migrating thread,
 	 * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
 	 * can't be removed from thread list, we hold ->siglock.
+	 *
+	 * If an exiting thread was already removed from thread list we rely on
+	 * sched_autogroup_exit_task().
 	 */
 	for_each_thread(p, t)
 		sched_move_task(t);
-- 
cgit v1.3-14-g43fede


From 83929cce95251cc77e5659bf493bd424ae0e7a67 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 23 Nov 2016 11:33:37 +0100
Subject: sched/autogroup: Fix 64-bit kernel nice level adjustment

Michael Kerrisk reported:

> Regarding the previous paragraph...  My tests indicate
> that writing *any* value to the autogroup [nice priority level]
> file causes the task group to get a lower priority.

Because autogroup didn't call the then meaningless scale_load()...

Autogroup nice level adjustment has been broken ever since load
resolution was increased for 64-bit kernels.  Use scale_load() to
scale group weight.

Michael Kerrisk tested this patch to fix the problem:

> Applied and tested against 4.9-rc6 on an Intel u7 (4 cores).
> Test setup:
>
> Terminal window 1: running 40 CPU burner jobs
> Terminal window 2: running 40 CPU burner jobs
> Terminal window 1: running  1 CPU burner job
>
> Demonstrated that:
> * Writing "0" to the autogroup file for TW1 now causes no change
>   to the rate at which the process on the terminal consume CPU.
> * Writing -20 to the autogroup file for TW1 caused those processes
>   to get the lion's share of CPU while TW2 TW3 get a tiny amount.
> * Writing -20 to the autogroup files for TW1 and TW3 allowed the
>   process on TW3 to get as much CPU as it was getting as when
>   the autogroup nice values for both terminals were 0.

Reported-by: Michael Kerrisk <mtk.manpages@gmail.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-man <linux-man@vger.kernel.org>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/1479897217.4306.6.camel@gmx.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/auto_group.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index f1c8fd566246..da39489d2d80 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -212,6 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
 {
 	static unsigned long next = INITIAL_JIFFIES;
 	struct autogroup *ag;
+	unsigned long shares;
 	int err;
 
 	if (nice < MIN_NICE || nice > MAX_NICE)
@@ -230,9 +231,10 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
 
 	next = HZ / 10 + jiffies;
 	ag = autogroup_task_get(p);
+	shares = scale_load(sched_prio_to_weight[nice + 20]);
 
 	down_write(&ag->lock);
-	err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]);
+	err = sched_group_set_shares(ag->tg, shares);
 	if (!err)
 		ag->nice = nice;
 	up_write(&ag->lock);
-- 
cgit v1.3-14-g43fede


From faaae2a581435f32781a105dda3501df388fddcb Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 29 Nov 2016 15:20:14 -0800
Subject: Re-enable CONFIG_MODVERSIONS in a slightly weaker form

This enables CONFIG_MODVERSIONS again, but allows for missing symbol CRC
information in order to work around the issue that newer binutils
versions seem to occasionally drop the CRC on the floor.  binutils 2.26
seems to work fine, while binutils 2.27 seems to break MODVERSIONS of
symbols that have been defined in assembler files.

[ We've had random missing CRC's before - it may be an old problem that
  just is now reliably triggered with the weak asm symbols and a new
  version of binutils ]

Some day I really do want to remove MODVERSIONS entirely.  Sadly, today
does not appear to be that day: Debian people apparently do want the
option to enable MODVERSIONS to make it easier to have external modules
across kernel versions, and this seems to be a fairly minimal fix for
the annoying problem.

Cc: Ben Hutchings <ben@decadent.org.uk>
Acked-by: Michal Marek <mmarek@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/Kconfig    | 1 -
 kernel/module.c | 5 +++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/init/Kconfig b/init/Kconfig
index c4fbc1e55c25..34407f15e6d3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1945,7 +1945,6 @@ config MODULE_FORCE_UNLOAD
 
 config MODVERSIONS
 	bool "Module versioning support"
-	depends on BROKEN
 	help
 	  Usually, you have to use modules compiled with your kernel.
 	  Saying Y here makes it sometimes possible to use modules
diff --git a/kernel/module.c b/kernel/module.c
index f57dd63186e6..0e54d5bf0097 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1301,8 +1301,9 @@ static int check_version(Elf_Shdr *sechdrs,
 		goto bad_version;
 	}
 
-	pr_warn("%s: no symbol version for %s\n", mod->name, symname);
-	return 0;
+	/* Broken toolchain. Warn once, then let it go.. */
+	pr_warn_once("%s: no symbol version for %s\n", mod->name, symname);
+	return 1;
 
 bad_version:
 	pr_warn("%s: disagrees about version of symbol %s\n",
-- 
cgit v1.3-14-g43fede


From e2d2afe15ed452f91797a80dbc0a17838ba03ed4 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 29 Nov 2016 12:27:09 -0500
Subject: bpf: fix states equal logic for varlen access

If we have a branch that looks something like this

int foo = map->value;
if (condition) {
  foo += blah;
} else {
  foo = bar;
}
map->array[foo] = baz;

We will incorrectly assume that the !condition branch is equal to the condition
branch as the register for foo will be UNKNOWN_VALUE in both cases.  We need to
adjust this logic to only do this if we didn't do a varlen access after we
processed the !condition branch, otherwise we have different ranges and need to
check the other branch as well.

Fixes: 484611357c19 ("bpf: allow access into map value arrays")
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6a936159c6e0..8199821f54cf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2454,6 +2454,7 @@ static bool states_equal(struct bpf_verifier_env *env,
 			 struct bpf_verifier_state *old,
 			 struct bpf_verifier_state *cur)
 {
+	bool varlen_map_access = env->varlen_map_value_access;
 	struct bpf_reg_state *rold, *rcur;
 	int i;
 
@@ -2467,12 +2468,17 @@ static bool states_equal(struct bpf_verifier_env *env,
 		/* If the ranges were not the same, but everything else was and
 		 * we didn't do a variable access into a map then we are a-ok.
 		 */
-		if (!env->varlen_map_value_access &&
+		if (!varlen_map_access &&
 		    rold->type == rcur->type && rold->imm == rcur->imm)
 			continue;
 
+		/* If we didn't map access then again we don't care about the
+		 * mismatched range values and it's ok if our old type was
+		 * UNKNOWN and we didn't go to a NOT_INIT'ed reg.
+		 */
 		if (rold->type == NOT_INIT ||
-		    (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT))
+		    (!varlen_map_access && rold->type == UNKNOWN_VALUE &&
+		     rcur->type != NOT_INIT))
 			continue;
 
 		if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
-- 
cgit v1.3-14-g43fede


From dbb26055defd03d59f678cb5f2c992abe05b064a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 30 Nov 2016 21:04:41 +0000
Subject: locking/rtmutex: Prevent dequeue vs. unlock race

David reported a futex/rtmutex state corruption. It's caused by the
following problem:

CPU0		CPU1		CPU2

l->owner=T1
		rt_mutex_lock(l)
		lock(l->wait_lock)
		l->owner = T1 | HAS_WAITERS;
		enqueue(T2)
		boost()
		  unlock(l->wait_lock)
		schedule()

				rt_mutex_lock(l)
				lock(l->wait_lock)
				l->owner = T1 | HAS_WAITERS;
				enqueue(T3)
				boost()
				  unlock(l->wait_lock)
				schedule()
		signal(->T2)	signal(->T3)
		lock(l->wait_lock)
		dequeue(T2)
		deboost()
		  unlock(l->wait_lock)
				lock(l->wait_lock)
				dequeue(T3)
				  ===> wait list is now empty
				deboost()
				 unlock(l->wait_lock)
		lock(l->wait_lock)
		fixup_rt_mutex_waiters()
		  if (wait_list_empty(l)) {
		    owner = l->owner & ~HAS_WAITERS;
		    l->owner = owner
		     ==> l->owner = T1
		  }

				lock(l->wait_lock)
rt_mutex_unlock(l)		fixup_rt_mutex_waiters()
				  if (wait_list_empty(l)) {
				    owner = l->owner & ~HAS_WAITERS;
cmpxchg(l->owner, T1, NULL)
 ===> Success (l->owner = NULL)
				    l->owner = owner
				     ==> l->owner = T1
				  }

That means the problem is caused by fixup_rt_mutex_waiters() which does the
RMW to clear the waiters bit unconditionally when there are no waiters in
the rtmutexes rbtree.

This can be fatal: A concurrent unlock can release the rtmutex in the
fastpath because the waiters bit is not set. If the cmpxchg() gets in the
middle of the RMW operation then the previous owner, which just unlocked
the rtmutex is set as the owner again when the write takes place after the
successfull cmpxchg().

The solution is rather trivial: verify that the owner member of the rtmutex
has the waiters bit set before clearing it. This does not require a
cmpxchg() or other atomic operations because the waiters bit can only be
set and cleared with the rtmutex wait_lock held. It's also safe against the
fast path unlock attempt. The unlock attempt via cmpxchg() will either see
the bit set and take the slowpath or see the bit cleared and release it
atomically in the fastpath.

It's remarkable that the test program provided by David triggers on ARM64
and MIPS64 really quick, but it refuses to reproduce on x86-64, while the
problem exists there as well. That refusal might explain that this got not
discovered earlier despite the bug existing from day one of the rtmutex
implementation more than 10 years ago.

Thanks to David for meticulously instrumenting the code and providing the
information which allowed to decode this subtle problem.

Reported-by: David Daney <ddaney@caviumnetworks.com>
Tested-by: David Daney <david.daney@cavium.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: stable@vger.kernel.org
Fixes: 23f78d4a03c5 ("[PATCH] pi-futex: rt mutex core")
Link: http://lkml.kernel.org/r/20161130210030.351136722@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rtmutex.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 66 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 1ec0f48962b3..2c49d76f96c3 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -65,8 +65,72 @@ static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
 
 static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
 {
-	if (!rt_mutex_has_waiters(lock))
-		clear_rt_mutex_waiters(lock);
+	unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+	if (rt_mutex_has_waiters(lock))
+		return;
+
+	/*
+	 * The rbtree has no waiters enqueued, now make sure that the
+	 * lock->owner still has the waiters bit set, otherwise the
+	 * following can happen:
+	 *
+	 * CPU 0	CPU 1		CPU2
+	 * l->owner=T1
+	 *		rt_mutex_lock(l)
+	 *		lock(l->lock)
+	 *		l->owner = T1 | HAS_WAITERS;
+	 *		enqueue(T2)
+	 *		boost()
+	 *		  unlock(l->lock)
+	 *		block()
+	 *
+	 *				rt_mutex_lock(l)
+	 *				lock(l->lock)
+	 *				l->owner = T1 | HAS_WAITERS;
+	 *				enqueue(T3)
+	 *				boost()
+	 *				  unlock(l->lock)
+	 *				block()
+	 *		signal(->T2)	signal(->T3)
+	 *		lock(l->lock)
+	 *		dequeue(T2)
+	 *		deboost()
+	 *		  unlock(l->lock)
+	 *				lock(l->lock)
+	 *				dequeue(T3)
+	 *				 ==> wait list is empty
+	 *				deboost()
+	 *				 unlock(l->lock)
+	 *		lock(l->lock)
+	 *		fixup_rt_mutex_waiters()
+	 *		  if (wait_list_empty(l) {
+	 *		    l->owner = owner
+	 *		    owner = l->owner & ~HAS_WAITERS;
+	 *		      ==> l->owner = T1
+	 *		  }
+	 *				lock(l->lock)
+	 * rt_mutex_unlock(l)		fixup_rt_mutex_waiters()
+	 *				  if (wait_list_empty(l) {
+	 *				    owner = l->owner & ~HAS_WAITERS;
+	 * cmpxchg(l->owner, T1, NULL)
+	 *  ===> Success (l->owner = NULL)
+	 *
+	 *				    l->owner = owner
+	 *				      ==> l->owner = T1
+	 *				  }
+	 *
+	 * With the check for the waiter bit in place T3 on CPU2 will not
+	 * overwrite. All tasks fiddling with the waiters bit are
+	 * serialized by l->lock, so nothing else can modify the waiters
+	 * bit. If the bit is set then nothing can change l->owner either
+	 * so the simple RMW is safe. The cmpxchg() will simply fail if it
+	 * happens in the middle of the RMW because the waiters bit is
+	 * still set.
+	 */
+	owner = READ_ONCE(*p);
+	if (owner & RT_MUTEX_HAS_WAITERS)
+		WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From 1be5d4fa0af34fb7bafa205aeb59f5c7cc7a089d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 30 Nov 2016 21:04:42 +0000
Subject: locking/rtmutex: Use READ_ONCE() in rt_mutex_owner()

While debugging the rtmutex unlock vs. dequeue race Will suggested to use
READ_ONCE() in rt_mutex_owner() as it might race against the
cmpxchg_release() in unlock_rt_mutex_safe().

Will: "It's a minor thing which will most likely not matter in practice"

Careful search did not unearth an actual problem in todays code, but it's
better to be safe than surprised.

Suggested-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: David Daney <ddaney@caviumnetworks.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/20161130210030.431379999@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rtmutex_common.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 4f5f83c7d2d3..e317e1cbb3eb 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -75,8 +75,9 @@ task_top_pi_waiter(struct task_struct *p)
 
 static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
 {
-	return (struct task_struct *)
-		((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
+	unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
+
+	return (struct task_struct *) (owner & ~RT_MUTEX_OWNER_MASKALL);
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From 8fc31ce8896fc3cea1d79688c8ff950ad4e73afe Mon Sep 17 00:00:00 2001
From: David Carrillo-Cisneros <davidcc@google.com>
Date: Sun, 4 Dec 2016 00:46:17 -0800
Subject: perf/core: Remove invalid warning from list_update_cgroup_even()t

The warning introduced in commit:

  864c2357ca89 ("perf/core: Do not set cpuctx->cgrp for unscheduled cgroups")

assumed that a cgroup switch always precedes list_del_event. This is
not the case. Remove warning.

Make sure that cpuctx->cgrp is NULL until a cgroup event is sched in
or ctx->nr_cgroups == 0.

Signed-off-by: David Carrillo-Cisneros <davidcc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Nilay Vaish <nilayvaish@gmail.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi V Shankar <ravi.v.shankar@intel.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1480841177-27299-1-git-send-email-davidcc@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6ee1febdf6ff..02c8421f8c01 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -903,17 +903,14 @@ list_update_cgroup_event(struct perf_event *event,
 	 */
 	cpuctx = __get_cpu_context(ctx);
 
-	/* Only set/clear cpuctx->cgrp if current task uses event->cgrp. */
-	if (perf_cgroup_from_task(current, ctx) != event->cgrp) {
-		/*
-		 * We are removing the last cpu event in this context.
-		 * If that event is not active in this cpu, cpuctx->cgrp
-		 * should've been cleared by perf_cgroup_switch.
-		 */
-		WARN_ON_ONCE(!add && cpuctx->cgrp);
-		return;
-	}
-	cpuctx->cgrp = add ? event->cgrp : NULL;
+	/*
+	 * cpuctx->cgrp is NULL until a cgroup event is sched in or
+	 * ctx->nr_cgroup == 0 .
+	 */
+	if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
+		cpuctx->cgrp = event->cgrp;
+	else if (!add)
+		cpuctx->cgrp = NULL;
 }
 
 #else /* !CONFIG_CGROUP_PERF */
-- 
cgit v1.3-14-g43fede


From f943fe0faf27991d256e10b5a85f175385c64cdc Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Mon, 28 Nov 2016 15:24:43 +0100
Subject: lockdep: Fix report formatting

Since commit:

  4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines")

printk() requires KERN_CONT to continue log messages. Lots of printk()
in lockdep.c and print_ip_sym() don't have it. As the result lockdep
reports are completely messed up.

Add missing KERN_CONT and inline print_ip_sym() where necessary.

Example of a messed up report:

  0-rc5+ #41 Not tainted
  -------------------------------------------------------
  syz-executor0/5036 is trying to acquire lock:
   (
  rtnl_mutex
  ){+.+.+.}
  , at:
  [<ffffffff86b3d6ac>] rtnl_lock+0x1c/0x20
  but task is already holding lock:
   (
  &net->packet.sklist_lock
  ){+.+...}
  , at:
  [<ffffffff873541a6>] packet_diag_dump+0x1a6/0x1920
  which lock already depends on the new lock.
  the existing dependency chain (in reverse order) is:
  -> #3
   (
  &net->packet.sklist_lock
  +.+...}
  ...

Without this patch all scripts that parse kernel bug reports are broken.

Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: andreyknvl@google.com
Cc: aryabinin@virtuozzo.com
Cc: joe@perches.com
Cc: syzkaller@googlegroups.com
Link: http://lkml.kernel.org/r/1480343083-48731-1-git-send-email-dvyukov@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c | 111 ++++++++++++++++++++++++-----------------------
 1 file changed, 57 insertions(+), 54 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 589d763a49b3..4d7ffc0a0d00 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -506,13 +506,13 @@ static void __print_lock_name(struct lock_class *class)
 	name = class->name;
 	if (!name) {
 		name = __get_key_name(class->key, str);
-		printk("%s", name);
+		printk(KERN_CONT "%s", name);
 	} else {
-		printk("%s", name);
+		printk(KERN_CONT "%s", name);
 		if (class->name_version > 1)
-			printk("#%d", class->name_version);
+			printk(KERN_CONT "#%d", class->name_version);
 		if (class->subclass)
-			printk("/%d", class->subclass);
+			printk(KERN_CONT "/%d", class->subclass);
 	}
 }
 
@@ -522,9 +522,9 @@ static void print_lock_name(struct lock_class *class)
 
 	get_usage_chars(class, usage);
 
-	printk(" (");
+	printk(KERN_CONT " (");
 	__print_lock_name(class);
-	printk("){%s}", usage);
+	printk(KERN_CONT "){%s}", usage);
 }
 
 static void print_lockdep_cache(struct lockdep_map *lock)
@@ -536,7 +536,7 @@ static void print_lockdep_cache(struct lockdep_map *lock)
 	if (!name)
 		name = __get_key_name(lock->key->subkeys, str);
 
-	printk("%s", name);
+	printk(KERN_CONT "%s", name);
 }
 
 static void print_lock(struct held_lock *hlock)
@@ -551,13 +551,13 @@ static void print_lock(struct held_lock *hlock)
 	barrier();
 
 	if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) {
-		printk("<RELEASED>\n");
+		printk(KERN_CONT "<RELEASED>\n");
 		return;
 	}
 
 	print_lock_name(lock_classes + class_idx - 1);
-	printk(", at: ");
-	print_ip_sym(hlock->acquire_ip);
+	printk(KERN_CONT ", at: [<%p>] %pS\n",
+		(void *)hlock->acquire_ip, (void *)hlock->acquire_ip);
 }
 
 static void lockdep_print_held_locks(struct task_struct *curr)
@@ -792,8 +792,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 
 		printk("\nnew class %p: %s", class->key, class->name);
 		if (class->name_version > 1)
-			printk("#%d", class->name_version);
-		printk("\n");
+			printk(KERN_CONT "#%d", class->name_version);
+		printk(KERN_CONT "\n");
 		dump_stack();
 
 		if (!graph_lock()) {
@@ -1071,7 +1071,7 @@ print_circular_bug_entry(struct lock_list *target, int depth)
 		return 0;
 	printk("\n-> #%u", depth);
 	print_lock_name(target->class);
-	printk(":\n");
+	printk(KERN_CONT ":\n");
 	print_stack_trace(&target->trace, 6);
 
 	return 0;
@@ -1102,11 +1102,11 @@ print_circular_lock_scenario(struct held_lock *src,
 	if (parent != source) {
 		printk("Chain exists of:\n  ");
 		__print_lock_name(source);
-		printk(" --> ");
+		printk(KERN_CONT " --> ");
 		__print_lock_name(parent);
-		printk(" --> ");
+		printk(KERN_CONT " --> ");
 		__print_lock_name(target);
-		printk("\n\n");
+		printk(KERN_CONT "\n\n");
 	}
 
 	printk(" Possible unsafe locking scenario:\n\n");
@@ -1114,16 +1114,16 @@ print_circular_lock_scenario(struct held_lock *src,
 	printk("       ----                    ----\n");
 	printk("  lock(");
 	__print_lock_name(target);
-	printk(");\n");
+	printk(KERN_CONT ");\n");
 	printk("                               lock(");
 	__print_lock_name(parent);
-	printk(");\n");
+	printk(KERN_CONT ");\n");
 	printk("                               lock(");
 	__print_lock_name(target);
-	printk(");\n");
+	printk(KERN_CONT ");\n");
 	printk("  lock(");
 	__print_lock_name(source);
-	printk(");\n");
+	printk(KERN_CONT ");\n");
 	printk("\n *** DEADLOCK ***\n\n");
 }
 
@@ -1359,22 +1359,22 @@ static void print_lock_class_header(struct lock_class *class, int depth)
 
 	printk("%*s->", depth, "");
 	print_lock_name(class);
-	printk(" ops: %lu", class->ops);
-	printk(" {\n");
+	printk(KERN_CONT " ops: %lu", class->ops);
+	printk(KERN_CONT " {\n");
 
 	for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
 		if (class->usage_mask & (1 << bit)) {
 			int len = depth;
 
 			len += printk("%*s   %s", depth, "", usage_str[bit]);
-			len += printk(" at:\n");
+			len += printk(KERN_CONT " at:\n");
 			print_stack_trace(class->usage_traces + bit, len);
 		}
 	}
 	printk("%*s }\n", depth, "");
 
-	printk("%*s ... key      at: ",depth,"");
-	print_ip_sym((unsigned long)class->key);
+	printk("%*s ... key      at: [<%p>] %pS\n",
+		depth, "", class->key, class->key);
 }
 
 /*
@@ -1437,11 +1437,11 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
 	if (middle_class != unsafe_class) {
 		printk("Chain exists of:\n  ");
 		__print_lock_name(safe_class);
-		printk(" --> ");
+		printk(KERN_CONT " --> ");
 		__print_lock_name(middle_class);
-		printk(" --> ");
+		printk(KERN_CONT " --> ");
 		__print_lock_name(unsafe_class);
-		printk("\n\n");
+		printk(KERN_CONT "\n\n");
 	}
 
 	printk(" Possible interrupt unsafe locking scenario:\n\n");
@@ -1449,18 +1449,18 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
 	printk("       ----                    ----\n");
 	printk("  lock(");
 	__print_lock_name(unsafe_class);
-	printk(");\n");
+	printk(KERN_CONT ");\n");
 	printk("                               local_irq_disable();\n");
 	printk("                               lock(");
 	__print_lock_name(safe_class);
-	printk(");\n");
+	printk(KERN_CONT ");\n");
 	printk("                               lock(");
 	__print_lock_name(middle_class);
-	printk(");\n");
+	printk(KERN_CONT ");\n");
 	printk("  <Interrupt>\n");
 	printk("    lock(");
 	__print_lock_name(safe_class);
-	printk(");\n");
+	printk(KERN_CONT ");\n");
 	printk("\n *** DEADLOCK ***\n\n");
 }
 
@@ -1497,9 +1497,9 @@ print_bad_irq_dependency(struct task_struct *curr,
 	print_lock(prev);
 	printk("which would create a new lock dependency:\n");
 	print_lock_name(hlock_class(prev));
-	printk(" ->");
+	printk(KERN_CONT " ->");
 	print_lock_name(hlock_class(next));
-	printk("\n");
+	printk(KERN_CONT "\n");
 
 	printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
 		irqclass);
@@ -1521,8 +1521,7 @@ print_bad_irq_dependency(struct task_struct *curr,
 
 	lockdep_print_held_locks(curr);
 
-	printk("\nthe dependencies between %s-irq-safe lock", irqclass);
-	printk(" and the holding lock:\n");
+	printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
 	if (!save_trace(&prev_root->trace))
 		return 0;
 	print_shortest_lock_dependencies(backwards_entry, prev_root);
@@ -1694,10 +1693,10 @@ print_deadlock_scenario(struct held_lock *nxt,
 	printk("       ----\n");
 	printk("  lock(");
 	__print_lock_name(prev);
-	printk(");\n");
+	printk(KERN_CONT ");\n");
 	printk("  lock(");
 	__print_lock_name(next);
-	printk(");\n");
+	printk(KERN_CONT ");\n");
 	printk("\n *** DEADLOCK ***\n\n");
 	printk(" May be due to missing lock nesting notation\n\n");
 }
@@ -1891,9 +1890,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 		graph_unlock();
 		printk("\n new dependency: ");
 		print_lock_name(hlock_class(prev));
-		printk(" => ");
+		printk(KERN_CONT " => ");
 		print_lock_name(hlock_class(next));
-		printk("\n");
+		printk(KERN_CONT "\n");
 		dump_stack();
 		return graph_lock();
 	}
@@ -2343,11 +2342,11 @@ print_usage_bug_scenario(struct held_lock *lock)
 	printk("       ----\n");
 	printk("  lock(");
 	__print_lock_name(class);
-	printk(");\n");
+	printk(KERN_CONT ");\n");
 	printk("  <Interrupt>\n");
 	printk("    lock(");
 	__print_lock_name(class);
-	printk(");\n");
+	printk(KERN_CONT ");\n");
 	printk("\n *** DEADLOCK ***\n\n");
 }
 
@@ -2522,14 +2521,18 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
 void print_irqtrace_events(struct task_struct *curr)
 {
 	printk("irq event stamp: %u\n", curr->irq_events);
-	printk("hardirqs last  enabled at (%u): ", curr->hardirq_enable_event);
-	print_ip_sym(curr->hardirq_enable_ip);
-	printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event);
-	print_ip_sym(curr->hardirq_disable_ip);
-	printk("softirqs last  enabled at (%u): ", curr->softirq_enable_event);
-	print_ip_sym(curr->softirq_enable_ip);
-	printk("softirqs last disabled at (%u): ", curr->softirq_disable_event);
-	print_ip_sym(curr->softirq_disable_ip);
+	printk("hardirqs last  enabled at (%u): [<%p>] %pS\n",
+		curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip,
+		(void *)curr->hardirq_enable_ip);
+	printk("hardirqs last disabled at (%u): [<%p>] %pS\n",
+		curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip,
+		(void *)curr->hardirq_disable_ip);
+	printk("softirqs last  enabled at (%u): [<%p>] %pS\n",
+		curr->softirq_enable_event, (void *)curr->softirq_enable_ip,
+		(void *)curr->softirq_enable_ip);
+	printk("softirqs last disabled at (%u): [<%p>] %pS\n",
+		curr->softirq_disable_event, (void *)curr->softirq_disable_ip,
+		(void *)curr->softirq_disable_ip);
 }
 
 static int HARDIRQ_verbose(struct lock_class *class)
@@ -3235,8 +3238,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	if (very_verbose(class)) {
 		printk("\nacquire class [%p] %s", class->key, class->name);
 		if (class->name_version > 1)
-			printk("#%d", class->name_version);
-		printk("\n");
+			printk(KERN_CONT "#%d", class->name_version);
+		printk(KERN_CONT "\n");
 		dump_stack();
 	}
 
@@ -3378,7 +3381,7 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
 	printk("%s/%d is trying to release lock (",
 		curr->comm, task_pid_nr(curr));
 	print_lockdep_cache(lock);
-	printk(") at:\n");
+	printk(KERN_CONT ") at:\n");
 	print_ip_sym(ip);
 	printk("but there are no more locks to release!\n");
 	printk("\nother info that might help us debug this:\n");
@@ -3871,7 +3874,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
 	printk("%s/%d is trying to contend lock (",
 		curr->comm, task_pid_nr(curr));
 	print_lockdep_cache(lock);
-	printk(") at:\n");
+	printk(KERN_CONT ") at:\n");
 	print_ip_sym(ip);
 	printk("but there are no locks held!\n");
 	printk("\nother info that might help us debug this:\n");
-- 
cgit v1.3-14-g43fede


From 166ad0e1e2132ff0cda08b94af8301655fcabbcd Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 7 Dec 2016 14:44:36 -0800
Subject: kcov: add missing #include <linux/sched.h>

In __sanitizer_cov_trace_pc we use task_struct and fields within it, but
as we haven't included <linux/sched.h>, it is not guaranteed to be
defined.  While we usually happen to acquire the definition through a
transitive include, this is fragile (and hasn't been true in the past,
causing issues with backports).

Include <linux/sched.h> to avoid any fragility.

[mark.rutland@arm.com: rewrote changelog]
Link: http://lkml.kernel.org/r/1481007384-27529-1-git-send-email-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: James Morse <james.morse@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kcov.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/kcov.c b/kernel/kcov.c
index 30e6d05aa5a9..3cbb0c879705 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -7,6 +7,7 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/printk.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
-- 
cgit v1.3-14-g43fede